Skip to content

Text Recognition [ex801.0]

Demonstrates 2-D ALSSM-based text recognition using an NDCompositeCost over a pixel image.

A reference letter image is used as a template. The 2-D ALSSM filter projects each local image patch onto a separable polynomial basis (one per image dimension) and computes the Log-Cost Ratio (LCR) between the best-fit model and a flat (constant) background model. Peaks in the resulting LCR map indicate the presence of the reference character.

Authors: Christof Baeriswyl, Frédéric Waldmann

Plot

Plot

Plot

Code

"""
Text Recognition [ex801.0]
--------------------------

Demonstrates 2-D ALSSM-based text recognition using an
[`NDCompositeCost`][lmlib.statespace.cost.NDCompositeCost] over a pixel image.

A reference letter image is used as a template.  The 2-D ALSSM filter
projects each local image patch onto a separable polynomial basis (one
per image dimension) and computes the Log-Cost Ratio (LCR) between the
best-fit model and a flat (constant) background model.  Peaks in the
resulting LCR map indicate the presence of the reference character.

Authors: Christof Baeriswyl, Frédéric Waldmann
"""
import numpy as np
import matplotlib.pyplot as plt
import lmlib as lm
import copy
import matplotlib
import os

try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    SCRIPT_DIR = os.getcwd()

npy_file_letters = os.path.join(SCRIPT_DIR, "image_letters.npy")
npy_file_letters_noise = os.path.join(SCRIPT_DIR, "image_letters_noise.npy")


def generate_letter_images():
    """Generate the clean and noisy synthetic letter images.

    Returns
    -------
    Y_nonoise : np.ndarray
        Clean image (normalized alpha channel of the rendered letter grid).
    Y : np.ndarray
        Noisy image (row-wise Gaussian noise added to ``Y_nonoise``).

    Notes
    -----
    Requires Pillow (PIL).  A fixed random seed is used so that the
    generated image (and therefore the reference position ``K1_REF`` /
    ``K2_REF`` below) is reproducible across runs.
    """
    from PIL import Image, ImageDraw, ImageFont
    import random

    # Fixed seeds -> reproducible image and reference position.
    random.seed(1)
    np.random.seed(1)

    # Pillow >= 9.1 moved resampling constants into Image.Resampling.
    try:
        RESAMPLE = Image.Resampling.BICUBIC
    except AttributeError:
        RESAMPLE = Image.BICUBIC

    alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

    def load_font(size=80):
        """Load a usable TrueType font, trying common system paths."""
        candidates = [
            "/System/Library/Fonts/Courier.ttc",                       # macOS
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",         # Debian/Ubuntu
            "/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf",
            "C:\\Windows\\Fonts\\cour.ttf",                            # Windows
            "C:\\Windows\\Fonts\\arial.ttf",
        ]
        for path in candidates:
            try:
                return ImageFont.truetype(path, size, encoding="unic")
            except (OSError, IOError):
                continue
        # Fallback (note: the default bitmap font does not honor `size`).
        return ImageFont.load_default()

    # Create a reference / interferer image with a given letter
    def create_letter_image(letter):
        img = Image.new('RGBA', (100, 100), color=(255, 255, 255, 0))
        draw = ImageDraw.Draw(img)
        font = load_font(80)
        position = (10, 0)
        draw.text(position, letter, font=font, fill=(0, 0, 0, 255), align='center')
        return img

    # Create the synthetic image
    def create_synthetic_image_randomletters(grid_size=(8, 4), cell_size=(100, 100)):
        grid_image = Image.new(
            'RGBA',
            (grid_size[0] * cell_size[0], grid_size[1] * cell_size[1]),
            color=(255, 255, 255, 0),
        )
        # refletter = 'Z'
        refletter = 'A'
        for i in range(grid_size[0]):
            for j in range(grid_size[1]):
                # Randomly rotate and stretch the reference image
                angle = random.uniform(-5, 5)   # random rotation (deg)
                scale = random.uniform(1.0, 1.3)  # random scaling
                xpos = random.uniform(-30, 30)  # random position
                ypos = random.uniform(-30, 30)  # random position
                letter = random.choice(alphabet)

                if i == 0 and j == 0:
                    angle = 0
                    scale = 1
                    xpos = 0
                    ypos = 0  # fix for reference image
                    letter = refletter
                if (i == 5 and j == 1) or (i == 1 and j == 2) or (i == 3 and j == 3):
                    letter = refletter
                if i == 6 and j == 0:
                    letter = refletter
                    angle = -10
                    scale = 0.95  # 1.05

                image = create_letter_image(letter)
                transformed_image = image.rotate(angle, resample=RESAMPLE, expand=True)
                transformed_image = transformed_image.resize(
                    (int(transformed_image.width * scale), int(transformed_image.height * scale)),
                    RESAMPLE,
                )

                # Calculate position to paste the transformed image into the grid
                pos_x = int(xpos) + i * cell_size[0] + (cell_size[0] - transformed_image.width) // 2
                pos_y = int(ypos) + j * cell_size[1] + (cell_size[1] - transformed_image.height) // 2

                grid_image.alpha_composite(transformed_image, (pos_x, pos_y))

        return grid_image

    celllength = 120
    synthetic_image = create_synthetic_image_randomletters(cell_size=(celllength, celllength))
    Y_nonoise = np.array(synthetic_image)[:, :, 3] * 1.0
    Y_nonoise = Y_nonoise / np.nanmax(Y_nonoise)

    # Gaussian noise (row-wise): noise level increases with row index
    Y_image_gaussian = np.copy(Y_nonoise)
    rows, cols = Y_image_gaussian.shape[:2]
    for row in range(rows):
        std_dev = (row / rows) * 1.0 * 0.125
        noise = np.random.normal(0.0, std_dev, cols)
        Y_image_gaussian[row] = np.clip(Y_image_gaussian[row] + noise, 0, 1)
    Y = Y_image_gaussian.copy()

    return Y_nonoise, Y


# Generate the images on first run and cache them to disk; load them otherwise.
if os.path.exists(npy_file_letters) and os.path.exists(npy_file_letters_noise):
    Y_nonoise = np.load(npy_file_letters)
    Y = np.load(npy_file_letters_noise)
else:
    Y_nonoise, Y = generate_letter_images()
    np.save(npy_file_letters, Y_nonoise)
    np.save(npy_file_letters_noise, Y)

K1_REF = 55  # letter pixel position, x
K2_REF = 47  # letter pixel position, y

K1 = Y_nonoise.shape[0]
K2 = Y_nonoise.shape[1]
k1 = np.arange(K1)
k2 = np.arange(K2)

# ALSSM Definition
g = 100
l_side = 35
poly_degree = 3
alssm_poly_legendre_left = lm.AlssmPolyLegendre(poly_degree=poly_degree,a_seg=-l_side,b_seg=-1)
alssm_poly_legendre_right = lm.AlssmPolyLegendre(poly_degree=poly_degree,a_seg=0,b_seg=l_side)
segment_left = lm.Segment(a=-l_side, b=-1, direction=lm.FW, g=g)
segment_right = lm.Segment(a=0, b=l_side, direction=lm.BW, g=g)
F = [[1, 0],  # mixing matrix, turning on and off models per segment (1=on, 0=off)
     [0, 1]]

# filter signal 
cost_d1 = lm.CompositeCost([alssm_poly_legendre_left, alssm_poly_legendre_right], [segment_left, segment_right], F)
cost_d2  = lm.CompositeCost([alssm_poly_legendre_left, alssm_poly_legendre_right], [segment_left, segment_right], F)
nd_cost = lm.NDCompositeCost([cost_d1, cost_d2])
nd_rls = lm.RLSAlssm(nd_cost, steady_state=True, backend='lfilter')
nd_rls.filter(Y)

xs_H1 = nd_rls.minimize_x()
xs_ref = xs_H1[K1_REF, K2_REF]  # store state variables as reference pulse shape
J_B = nd_rls.eval_errors(xs_H1)

N = nd_cost.get_alssm_order()

H_A = np.zeros((N, 1))
H_A[0, 0] = 1 #allow fitting of coefficient 0 (offset)

h_A = xs_ref.copy() #template
h_A[0] = 0 #set offset of template to 0 (will be estimated by minimize_x)
xs_H2 = nd_rls.minimize_x(H_A, h_A)
J_A = nd_rls.eval_errors(xs_H2)  # get SE (squared error) for hypothesis 1
cr = J_B / J_A

# ------------ Plotting -------------------------------
plot_costratio = True
if plot_costratio:
    # Convert grayscale image to RGB by stacking it three times along the last dimension
    image_rgb = np.stack([1 - Y] * 3, axis=-1)
    crdisplay = copy.copy(cr)
    crdisplayalpha = cr

    cvals = [0, 0.5, 0.60, 0.65, 1]
    colors = ["white", "yellow", "orange", "red", "red"]

    norm = plt.Normalize(min(cvals), max(cvals))
    tuples = list(zip(map(norm, cvals), colors))
    cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", tuples)

    colored_overlay = cmap(crdisplay)
    colored_overlay_rgb = colored_overlay[...,
                          :3] * 1.0  # We only need the RGB channels, so discard the alpha channel for now

    alpha = crdisplayalpha[..., np.newaxis]
    alpha = alpha / np.nanmax(alpha)  # normalize to 1
    highlighted_image = (1 - alpha) * image_rgb + alpha * colored_overlay_rgb

    # Convert back to float for display
    highlighted_image = np.clip(highlighted_image, 0.0, 1.0)

    # Plot the original and highlighted images
    figsize = (7.8 * 0.7, 4.9 * 0.7)
    fig = plt.figure(layout='constrained', figsize=figsize, dpi=72)
    axs = fig.subplots(1, 1)
    cset = axs.imshow(highlighted_image)

    # Get the colormap colors
    alphas_fullrange = np.linspace(0, 1, num=cmap.N)  # sigmoid(np.linspace(0,1,num=cmap.N))
    alphas_fullrange = np.clip(alphas_fullrange, 0.5, 1)
    colors = cmap(np.arange(cmap.N))
    colors[:, -1] = alphas_fullrange
    cmap_with_alpha = matplotlib.colors.ListedColormap(colors)
    cmap_test = matplotlib.colors.ListedColormap(cmap(np.arange(cmap.N)))

    norm = plt.Normalize(0, 1)  # Set the normalization from -1 to 0
    fig.colorbar(matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap_with_alpha), ax=axs, location='bottom', aspect=40)

    # plot reference area
    col_reference = 'xkcd:blue'
    k1boxmin = K2_REF - l_side
    k1boxmax = K2_REF + l_side
    k2boxmin = K1_REF - l_side
    k2boxmax = K1_REF + l_side
    axs.plot([k1boxmin, k1boxmin, k1boxmax, k1boxmax, k1boxmin], [k2boxmin, k2boxmax, k2boxmax, k2boxmin, k2boxmin],
             c=col_reference, lw=1, ls='--')
    plt.show()

plot_ref = True
if plot_ref:
    mappedtraj = lm.Trajectory.eval_y(nd_cost, xs_ref, (K1_REF,K2_REF), (K1,K2))

    width, height = 40, 40 # image cut-outsize

    figsize = (6.5, 4.8)
    fig = plt.figure(layout='constrained', figsize=figsize, dpi=72)
    ax = fig.add_subplot(121)
    cset = ax.imshow(Y, cmap='gray_r')
    csetlcr = ax.imshow(mappedtraj, cmap='hot', alpha=0.5)
    ax.axis((K2_REF - height, K2_REF + height, K1_REF + width, K1_REF - width))

    # 3D plot
    ax = fig.add_subplot(122, projection='3d')
    k1k1_, k2k2_ = np.meshgrid(range(K1_REF - width, K1_REF + width), range(K2_REF - height, K2_REF + height), indexing='ij')
    ax.plot_surface(k1k1_, k2k2_, Y[k1k1_, k2k2_], cmap='gray_r',alpha=0.6,zorder=1)

    ax.plot_surface(k1k1_, k2k2_, mappedtraj[k1k1_, k2k2_], cmap='plasma',alpha=0.7,zorder=3)
    ax.plot_wireframe(k1k1_, k2k2_, mappedtraj[k1k1_, k2k2_], colors='b',zorder=3)

    ax.set_xlabel("Y axis") #note the swapped axis
    ax.set_ylabel("X axis")
    ax.view_init(azim=-1, elev=46)  # Change azimuth (horizontal angle) and elevation (vertical angle)
    ax.set_zlabel("Intensity")