Source code for formhtr.libs.processing.process_area

from Bio import pairwise2
import numpy as np



[docs]
def is_a_number(string):
    """Polish given string and check if can be converted to float.

    Args:
        string (str): given string possibly containing a float

    Returns:
        str: polished string
    """
    string = string.replace(" ", "")
    string = string.replace(",", ".")
    try:
        float(string)
    except ValueError:
        return None
    return string




[docs]
def identify_number(values):
    """If some number were found, give them priority

    Otherwise we will handle the region is general text area

    Args:
        values (list): list of identified values

    Returns:
        str: identified number (as string for compatibility)
    """
    float_strings = [is_a_number(value) for value in values]
    filtered_items = list(filter(lambda item: item is not None, float_strings))
    if len(filtered_items) != 0:
        return max(set(filtered_items), key=filtered_items.count)




[docs]
def separate_to_lines(rectangles):
    """Split set of rectangles into lines.

    This is determined by center of rectangle being inside of previous rectangle bounds.

    Args:
        rectangles (list): given list of rectangles

    Returns:
        list of list: list of rectangles grouped to lines
    """
    rectangles.sort(key=lambda rectangle: rectangle.center_y)
    average_height = np.mean([rectangle.height for rectangle in rectangles])
    line_break_threshold = average_height * 0.5

    # Step 3: Group coordinates into lines
    lines = []
    current_line = []
    previous_y = rectangles[0].center_y

    for rectangle in rectangles:
        if abs(rectangle.center_y - previous_y) > line_break_threshold:
            # A line break is detected
            lines.append(current_line)
            current_line = []
        
        current_line.append(rectangle)
        previous_y = rectangle.center_y  # Update previous_y to the bottom of the current word

    # Don't forget to add the last line if it's not empty
    if current_line:
        lines.append(current_line)

    return lines




[docs]
def get_max_dimensions(candidates):
    """Identify dimensions of identified text
    used to determine its size

    Args:
        candidates (list): identified lines of rectangles

    Returns:
        tuple: pair of values
    """
    max_words = 0
    max_lines = 0

    for candidate in candidates.values():
        max_lines = max(max_lines, len(candidate))
        for line in candidate:
            max_words = max(max_words, len(line))

    return max_words, max_lines



def construct_lines(lines):
    return '\n'.join([' '.join([rectangle.content for rectangle in line]) for line in lines])



[docs]
def align_pairwise(string_1, string_2):
    """Align two strings

    Args:
        string_1 (str): first string
        string_2 (str): second string

    Returns:
        str: aligned string
    """
    alignments = pairwise2.align.globalxs(string_1, string_2, -3, -1, gap_char=' ')
    return alignments[0][0]




[docs]
def majority_vote(strings):
    """Vote on individual positions of identified words

    Args:
        strings (list): list of words (per service) corresponding to a line

    Returns:
        list: most probable list of words
    """
    # Pad strings to the same length
    max_length = max(len(s) for s in strings)
    padded_strings = [s.ljust(max_length) for s in strings]

    # Compute the majority-voted string
    result = []
    for chars in zip(*padded_strings):
        # Count occurrences of each character
        count = {}
        for char in chars:
            count[char] = count.get(char, 0) + 1
        
        # Get the character with maximum occurrence
        voted_char = max(count, key=count.get)
        result.append(voted_char)

    return ''.join(result)




[docs]
def remove_non_ascii(string):
    """Remove non-ascii characters (these do not work in the alignment)

    Args:
        string (str): input line

    Returns:
        str: line containing only ascii characters
    """
    return ''.join(char for char in string if ord(char) < 128)




[docs]
def identify_words(lines, is_number):
    """Identify words from lines.
    Behaves differently based on how many lines there are.

    Args:
        lines (list): given list of lines as strings
        is_number (bool): True if number(s) is/are expected

    Returns:
        str: identified word
    """
    # curate lines
    lines = list(map(remove_non_ascii, lines))
    lines = list(filter(None, lines))

    if len(lines) == 1:
        return lines[0]
    elif len(lines) == 2:
        values = [align_pairwise(lines[0], lines[1]), 
                  align_pairwise(lines[1], lines[0])]
        if is_number:
            number = identify_number(values)
            if number is not None:
                return number
        return majority_vote(values)
    elif len(lines) == 3:
        values = []
        for i in range(len(lines)):
            this = lines[i]
            other1 = lines[(i+1)%3]
            other2 = lines[(i+2)%3]

            align1 = align_pairwise(this, other1)
            align2 = align_pairwise(this, other2)

            result = align_pairwise(align1, align2)
            values.append(result)
        if is_number:
            number = identify_number(values)
            if number is not None:
                return number
        return majority_vote(values)

    


[docs]
def filter_exceeding_words(lines, roi):
    """Filter regions exceeding bounds of ROI

    There are three cases:
    1. None of the regions exceeds the bounds
    2. Some of them
    3. All of them

    We keep everything as is in cases 1. and 3.,
    in case 2. we filter out the exceeding ones
    (as at least one of the services thinks the 
    exceeding part does not belong here)

    Args:
        lines (list): given list of lines
        roi (ROI): respective ROI

    Returns:
        list: filtered lines
    """
    indicators = []
    reduced_lines = []
    for line in lines:
        reduced_line = []
        exceeding_indicator = False
        for rectangle in line:
            rectangle_exceeding = roi.exceeding_rectangle(rectangle)
            exceeding_indicator = exceeding_indicator or rectangle_exceeding
            if not rectangle_exceeding:
                reduced_line.append(rectangle)
        indicators.append(exceeding_indicator)
        reduced_lines.append(reduced_line)

    if not (all(indicators) or not any(indicators)):
        return reduced_lines
    return lines




[docs]
def process_lines(lines, roi, is_number):
    """Join lines to words let majority voting decide

    Args:
        lines (list): lists of rectangles organised in lines
        roi (ROI): given ROI
        is_number (bool): True if number(s) is/are expected
    """
    lines = filter_exceeding_words(lines, roi)
    lines_of_words = [[rectangle.content for rectangle in line] for line in lines]
    lines_of_words = filter(None, lines_of_words)
    return identify_words([' '.join(line) for line in lines_of_words], is_number)




[docs]
def align_lines(candidate_lines):
    """Group lines to categories by y-coordinate

    Also sort them by y-coordinate to ensure correct order.

    Args:
        candidate_lines (list): identified lines from all services

    Returns:
        list: lines grouped by y-coordinate
    """
    groups = dict()
    for lines in candidate_lines:
        for line in lines:
            center = np.mean([rectangle.center_y for rectangle in line])
            bottom = max([rectangle.end_y for rectangle in line])
            top = min([rectangle.start_y for rectangle in line])

            grouped = False
            for group_center in groups.keys():
                if bottom >= group_center >= top:
                    groups[group_center].append(line)
                    grouped = True
            if not grouped:
                groups[center] = [line]
    
    return [v for _, v in groups.items()]




[docs]
def general_text_area(candidates, roi, is_number):
    """Process text area

    Args:
        candidates (list of lists): identified rectangles intersecting ROI
        roi (ROI): given ROI
        is_number (bool): True if number(s) is/are expected

    Returns:
        str: extracted text
    """
    # seperate each by lines
    candidate_lines = dict()
    for key in candidates.keys():
        if candidates[key]:
            lines = separate_to_lines(candidates[key])
            for line in lines:
                line.sort()
            candidate_lines[key] = lines

    results = dict()

    max_words, max_lines = get_max_dimensions(candidate_lines)

    # if the text area is reasonably small
    if max_lines <= 3 and max_words <= 5:
        for key in candidate_lines:
            results[key] = construct_lines(candidate_lines[key])
    
        aligned_groups = align_lines(candidate_lines.values())
        words = []
        for group in aligned_groups:
            word = process_lines(group, roi, is_number)
            if word:
                words.append(word.strip())
        results['inferred'] = '\n'.join(words)
    else:
        results['inferred'] = construct_lines(list(candidate_lines.values())[0])

    return results