Source code for formhtr.libs.processing.read_content

from .rtree import Ensemble
from .barcode import read_barcode
from .process_area import general_text_area
from .checkbox import is_ticked



[docs]
def process_content(indetified_content, logsheet_image, config, checkbox_edges):
    """Extract a value for every configured region and gather unassigned OCR output.

    Each region in ``config.regions`` is cropped out of the page image and
    handled according to its ``content_type``:

    * ``'Barcode'``  -> ``read_barcode`` on the crop plus the OCR candidates.
    * ``'Checkbox'`` -> ``is_ticked`` on the crop alone.
    * anything else  -> ``general_text_area``; the region is flagged as numeric
      when its type is ``'Number'``.

    Args:
        indetified_content: OCR output handed unchanged to ``Ensemble``
            (per the module's documentation, a dict keyed by service name
            such as ``google`` / ``amazon`` / ``azure``).
        logsheet_image: Page image supporting 2-D ``[rows, cols]`` slicing.
        config: Object exposing ``regions``; it is also passed to ``Ensemble``.
        checkbox_edges: Forwarded to ``is_ticked`` as ``edge_ignore_percentage``.

    Returns:
        A ``(results, artefacts)`` tuple. ``results`` holds one
        ``[varname, content, fragment]`` list per region, in configuration
        order. ``artefacts`` maps every key returned by
        ``Ensemble.filter_artefacts()`` to a list of ``[content, image_crop]``
        pairs.
    """
    ensemble = Ensemble(indetified_content, config)

    results = []
    for roi in config.regions:
        crop = logsheet_image[roi.start_y:roi.end_y, roi.start_x:roi.end_x]

        # Queried for every region, including checkboxes that never use the
        # result, so the ensemble sees the same sequence of lookups.
        overlapping = ensemble.find_intersection(roi.get_coords())

        kind = roi.content_type
        if kind == 'Barcode':
            extracted = {'inferred': read_barcode(crop, overlapping)}
        elif kind == 'Checkbox':
            extracted = {
                'inferred': is_ticked(crop, edge_ignore_percentage=checkbox_edges)
            }
        else:
            extracted = general_text_area(overlapping, roi, kind == 'Number')

        results.append([roi.varname, extracted, crop])

    artefacts = {}
    for service, leftovers in ensemble.filter_artefacts().items():
        # Leftover coordinates are cast to int before being used as slice bounds.
        artefacts[service] = [
            [
                box.content,
                logsheet_image[int(box.start_y):int(box.end_y),
                               int(box.start_x):int(box.end_x)],
            ]
            for box in leftovers
        ]

    return results, artefacts