Source code for formhtr.libs.processing.store_results

import csv
import os
import tempfile
from shutil import rmtree

import cv2
import xlsxwriter



[docs]
def order_results(values):
    """Return the usable provider values in a fixed priority order.

    The providers are looked up in the order ``inferred``, ``google``,
    ``amazon``, ``azure``. A provider that is missing from ``values`` or whose
    value is falsy (``None``, ``''``, ``0``, ``False``, ...) is skipped.

    Args:
        values: Mapping that may contain any of the provider keys above.

    Returns:
        List of the truthy values, ordered by provider priority.
    """
    provider_order = ('inferred', 'google', 'amazon', 'azure')
    candidates = (values.get(provider, None) for provider in provider_order)
    return [candidate for candidate in candidates if candidate]




[docs]
def write_header(worksheet):
    """Fill the first row of the sheet with the three column titles.

    Args:
        worksheet: Object exposing ``write(cell, text)``; in this module it is
            the ``xlsxwriter`` worksheet used for the Metadata tab.

    Returns:
        ``None``.
    """
    titles = (
        ('A1', 'Variable name'),
        ('B1', 'Extracted content'),
        ('C1', 'Cropped image'),
    )
    for cell, title in titles:
        worksheet.write(cell, title)




[docs]
def store_image(image, location, index):
    """
    Write an image to disk as a PNG so it can later be embedded by file name.

    Args:
        image (Image): image data handed to ``cv2.imwrite``
        location (str): directory in which the PNG is written
        index (int): identifier that makes the file name unique in ``location``

    Returns:
        str: ``{location}/cropped_image_{index}.png``, i.e. the path given to
        ``cv2.imwrite``. It is only as absolute as ``location`` is.
    """
    target_path = f'{location}/cropped_image_{index}.png'
    # PNG compression level 9; the result of cv2.imwrite is not checked here.
    png_options = [cv2.IMWRITE_PNG_COMPRESSION, 9]
    cv2.imwrite(target_path, image, png_options)
    return target_path




[docs]
def _place_crop(worksheet, column, row_number, crop, images_directory, image_index):
    """Save ``crop`` as a PNG, anchor it at ``column``/``row_number`` and return its width."""
    filename = store_image(crop, images_directory, image_index)
    # Only the first two axes are needed, so 2-D (single-channel) crops work
    # as well as 3-D (height, width, channels) ones.
    height, width = crop.shape[:2]
    worksheet.insert_image(f'{column}{row_number}', filename)
    # Cell references are one-based, set_row_pixels gets the zero-based row.
    worksheet.set_row_pixels(row_number - 1, height)
    return width


def store_results(results, artefacts, output_file, include_validation=False):
    """Write ROI results and artefact crops into an XLSX workbook.

    The workbook gets two sheets: ``Metadata`` (variable name, extracted
    content, cropped image per result) and ``Extra`` (per service: its name
    followed by one row per artefact with a non-empty crop).

    Crops are written as PNG files into a uniquely named scratch directory
    created next to ``output_file``. The directory is removed when the
    function returns or raises, so an unrelated ``images`` directory that
    already exists next to the output is never touched and concurrent calls
    do not interfere with each other.

    Args:
        results: List of ``[varname, value_dict, crop_numpy]`` rows.
        artefacts: Dict mapping service name to ``[text, crop_numpy]`` lists.
        output_file: Path to the ``.xlsx`` file to create. Its parent
            directory is created if it does not exist.
        include_validation: If True, add Excel data validation where applicable.

    Returns:
        ``None``. The workbook is closed only when every row was processed
        successfully.
    """
    directory = os.path.dirname(output_file) or os.curdir
    os.makedirs(directory, exist_ok=True)
    images_directory = tempfile.mkdtemp(prefix='images_', dir=directory)

    try:
        # create a new Excel file and add a worksheet
        workbook = xlsxwriter.Workbook(output_file)
        worksheet = workbook.add_worksheet('Metadata')
        write_header(worksheet)

        bool_format = workbook.add_format({'bg_color': '#f1e740'})

        # One running counter names every crop file of both sheets, so files
        # can never overwrite each other before the workbook is closed.
        image_index = 0
        max_width = 0

        # fill in data (row 1 holds the header, data starts at row 2)
        for row_number, result in enumerate(results, 2):
            provider_values = result[1]
            content_cell = f'B{row_number}'

            worksheet.write(f'A{row_number}', result[0])
            values = order_results(provider_values)
            if include_validation and len(values) > 1:
                worksheet.data_validation(
                    content_cell,
                    {'validate': 'list', 'show_error': False, 'source': values})

            # Prefer the inferred value; fall back to the first provider value.
            inferred = provider_values.get('inferred', None)
            if inferred is None and values:
                inferred = values[0]

            worksheet.write(content_cell, inferred)

            if isinstance(inferred, bool):
                if include_validation:
                    worksheet.data_validation(
                        content_cell,
                        {'validate': 'list', 'show_error': False, 'source': [True, False]})
                # Highlight the cell while it holds TRUE.
                worksheet.conditional_format(content_cell, {'type': 'cell',
                                                            'criteria': '==',
                                                            'value': True,
                                                            'format': bool_format})

            image_index += 1
            width = _place_crop(
                worksheet, 'C', row_number, result[2], images_directory, image_index)
            max_width = max(width, max_width)

        worksheet.set_column_pixels(2, 3, max_width)
        worksheet.autofit()

        max_width = 0

        # add extra identified content
        extra_worksheet = workbook.add_worksheet('Extra')
        row_number = 1
        for key, extras in artefacts.items():
            if len(extras) == 0:
                continue

            extra_worksheet.write(f'A{row_number}', key)
            row_number += 1
            for extra in extras:
                # Artefacts with an empty crop are left out entirely.
                if extra[1].size == 0:
                    continue

                extra_worksheet.write(f'A{row_number}', extra[0])
                image_index += 1
                width = _place_crop(
                    extra_worksheet, 'B', row_number, extra[1], images_directory, image_index)
                max_width = max(width, max_width)
                row_number += 1
            # blank row between services
            row_number += 1

        extra_worksheet.set_column_pixels(1, 2, max_width)
        extra_worksheet.autofit()

        # The PNG files must still exist here: they are only removed below.
        workbook.close()
    finally:
        rmtree(images_directory)




[docs]
def store_results_csv(results, artefacts, output_file):
    """Export one ``varname, inferred value`` line per result as UTF-8 CSV.

    The value written is the ``inferred`` entry of the result's value dict.
    When that entry is missing or ``None``, the first value returned by
    ``order_results`` is used instead; if there is none, ``None`` is written.

    Args:
        results: List of ``[varname, value_dict, crop]`` (crop ignored).
        artefacts: Not used by this function.
        output_file: Path to the ``.csv`` file to create.

    Returns:
        ``None``.
    """
    with open(output_file, mode='w', newline='', encoding='utf-8') as handle:
        csv_writer = csv.writer(handle)
        csv_writer.writerow(['varname', 'inferred value'])

        for entry in results:
            provider_values = entry[1]
            chosen = provider_values.get('inferred', None)
            if chosen is None:
                # No inferred value: fall back to the first provider value.
                candidates = order_results(provider_values)
                if candidates:
                    chosen = candidates[0]

            csv_writer.writerow([entry[0], chosen])