Source code for formhtr.logsheet

from __future__ import annotations

import json
import logging
from dataclasses import dataclass
from typing import Any

import numpy as np

from .libs.logsheet_config import LogsheetConfig
from .libs.pdf_to_image import (convert_pdf_to_image, get_image_size,
                                resize_image)
from .libs.processing.align_images import align_images
from .libs.processing.read_content import process_content
from .libs.processing.store_results import store_results, store_results_csv
from .libs.services.call_services import call_services
from .libs.statistics import compute_success_ratio
from .libs.visualise_regions import annotate_pdfs
from .manual_align import align_page



[docs]
@dataclass(frozen=True)
class ServiceCredentials:
    """Immutable bundle of OCR provider credentials for ``call_services``.

    Every field defaults to ``None`` so that only the providers actually in
    use have to be supplied, mirroring the defaults of ``load_credentials``.

    Attributes:
        google_credentials_path: Path to Google service-account JSON, or ``None``.
            The path is stored as given; the file is not read here.
        amazon_credentials: Loaded Amazon credentials dict, or ``None``.
        azure_credentials: Loaded Azure credentials dict, or ``None``.
    """

    google_credentials_path: str | None = None
    amazon_credentials: dict[str, Any] | None = None
    azure_credentials: dict[str, Any] | None = None




[docs]
def _read_json_credentials(path: str | None) -> dict[str, Any] | None:
    """Return the parsed JSON document stored at ``path``, or ``None`` if no path is given."""
    if path is None:
        return None
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


def load_credentials(
    *,
    google_credentials_path: str | None = None,
    amazon_credentials_path: str | None = None,
    azure_credentials_path: str | None = None,
) -> ServiceCredentials:
    """Load credential files into a ``ServiceCredentials`` instance.

    The Amazon and Azure files are parsed as UTF-8 JSON; the Google path is
    passed through untouched.

    Args:
        google_credentials_path: Path to Google JSON (not loaded here).
        amazon_credentials_path: Path to Amazon JSON (``ACCESS_KEY``, ``SECRET_KEY``, ``REGION``).
        azure_credentials_path: Path to Azure JSON (``SUBSCRIPTION_KEY``, ``ENDPOINT``).

    Returns:
        Frozen dataclass holding the Google path and the parsed Amazon/Azure
        documents; each entry is ``None`` when its path was not given.

    Raises:
        OSError: If a given Amazon/Azure file cannot be opened.
        json.JSONDecodeError: If a given Amazon/Azure file is not valid JSON.
    """
    return ServiceCredentials(
        google_credentials_path=google_credentials_path,
        amazon_credentials=_read_json_credentials(amazon_credentials_path),
        azure_credentials=_read_json_credentials(azure_credentials_path),
    )




[docs]
def preprocess_input(
    *,
    scanned_logsheet_pdf: str,
    template_pdf: str,
    config: LogsheetConfig,
    page: int,
    skip_alignment: bool,
    filter_grayscale: bool,
    max_size_mb: float = 4,
    dpi: int = 300,
    alignment_config_path: str | None = None,
) -> np.ndarray | None:
    """Rasterize PDFs, align scan to template, and enforce a maximum image size.

    The scan and the template are rasterized at ``dpi``, resized to the
    layout dimensions and (unless skipped) aligned. If the size reported by
    ``get_image_size`` exceeds ``max_size_mb`` MiB, the whole procedure is
    repeated at ``dpi - 50`` until the image fits or ``dpi`` is 50 or lower.

    Args:
        scanned_logsheet_pdf: Path to the scanned logsheet PDF.
        template_pdf: Path to the blank template PDF.
        config: Loaded layout (width/height used for resizing).
        page: Page index in the scan PDF.
        skip_alignment: If True, no alignment is performed at all.
        filter_grayscale: Forwarded to ``align_images`` (automatic alignment).
        max_size_mb: Size limit in MiB that triggers a retry at lower ``dpi``.
        dpi: Initial rasterization DPI.
        alignment_config_path: Optional JSON with ``template_points`` and
            ``target_points``; when given (and alignment is not skipped),
            ``align_page`` is used instead of automatic alignment.

    Returns:
        The processed logsheet image, or ``None`` if alignment yields no image.
        When ``dpi`` has reached 50 or lower the image is returned even if it
        is still above the size limit.

    Raises:
        OSError: If the alignment config cannot be opened.
        json.JSONDecodeError: If the alignment config is not valid JSON.
        KeyError: If the alignment config lacks one of the two point lists.
    """
    use_manual_alignment = not skip_alignment and alignment_config_path is not None

    # The manual alignment points do not depend on the DPI, so read them once
    # instead of on every retry.
    template_points = None
    target_points = None
    if use_manual_alignment:
        # JSON is UTF-8 by specification; do not depend on the locale encoding.
        with open(alignment_config_path, encoding="utf-8") as handle:
            align_config = json.load(handle)
        template_points = align_config['template_points']
        target_points = align_config['target_points']

    size_limit = max_size_mb * 2**20
    target_size = (config.width, config.height)

    while True:
        template_image = np.array(convert_pdf_to_image(template_pdf, dpi=dpi))
        logsheet_image = np.array(convert_pdf_to_image(
            scanned_logsheet_pdf, page, dpi=dpi))

        logsheet_image = resize_image(logsheet_image, target_size)
        template_image = resize_image(template_image, target_size)

        if use_manual_alignment:
            logsheet_image = align_page(logsheet_image, template_image,
                                        template_points=template_points,
                                        target_points=target_points)
        elif not skip_alignment:
            logsheet_image = align_images(
                logsheet_image, template_image, filter_grayscale)

        if logsheet_image is None:
            return None

        too_large = get_image_size(logsheet_image) > size_limit
        # At the DPI floor the image is handed back even when still too large.
        if not too_large or dpi <= 50:
            return logsheet_image

        dpi -= 50




[docs]
def extract_logsheet(
    *,
    scanned_logsheet_pdf: str,
    template_pdf: str,
    config_json: str,
    credentials: ServiceCredentials,
    debug: bool = False,
    front: bool = True,
    checkbox_edges: float = 0.2,
    skip_alignment: bool = False,
    filter_grayscale: bool = False,
    alignment_config_path: str | None = None,
):
    """Run the single-page pipeline: preprocess, call OCR services, parse content.

    Args:
        scanned_logsheet_pdf: Path to the scanned PDF.
        template_pdf: Path to the template PDF.
        config_json: Path to the JSON layout config loaded into ``LogsheetConfig``.
        credentials: Provider credentials forwarded to ``call_services``.
        debug: If True, ``annotate_pdfs`` is called with the OCR output.
        front: If True, page 0 of the scan is used; otherwise page 1.
        checkbox_edges: Forwarded to ``process_content``.
        skip_alignment: Forwarded to ``preprocess_input``.
        filter_grayscale: Forwarded to ``preprocess_input``.
        alignment_config_path: Optional manual alignment JSON, forwarded to
            ``preprocess_input``.

    Returns:
        Whatever ``process_content`` returns (unpacked by callers as
        ``(results, artefacts)``), or ``(None, None)`` when preprocessing
        produced no image.
    """
    layout = LogsheetConfig([], [])
    layout.import_from_json(config_json)

    aligned_page = preprocess_input(
        scanned_logsheet_pdf=scanned_logsheet_pdf,
        template_pdf=template_pdf,
        config=layout,
        # Front side is page 0, back side is page 1.
        page=int(not front),
        skip_alignment=skip_alignment,
        filter_grayscale=filter_grayscale,
        alignment_config_path=alignment_config_path,
    )
    if aligned_page is None:
        return None, None

    provider_credentials = {
        "google": credentials.google_credentials_path,
        "amazon": credentials.amazon_credentials,
        "azure": credentials.azure_credentials,
    }
    ocr_output = call_services(aligned_page, provider_credentials, layout)

    if debug:
        annotate_pdfs(ocr_output, aligned_page, front)

    return process_content(ocr_output, aligned_page, layout, checkbox_edges)




[docs]
def process_logsheet_to_xlsx(
    *,
    scanned_logsheet_pdf: str,
    template_pdf: str,
    config_json: str,
    output_xlsx: str,
    credentials: ServiceCredentials,
    debug: bool = False,
    backside: bool = False,
    backside_template_pdf: str | None = None,
    backside_config_json: str | None = None,
    ugly_checkboxes: bool = False,
    already_aligned: bool = False,
    filter_grayscale: bool = False,
    store_csv: bool = False,
    alignment_config_path: str | None = None,
    backside_alignment_config_path: str | None = None,
) -> float | None:
    """End-to-end extraction to spreadsheet or CSV, optionally both sides of a scan.

    Args:
        scanned_logsheet_pdf: Path to the scanned PDF.
        template_pdf: Front template PDF path.
        config_json: Front ROI config JSON path.
        output_xlsx: Output path handed to ``store_results`` or
            ``store_results_csv`` (see ``store_csv``).
        credentials: OCR credentials for enabled providers.
        debug: Forwarded to ``extract_logsheet``.
        backside: Whether to also extract page 1 and append its results.
        backside_template_pdf: Back template PDF (required if ``backside``).
        backside_config_json: Back config JSON (required if ``backside``).
        ugly_checkboxes: Use a checkbox edge ratio of 0.4 instead of 0.2.
        already_aligned: Skip alignment in preprocessing.
        filter_grayscale: Forwarded to ``extract_logsheet``.
        store_csv: If True, write via ``store_results_csv`` instead of
            ``store_results``.
        alignment_config_path: Optional front alignment JSON.
        backside_alignment_config_path: Optional back alignment JSON.

    Returns:
        The value returned by ``compute_success_ratio`` for the merged
        results, or ``None`` if the front side could not be processed.
        NOTE(review): the annotation says ``float`` while earlier docs
        described a dict (``identified``, ``artefacts``, ``ratio``) — confirm
        against ``compute_success_ratio``.

    Raises:
        ValueError: If ``backside`` is True but the back template or back
            config is missing. This is checked before any processing starts.
    """
    # Fail fast: reject an incomplete back-side setup before the front side
    # is rasterized and sent to the OCR services.
    if backside and (not backside_template_pdf or not backside_config_json):
        raise ValueError(
            "backside_template_pdf and backside_config_json are required when backside=True")

    checkbox_edges = 0.4 if ugly_checkboxes else 0.2

    # Settings that are identical for the front and the back side.
    shared_options = {
        "scanned_logsheet_pdf": scanned_logsheet_pdf,
        "credentials": credentials,
        "debug": debug,
        "checkbox_edges": checkbox_edges,
        "skip_alignment": already_aligned,
        "filter_grayscale": filter_grayscale,
    }

    contents, artefacts = extract_logsheet(
        template_pdf=template_pdf,
        config_json=config_json,
        front=True,
        alignment_config_path=alignment_config_path,
        **shared_options,
    )

    if contents is None:
        return None

    if backside:
        try:
            contents_back, artefacts_back = extract_logsheet(
                template_pdf=backside_template_pdf,
                config_json=backside_config_json,
                front=False,
                alignment_config_path=backside_alignment_config_path,
                **shared_options,
            )
            if contents_back is not None and artefacts_back is not None:
                contents += contents_back
                for key in artefacts:
                    artefacts[key] = artefacts[key] + artefacts_back[key]
        except ValueError as err:
            # Best effort: a ValueError here is treated as "back side present
            # but actually a blank page". Log it instead of hiding it, since
            # other ValueErrors (e.g. malformed JSON) end up here as well.
            logging.getLogger(__name__).warning(
                "Skipping back side of %s: %s", scanned_logsheet_pdf, err)

    ratio = compute_success_ratio(contents, artefacts)
    if store_csv:
        store_results_csv(contents, artefacts, output_xlsx)
    else:
        # store to Excel sheet
        store_results(contents, artefacts, output_xlsx)

    return ratio