Source code for formhtr.logsheet
from __future__ import annotations
import json
from dataclasses import dataclass
from typing import Any
import numpy as np
from .libs.logsheet_config import LogsheetConfig
from .libs.pdf_to_image import (convert_pdf_to_image, get_image_size,
resize_image)
from .libs.processing.align_images import align_images
from .libs.processing.read_content import process_content
from .libs.processing.store_results import store_results, store_results_csv
from .libs.services.call_services import call_services
from .libs.statistics import compute_success_ratio
from .libs.visualise_regions import annotate_pdfs
from .manual_align import align_page
[docs]
@dataclass(frozen=True)
class ServiceCredentials:
    """Immutable bundle of OCR provider credentials for ``call_services``.

    Each field may be ``None`` when no credentials were supplied for the
    corresponding provider.

    Attributes:
        google_credentials_path: Location of the Google service-account JSON
            file (stored as a path, not as loaded content), or ``None``.
        amazon_credentials: Parsed Amazon credentials mapping, or ``None``.
        azure_credentials: Parsed Azure credentials mapping, or ``None``.
    """

    # Field order is part of the constructor signature; keep it stable.
    google_credentials_path: str | None
    amazon_credentials: dict[str, Any] | None
    azure_credentials: dict[str, Any] | None
[docs]
def load_credentials(
    *,
    google_credentials_path: str | None = None,
    amazon_credentials_path: str | None = None,
    azure_credentials_path: str | None = None,
) -> ServiceCredentials:
    """Load credential files into a ``ServiceCredentials`` instance.

    Args:
        google_credentials_path: Path to Google JSON. It is passed through
            unchanged; the file is not opened here.
        amazon_credentials_path: Path to Amazon JSON (``ACCESS_KEY``,
            ``SECRET_KEY``, ``REGION``). The file is parsed if given.
        azure_credentials_path: Path to Azure JSON (``SUBSCRIPTION_KEY``,
            ``ENDPOINT``). The file is parsed if given.

    Returns:
        Frozen dataclass holding the Google path and the parsed Amazon and
        Azure JSON content; providers without a path are ``None``.

    Raises:
        OSError: If a given Amazon/Azure credentials file cannot be opened.
        ValueError: If a given file is not valid UTF-8 encoded JSON
            (``json.JSONDecodeError`` and ``UnicodeDecodeError`` are both
            subclasses of ``ValueError``).
    """
    return ServiceCredentials(
        google_credentials_path=google_credentials_path,
        amazon_credentials=_load_json_file(amazon_credentials_path),
        azure_credentials=_load_json_file(azure_credentials_path),
    )


def _load_json_file(path: str | None) -> Any:
    """Return the parsed JSON content of ``path``, or ``None`` if no path is given."""
    if path is None:
        return None
    # JSON text is UTF-8 by specification; decoding explicitly avoids
    # depending on the platform's locale default encoding.
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)
[docs]
def preprocess_input(
    *,
    scanned_logsheet_pdf: str,
    template_pdf: str,
    config: LogsheetConfig,
    page: int,
    skip_alignment: bool,
    filter_grayscale: bool,
    max_size_mb: float = 4,
    dpi: int = 300,
    alignment_config_path: str | None = None,
):
    """Rasterize both PDFs, align the scan to the template, and cap its size.

    The whole pipeline is repeated with the DPI lowered by 50 each time for
    as long as ``get_image_size`` reports more than ``max_size_mb`` MiB and
    the current DPI is still above 50.

    Args:
        scanned_logsheet_pdf: Path to the scanned logsheet PDF.
        template_pdf: Path to the blank template PDF.
        config: Loaded layout; its ``width``/``height`` are the resize target.
        page: Page index in the scan PDF.
        skip_alignment: If True, no alignment step is run.
        filter_grayscale: Forwarded to ``align_images`` (automatic alignment).
        max_size_mb: Size limit in MiB that triggers a retry at lower DPI.
        dpi: Rasterization DPI of the first attempt.
        alignment_config_path: Optional JSON file with ``template_points`` and
            ``target_points``; when given, ``align_page`` is used instead of
            ``align_images``.

    Returns:
        The processed logsheet image, or ``None`` if the alignment step
        returned ``None``. Once the DPI has reached 50 or less the image is
        returned even if it is still above the size limit.
    """
    current_dpi = dpi
    while True:
        template_raster = np.array(
            convert_pdf_to_image(template_pdf, dpi=current_dpi))
        scan_raster = np.array(
            convert_pdf_to_image(scanned_logsheet_pdf, page, dpi=current_dpi))

        # Bring both rasters to the dimensions declared in the layout config.
        target_size = (config.width, config.height)
        scan_raster = resize_image(scan_raster, target_size)
        template_raster = resize_image(template_raster, target_size)

        if not skip_alignment:
            if alignment_config_path is None:
                # Automatic alignment.
                scan_raster = align_images(
                    scan_raster, template_raster, filter_grayscale)
            else:
                # Manual alignment from the point pairs stored in the JSON file.
                with open(alignment_config_path, 'r') as handle:
                    manual_points = json.load(handle)
                scan_raster = align_page(
                    scan_raster,
                    template_raster,
                    template_points=manual_points['template_points'],
                    target_points=manual_points['target_points'],
                )

        if scan_raster is None:
            return None

        too_large = get_image_size(scan_raster) > max_size_mb * 2**20
        # Accept the image when it fits, or when the DPI cannot sensibly be
        # lowered any further.
        if not too_large or current_dpi <= 50:
            return scan_raster

        current_dpi -= 50
[docs]
def extract_logsheet(
    *,
    scanned_logsheet_pdf: str,
    template_pdf: str,
    config_json: str,
    credentials: ServiceCredentials,
    debug: bool = False,
    front: bool = True,
    checkbox_edges: float = 0.2,
    skip_alignment: bool = False,
    filter_grayscale: bool = False,
    alignment_config_path: str | None = None,
):
    """Preprocess one page, query the OCR services, and parse the result.

    Args:
        scanned_logsheet_pdf: Path to the scanned PDF.
        template_pdf: Path to the template PDF.
        config_json: Path to the JSON config imported into ``LogsheetConfig``.
        credentials: Provider credentials (any subset may be set).
        debug: If True, ``annotate_pdfs`` is called with the OCR output.
        front: If True, page 0 of the scan is used; otherwise page 1.
        checkbox_edges: Forwarded to ``process_content``.
        skip_alignment: Forwarded to ``preprocess_input``.
        filter_grayscale: Forwarded to ``preprocess_input``.
        alignment_config_path: Optional manual alignment JSON, forwarded to
            ``preprocess_input``.

    Returns:
        Whatever ``process_content`` returns (callers unpack it as
        ``(results, artefacts)``), or ``(None, None)`` if preprocessing
        produced no image.
    """
    layout = LogsheetConfig([], [])
    layout.import_from_json(config_json)

    if front:
        page_index = 0
    else:
        page_index = 1

    aligned_scan = preprocess_input(
        scanned_logsheet_pdf=scanned_logsheet_pdf,
        template_pdf=template_pdf,
        config=layout,
        page=page_index,
        skip_alignment=skip_alignment,
        filter_grayscale=filter_grayscale,
        alignment_config_path=alignment_config_path,
    )
    if aligned_scan is None:
        return None, None

    # One entry per provider; the value is None when the provider is unset.
    provider_credentials = {
        "google": credentials.google_credentials_path,
        "amazon": credentials.amazon_credentials,
        "azure": credentials.azure_credentials,
    }
    ocr_output = call_services(aligned_scan, provider_credentials, layout)

    if debug:
        annotate_pdfs(ocr_output, aligned_scan, front)

    return process_content(ocr_output, aligned_scan, layout, checkbox_edges)
[docs]
def process_logsheet_to_xlsx(
    *,
    scanned_logsheet_pdf: str,
    template_pdf: str,
    config_json: str,
    output_xlsx: str,
    credentials: ServiceCredentials,
    debug: bool = False,
    backside: bool = False,
    backside_template_pdf: str | None = None,
    backside_config_json: str | None = None,
    ugly_checkboxes: bool = False,
    already_aligned: bool = False,
    filter_grayscale: bool = False,
    store_csv: bool = False,
    alignment_config_path: str | None = None,
    backside_alignment_config_path: str | None = None,
) -> float | None:
    """End-to-end extraction to spreadsheet or CSV, optionally both sides of a scan.

    Args:
        scanned_logsheet_pdf: Path to the scanned PDF.
        template_pdf: Front template PDF path.
        config_json: Front ROI config JSON path.
        output_xlsx: Output ``.xlsx`` or ``.csv`` path (see ``store_csv``).
        credentials: OCR credentials for enabled providers.
        debug: Enable debug output during extraction.
        backside: Whether to also extract page 1 and append its results.
        backside_template_pdf: Back template PDF (required if ``backside``).
        backside_config_json: Back config JSON (required if ``backside``).
        ugly_checkboxes: Use a checkbox edge ratio of 0.4 instead of 0.2.
        already_aligned: Skip alignment in ``preprocess_input``.
        filter_grayscale: Passed to automatic alignment.
        store_csv: If True, write via ``store_results_csv`` instead of
            ``store_results``.
        alignment_config_path: Optional front alignment JSON.
        backside_alignment_config_path: Optional back alignment JSON.

    Returns:
        The value returned by ``compute_success_ratio(contents, artefacts)``,
        or ``None`` if the front side could not be processed.
        NOTE(review): the signature annotates ``float`` while earlier
        documentation described a dict (``identified``, ``artefacts``,
        ``ratio``) -- confirm against ``compute_success_ratio``.

    Raises:
        ValueError: If ``backside`` is True but ``backside_template_pdf`` or
            ``backside_config_json`` is missing.
    """
    # Fail fast on an incomplete back-side configuration, before any OCR
    # service is called for the front side.
    if backside and (not backside_template_pdf or not backside_config_json):
        raise ValueError(
            "backside_template_pdf and backside_config_json are required when backside=True")

    # Options shared by the front and back extraction calls.
    shared_options = {
        "scanned_logsheet_pdf": scanned_logsheet_pdf,
        "credentials": credentials,
        "debug": debug,
        "checkbox_edges": 0.4 if ugly_checkboxes else 0.2,
        "skip_alignment": already_aligned,
        "filter_grayscale": filter_grayscale,
    }

    contents, artefacts = extract_logsheet(
        template_pdf=template_pdf,
        config_json=config_json,
        front=True,
        alignment_config_path=alignment_config_path,
        **shared_options,
    )
    if contents is None:
        return None

    if backside:
        # Only the extraction itself is best-effort. The merge below stays
        # outside the ``try`` so a failure there is not silently discarded
        # with half-merged results.
        try:
            contents_back, artefacts_back = extract_logsheet(
                template_pdf=backside_template_pdf,
                config_json=backside_config_json,
                front=False,
                alignment_config_path=backside_alignment_config_path,
                **shared_options,
            )
        except ValueError:
            # Back side requested but the page is actually blank (per the
            # original author's note): continue with front-side results only.
            contents_back = artefacts_back = None

        if contents_back is not None and artefacts_back is not None:
            contents += contents_back
            # Front-side keys drive the merge; each entry is extended with
            # the matching back-side entry.
            for key in artefacts:
                artefacts[key] = artefacts[key] + artefacts_back[key]

    ratio = compute_success_ratio(contents, artefacts)

    if store_csv:
        store_results_csv(contents, artefacts, output_xlsx)
    else:
        # Default output is an Excel sheet.
        store_results(contents, artefacts, output_xlsx)
    return ratio