changes

2026-05-16 13:18:52 -04:00
parent 191a9c82be
commit 9ebf89bde2
1080 changed files with 0 additions and 1197 deletions
--- a/fusion_accounting/fusion_accounting_ocr/services/ocr_providers/__init__.py
+++ b/fusion_accounting/fusion_accounting_ocr/services/ocr_providers/__init__.py
@@ -0,0 +1,3 @@
+from . import base
+from . import tesseract_adapter
+from . import manual_adapter
--- a/fusion_accounting/fusion_accounting_ocr/services/ocr_providers/base.py
+++ b/fusion_accounting/fusion_accounting_ocr/services/ocr_providers/base.py
@@ -0,0 +1,40 @@
+"""OCRProvider contract - every backend must conform.
+
+Mirrors the LLMProvider pattern in fusion_accounting_ai. Future adapters
+(Mindee, Google Document AI, Ollama-vision) drop in alongside the default
+tesseract adapter without touching account.move.
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+
+
+@dataclass
+class OCRResult:
+    """Outcome of one OCR extraction attempt.
+
+    Every field carries a default, so a backend only fills in what it knows.
+    """
+
+    # Recognised text; empty string when nothing was extracted.
+    raw_text: str = ''
+    # Score attached by the backend, expected in the 0.0–1.0 range.
+    confidence: float = 0.0
+    # Number of pages (or single images) that were processed.
+    pages: int = 0
+    # Name of the backend that produced this result.
+    backend: str = ''
+    # Error description; stays empty unless the backend sets it.
+    error: str = ''
+    # Free-form backend-specific extras; every instance gets its own dict.
+    metadata: dict = field(default_factory=dict)
+
+
+class OCRProvider(ABC):
+    """Interface shared by all OCR backends.
+
+    Concrete providers must override ``extract()``; ``is_available()`` can be
+    overridden by backends that rely on optional runtime dependencies.
+    """
+
+    # Backend identifier; the concrete adapters replace it with their own.
+    name: str = 'base'
+
+    @classmethod
+    def is_available(cls) -> bool:
+        """Report whether the backend can run; the base default is True."""
+        return True
+
+    @abstractmethod
+    def extract(
+        self,
+        image_or_pdf_bytes: bytes,
+        *,
+        mimetype: str = 'application/pdf',
+    ) -> OCRResult:
+        """Run OCR over the raw document bytes.
+
+        ``mimetype`` is only a hint for choosing between PDF rendering
+        (poppler) and image decoding (PIL); implementations should also
+        check the byte signature rather than trusting the hint blindly.
+        """
+        ...
--- a/fusion_accounting/fusion_accounting_ocr/services/ocr_providers/manual_adapter.py
+++ b/fusion_accounting/fusion_accounting_ocr/services/ocr_providers/manual_adapter.py
@@ -0,0 +1,13 @@
+"""Manual fallback adapter - no real OCR, just marks the document as
+'awaiting manual entry'. Used when no real OCR backend is available
+or when the user explicitly disables OCR.
+"""
+
+from .base import OCRProvider, OCRResult
+
+
+class ManualAdapter(OCRProvider):
+    """Provider that performs no recognition and always yields an empty result."""
+
+    name = 'manual'
+
+    def extract(self, image_or_pdf_bytes, *, mimetype='application/pdf'):
+        """Ignore both arguments and return an empty result tagged 'manual'."""
+        empty_result = OCRResult(
+            raw_text='',
+            confidence=0.0,
+            pages=0,
+            backend='manual',
+        )
+        return empty_result
--- a/fusion_accounting/fusion_accounting_ocr/services/ocr_providers/tesseract_adapter.py
+++ b/fusion_accounting/fusion_accounting_ocr/services/ocr_providers/tesseract_adapter.py
@@ -0,0 +1,71 @@
+"""Tesseract OCR adapter.
+
+Uses the system tesseract binary via pytesseract, with poppler-backed
+PDF rendering via pdf2image. Inside the container these are pre-installed:
+- tesseract-ocr 5.3.4
+- pytesseract 0.3.13
+- pdf2image 1.17.0
+- poppler-utils
+"""
+
+import io
+import logging
+
+from .base import OCRProvider, OCRResult
+
+_logger = logging.getLogger(__name__)
+
+
+class TesseractAdapter(OCRProvider):
+    """OCR backend that renders PDFs with pdf2image and reads text with pytesseract."""
+
+    name = 'tesseract'
+
+    # Leading bytes of common raster formats. They let extract() override a
+    # wrong or defaulted ``mimetype`` so an image is never sent to the PDF
+    # renderer just because the caller omitted the hint.
+    _IMAGE_SIGNATURES = (
+        b'\x89PNG\r\n\x1a\n',  # PNG
+        b'\xff\xd8\xff',  # JPEG
+        b'GIF87a',
+        b'GIF89a',
+        b'II*\x00',  # TIFF, little-endian
+        b'MM\x00*',  # TIFF, big-endian
+    )
+
+    @classmethod
+    def is_available(cls) -> bool:
+        """Return True when pytesseract, pdf2image, PIL and the tesseract binary all load.
+
+        Any failure (missing module, missing binary) is logged at debug
+        level and reported as False.
+        """
+        try:
+            import pytesseract
+            from pdf2image import convert_from_bytes  # noqa: F401
+            from PIL import Image  # noqa: F401
+            pytesseract.get_tesseract_version()
+            return True
+        except Exception as e:
+            _logger.debug("TesseractAdapter not available: %s", e)
+            return False
+
+    @classmethod
+    def _is_pdf(cls, data, mimetype) -> bool:
+        """Decide whether ``data`` should go through the PDF renderer.
+
+        The byte signature wins over ``mimetype``: a ``%PDF`` header is always
+        a PDF, a known image header is never one, and only when the signature
+        is inconclusive does the mimetype hint decide.
+        """
+        head = bytes(data[:8])
+        if head[:4] == b'%PDF':
+            return True
+        if head.startswith(cls._IMAGE_SIGNATURES):
+            return False
+        return mimetype == 'application/pdf'
+
+    def extract(self, image_or_pdf_bytes, *, mimetype='application/pdf'):
+        """OCR a PDF or image and return an OCRResult.
+
+        PDFs are rasterised at 200 dpi, one image per page; anything else is
+        opened with PIL as a single page. Page texts are joined with
+        ``'\\n\\f\\n'``.
+
+        Failures during decoding or recognition (including empty input) do
+        not raise: they are logged as a warning and returned as an OCRResult
+        with ``error`` set and zero pages. Import errors for the OCR
+        libraries still propagate, as the imports sit outside the guard.
+        """
+        import pytesseract
+        from pdf2image import convert_from_bytes
+        from PIL import Image
+
+        try:
+            if not image_or_pdf_bytes:
+                # Fail fast through the normal error path instead of
+                # spawning poppler on nothing.
+                raise ValueError("no document bytes to OCR")
+
+            if self._is_pdf(image_or_pdf_bytes, mimetype):
+                pages = convert_from_bytes(image_or_pdf_bytes, dpi=200)
+            else:
+                pages = [Image.open(io.BytesIO(image_or_pdf_bytes))]
+
+            full_text = '\n\f\n'.join(
+                pytesseract.image_to_string(page) for page in pages
+            )
+
+            # Heuristic confidence - a length proxy is enough for routing
+            # decisions. Only non-whitespace characters count, so page
+            # separators and blank pages do not inflate the score.
+            # Future: use pytesseract.image_to_data for a real average
+            # word-level confidence.
+            visible_chars = len(''.join(full_text.split()))
+            conf = min(1.0, visible_chars / 1000.0)
+            return OCRResult(
+                raw_text=full_text,
+                confidence=conf,
+                pages=len(pages),
+                backend='tesseract',
+            )
+        except Exception as e:
+            _logger.warning("Tesseract OCR failed: %s", e)
+            return OCRResult(
+                raw_text='', confidence=0.0, pages=0,
+                backend='tesseract', error=str(e),
+            )