feat(fusion_accounting_ocr): pluggable OCR for vendor bills

Replaces Enterprise's account_invoice_extract with a Fusion-native pipeline:

Stage 1 (text extraction): Tesseract OCRs the bill attachment via pytesseract + pdf2image. A pluggable OCRProvider adapter pattern allows future Mindee / Google Document AI / Ollama-vision backends.

Stage 2 (field parsing): The fusion_accounting_ai LLMProvider reads the raw OCR text and returns structured invoice fields (vendor, invoice number, dates, amounts, line items) as JSON.

Draft invoice fields are auto-populated only when they are empty (never overwriting user-entered data). Vendor matching is by name against res.partner with supplier_rank > 0.

Adds:
- account.move.ocr_state (selection: not_requested/pending/processing/done/failed/manual)
- account.move.ocr_raw_text, ocr_extracted_data (Json), ocr_backend, ocr_confidence
- fusion.ocr.log (audit trail per OCR run)
- res.company.fusion_ocr_enabled / fusion_ocr_default_backend / auto_run
- /fusion/ocr/request_for_invoice JSON-RPC endpoint

Backend availability is detected at runtime via OCRProvider.is_available() classmethods. Tesseract 5.3.4 + pytesseract 0.3.13 + pdf2image 1.17.0 are installed in the container.

Tests: 13 (TesseractAdapter availability + image OCR; flow tests for draft autofill, no-attachment guard, customer-invoice guard, ref-not-overwritten; field parser empty/clean-json/markdown-fence/bad-JSON/provider-exception). All pass on westin-v19 OrbStack VM.

Made-with: Cursor
2026-04-20 00:32:50 -04:00
parent a730942d24
commit 125f48377a
24 changed files with 952 additions and 0 deletions
--- a/fusion_accounting_ocr/services/__init__.py
+++ b/fusion_accounting_ocr/services/__init__.py
@@ -0,0 +1,3 @@
+from . import ocr_providers
+from . import attachment_to_image
+from . import invoice_field_parser
--- a/fusion_accounting_ocr/services/attachment_to_image.py
+++ b/fusion_accounting_ocr/services/attachment_to_image.py
@@ -0,0 +1,43 @@
+"""Helper: turn an ir.attachment into a list of PIL.Image pages.
+
+Kept separate from the adapters so future backends (Ollama-vision, Mindee)
+that want PIL images directly don't have to re-implement the PDF rendering.
+"""
+
+import base64
+import io
+import logging
+
+_logger = logging.getLogger(__name__)
+
+
+def attachment_to_pages(attachment):
+    """Decode an ir.attachment into a list of PIL.Image pages.
+
+    PDFs (detected by mimetype or by the ``%PDF`` signature) are rasterised
+    with pdf2image at 200 dpi. Anything else is opened with PIL, and every
+    frame of a multi-frame image (e.g. a multi-page TIFF scan) becomes its
+    own page instead of only the first frame being returned.
+
+    :param attachment: record exposing ``datas`` (base64 payload),
+        ``mimetype`` and ``id``; may be falsy.
+    :return: list of PIL images, or ``[]`` on any failure (caller should
+        treat that as "no pages").
+    """
+    # Cheap guard first: with nothing to decode there is no reason to load
+    # (or complain about) the imaging libraries.
+    if not attachment or not attachment.datas:
+        return []
+
+    try:
+        from PIL import Image, ImageSequence
+        from pdf2image import convert_from_bytes
+    except ImportError as e:
+        _logger.warning("attachment_to_pages requires PIL + pdf2image: %s", e)
+        return []
+
+    try:
+        data = base64.b64decode(attachment.datas)
+    except Exception as e:
+        _logger.warning("Could not decode attachment %s: %s", attachment.id, e)
+        return []
+
+    mimetype = attachment.mimetype or ''
+    is_pdf = mimetype == 'application/pdf' or data[:4] == b'%PDF'
+    try:
+        if is_pdf:
+            return convert_from_bytes(data, dpi=200)
+        with Image.open(io.BytesIO(data)) as img:
+            # copy() loads each frame eagerly, so the pages stay usable
+            # after the source image is closed and decode errors surface
+            # here, where they are logged, rather than in the caller.
+            return [frame.copy() for frame in ImageSequence.Iterator(img)]
+    except Exception as e:
+        _logger.warning("Could not render attachment %s: %s", attachment.id, e)
+        return []
--- a/fusion_accounting_ocr/services/invoice_field_parser.py
+++ b/fusion_accounting_ocr/services/invoice_field_parser.py
@@ -0,0 +1,150 @@
+"""Stage-2 of the OCR pipeline: parse raw OCR text into structured invoice
+fields via the configured LLM provider.
+
+Mirrors the pattern in fusion_accounting_followup/services/followup_text_generator.py:
+look up an adapter by ir.config_parameter, fall back gracefully when no
+provider is configured, and never let an LLM hiccup nuke the OCR result.
+"""
+
+import json
+import logging
+
+_logger = logging.getLogger(__name__)
+
+
+# System message passed as ``system=`` on every field-extraction request.
+SYSTEM_PROMPT = (
+    "You are an invoice field extraction assistant. You read raw OCR text "
+    "from vendor bills and return a strict JSON object with the requested "
+    "fields. You never include commentary or markdown fences. When a field "
+    "cannot be determined from the text you return null for that field."
+)
+
+# User-message template, filled with ``USER_PROMPT.format(text=...)``.
+# The literal braces of the JSON skeleton are doubled ({{ }}) so that
+# str.format leaves them alone; ``{text}`` is the only placeholder.
+USER_PROMPT = """Given the raw OCR text of a vendor bill, return a JSON object
+with these fields (use null when unclear):
+
+{{
+  "vendor_name": <string, the seller/vendor company name>,
+  "invoice_number": <string, the bill or invoice reference number>,
+  "invoice_date": <string, ISO format YYYY-MM-DD>,
+  "due_date": <string or null, ISO format YYYY-MM-DD>,
+  "currency": <string, ISO 4217 code like CAD/USD/EUR>,
+  "subtotal": <number or null>,
+  "tax_total": <number or null>,
+  "total": <number, the grand total amount due>,
+  "line_items": [
+    {{"description": <string>, "quantity": <number or null>,
+      "unit_price": <number or null>, "amount": <number or null>}}
+  ]
+}}
+
+Return ONLY valid JSON, no commentary, no markdown fences.
+
+Raw OCR text:
+---
+{text}
+---
+"""
+
+
+def parse_invoice_fields(env, raw_text: str, *, provider=None) -> dict:
+    """Use the configured LLM provider to extract structured invoice fields.
+
+    :param env: Odoo environment, used to look up the provider when
+        ``provider`` is not supplied.
+    :param raw_text: raw OCR text; only the first 12000 characters are sent.
+    :param provider: optional adapter exposing ``complete(system=...,
+        messages=..., max_tokens=..., temperature=...)``; its reply may be
+        a dict with a ``content`` key or the content itself.
+    :return: dict with the keys of ``_empty_result()``. ``line_items`` is
+        always a list of dicts. On any failure (no provider, bad JSON,
+        non-object JSON, network error, etc.) an all-null result is
+        returned so the OCR raw text is still preserved for the AP user.
+    """
+    if not raw_text or not raw_text.strip():
+        return _empty_result()
+
+    if provider is None:
+        provider = _get_provider(env)
+    if provider is None:
+        _logger.info(
+            "No LLM provider configured for OCR field parsing; "
+            "raw OCR text preserved, fields left empty."
+        )
+        return _empty_result()
+
+    try:
+        user = USER_PROMPT.format(text=raw_text[:12000])
+        response = provider.complete(
+            system=SYSTEM_PROMPT,
+            messages=[{'role': 'user', 'content': user}],
+            max_tokens=1000,
+            temperature=0.1,
+        )
+        content = response.get('content') if isinstance(response, dict) else response
+        if not content:
+            return _empty_result()
+        parsed = _load_llm_json(content)
+    except json.JSONDecodeError as e:
+        _logger.warning("LLM returned non-JSON for OCR field parsing: %s", e)
+        return _empty_result()
+    except Exception as e:
+        _logger.warning("OCR field parsing failed: %s", e)
+        return _empty_result()
+
+    # Valid JSON that is not an object (a bare list, string, number...)
+    # carries no named fields; report it explicitly.
+    if not isinstance(parsed, dict):
+        _logger.warning(
+            "LLM returned a JSON %s instead of an object for OCR field parsing",
+            type(parsed).__name__,
+        )
+        return _empty_result()
+
+    # Start from the canonical empty schema so both paths share one key set.
+    result = _empty_result()
+    for key in result:
+        if key != 'line_items':
+            result[key] = parsed.get(key)
+    items = parsed.get('line_items')
+    if isinstance(items, list):
+        # Drop malformed entries so callers can rely on dict line items.
+        result['line_items'] = [item for item in items if isinstance(item, dict)]
+    return result
+
+
+def _load_llm_json(content):
+    """Decode the JSON payload of an LLM reply, tolerating fences and chatter.
+
+    Raises ``json.JSONDecodeError`` when no JSON can be recovered.
+    """
+    text = content.strip()
+    # LLMs sometimes wrap JSON in ```json ... ``` despite instructions.
+    if text.startswith('```'):
+        text = text.split('```', 2)[1]
+        if text[:4].lower() == 'json':
+            text = text[4:]
+    try:
+        return json.loads(text.strip())
+    except json.JSONDecodeError:
+        # Commentary around the payload ("Here is the JSON: ..."): retry on
+        # the outermost {...} span, re-raising the original error if none.
+        start, end = text.find('{'), text.rfind('}')
+        if start == -1 or end <= start:
+            raise
+        return json.loads(text[start:end + 1])
+
+
+def _empty_result():
+    """Return a fresh all-null result: every scalar field is None and
+    ``line_items`` is a new empty list."""
+    blank = dict.fromkeys((
+        'vendor_name',
+        'invoice_number',
+        'invoice_date',
+        'due_date',
+        'currency',
+        'subtotal',
+        'tax_total',
+        'total',
+    ))
+    blank['line_items'] = []
+    return blank
+
+
+def _get_provider(env):
+    """Look up the LLM adapter via ir.config_parameter.
+
+    Honours a feature-specific override
+    (``fusion_accounting.provider.ocr_field_parsing``) before falling back
+    to the suite-wide default (``fusion_accounting.provider.default``).
+    Only the adapter that was actually selected is imported, so a missing
+    or broken adapter module cannot disable the other one.
+
+    :param env: Odoo environment.
+    :return: an adapter instance, or None when no adapter is configured,
+        the name is not recognised, or the adapter cannot be imported or
+        instantiated.
+    """
+    param = env['ir.config_parameter'].sudo()
+    name = (
+        param.get_param('fusion_accounting.provider.ocr_field_parsing')
+        or param.get_param('fusion_accounting.provider.default')
+    )
+    if not name:
+        return None
+    try:
+        if name.startswith('openai'):
+            from odoo.addons.fusion_accounting_ai.services.adapters.openai_adapter import OpenAIAdapter
+            return OpenAIAdapter(env)
+        if name.startswith('claude'):
+            from odoo.addons.fusion_accounting_ai.services.adapters.claude import ClaudeAdapter
+            return ClaudeAdapter(env)
+    except ImportError as e:
+        _logger.debug("OCR field parser adapter for %s is not importable: %s", name, e)
+        return None
+    except Exception as e:
+        _logger.warning("OCR field parser could not instantiate %s: %s", name, e)
+        return None
+    return None
--- a/fusion_accounting_ocr/services/ocr_providers/__init__.py
+++ b/fusion_accounting_ocr/services/ocr_providers/__init__.py
@@ -0,0 +1,3 @@
+from . import base
+from . import tesseract_adapter
+from . import manual_adapter
--- a/fusion_accounting_ocr/services/ocr_providers/base.py
+++ b/fusion_accounting_ocr/services/ocr_providers/base.py
@@ -0,0 +1,40 @@
+"""OCRProvider contract - every backend must conform.
+
+Mirrors the LLMProvider pattern in fusion_accounting_ai. Future adapters
+(Mindee, Google Document AI, Ollama-vision) drop in alongside the default
+tesseract adapter without touching account.move.
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+
+
+@dataclass
+class OCRResult:
+    """Outcome of one OCR run, as returned by ``OCRProvider.extract()``."""
+
+    # Extracted text; the bundled adapters leave it empty on failure.
+    raw_text: str = ''
+    confidence: float = 0.0  # 0.0–1.0
+    # Number of pages the backend processed.
+    pages: int = 0
+    # Name of the backend that produced this result (e.g. 'tesseract').
+    backend: str = ''
+    # Failure description; empty when the run succeeded.
+    error: str = ''
+    # Presumably backend-specific extras; no adapter visible here fills it.
+    metadata: dict = field(default_factory=dict)
+
+
+class OCRProvider(ABC):
+    """Abstract OCR backend. Subclasses implement extract().
+
+    ``is_available()`` may be overridden to report missing runtime
+    dependencies; the base implementation always answers True.
+    """
+
+    # Backend identifier; concrete adapters override it.
+    name: str = 'base'
+
+    @abstractmethod
+    def extract(self, image_or_pdf_bytes: bytes, *, mimetype: str = 'application/pdf') -> OCRResult:
+        """Extract text from raw bytes.
+
+        ``mimetype`` hints whether to PDF-render (poppler) or image-decode
+        (PIL) the bytes. Implementations should still inspect the byte
+        signature for safety.
+
+        :param image_or_pdf_bytes: raw document bytes (PDF or image).
+        :param mimetype: keyword-only content-type hint; defaults to
+            ``'application/pdf'``.
+        :return: an ``OCRResult`` describing the run.
+        """
+        ...
+
+    @classmethod
+    def is_available(cls) -> bool:
+        """Return True if the backend's runtime deps are present.
+
+        The base implementation performs no check and always returns True.
+        """
+        return True
--- a/fusion_accounting_ocr/services/ocr_providers/manual_adapter.py
+++ b/fusion_accounting_ocr/services/ocr_providers/manual_adapter.py
@@ -0,0 +1,13 @@
+"""Manual fallback adapter - no real OCR, just marks the document as
+'awaiting manual entry'. Used when no real OCR backend is available
+or when the user explicitly disables OCR.
+"""
+
+from .base import OCRProvider, OCRResult
+
+
+class ManualAdapter(OCRProvider):
+    """No-op backend: performs no OCR and always reports an empty result."""
+
+    name = 'manual'
+
+    def extract(self, image_or_pdf_bytes, *, mimetype='application/pdf'):
+        """Ignore the input and return an empty result tagged 'manual'."""
+        return OCRResult(
+            raw_text='',
+            confidence=0.0,
+            pages=0,
+            backend='manual',
+        )
--- a/fusion_accounting_ocr/services/ocr_providers/tesseract_adapter.py
+++ b/fusion_accounting_ocr/services/ocr_providers/tesseract_adapter.py
@@ -0,0 +1,71 @@
+"""Tesseract OCR adapter.
+
+Uses the system tesseract binary via pytesseract, with poppler-backed
+PDF rendering via pdf2image. Inside the container these are pre-installed:
+- tesseract-ocr 5.3.4
+- pytesseract 0.3.13
+- pdf2image 1.17.0
+- poppler-utils
+"""
+
+import io
+import logging
+
+from .base import OCRProvider, OCRResult
+
+_logger = logging.getLogger(__name__)
+
+
+class TesseractAdapter(OCRProvider):
+    """OCR backend driving the tesseract binary through pytesseract.
+
+    PDFs are rasterised with pdf2image (200 dpi); images are decoded with
+    PIL. All third-party imports are deferred so the module can be loaded
+    even when the OCR stack is not installed.
+    """
+
+    name = 'tesseract'
+
+    @classmethod
+    def is_available(cls) -> bool:
+        """Return True when pytesseract, pdf2image, PIL and the tesseract
+        binary can all be loaded; any failure is logged at debug level."""
+        try:
+            import pytesseract
+            from pdf2image import convert_from_bytes  # noqa: F401
+            from PIL import Image  # noqa: F401
+            pytesseract.get_tesseract_version()
+            return True
+        except Exception as e:
+            _logger.debug("TesseractAdapter not available: %s", e)
+            return False
+
+    def extract(self, image_or_pdf_bytes, *, mimetype='application/pdf'):
+        """OCR a PDF or image and return an ``OCRResult``.
+
+        The byte signature decides the decoding path: data starting with
+        ``%PDF`` is rendered as a PDF. Anything else is tried as an image
+        first (every frame of a multi-frame image becomes a page), and is
+        rendered as a PDF only if that fails and ``mimetype`` claims PDF.
+        This keeps image bytes working when the caller relies on the
+        default ``mimetype``.
+
+        Never raises: every failure, including a missing dependency, is
+        reported through ``OCRResult.error``.
+        """
+        if not image_or_pdf_bytes:
+            return OCRResult(
+                raw_text='', confidence=0.0, pages=0,
+                backend='tesseract', error='No data to OCR',
+            )
+
+        try:
+            # Imported inside the try so that a missing dependency yields
+            # an error result like any other failure.
+            import pytesseract
+            from pdf2image import convert_from_bytes
+            from PIL import Image, ImageSequence
+
+            if image_or_pdf_bytes[:4] == b'%PDF':
+                pages = convert_from_bytes(image_or_pdf_bytes, dpi=200)
+            else:
+                try:
+                    with Image.open(io.BytesIO(image_or_pdf_bytes)) as img:
+                        # copy() detaches each frame from the source file.
+                        pages = [
+                            frame.copy()
+                            for frame in ImageSequence.Iterator(img)
+                        ]
+                except Exception:
+                    if mimetype != 'application/pdf':
+                        raise
+                    # Declared PDF without a leading signature: let
+                    # poppler have a go before giving up.
+                    pages = convert_from_bytes(image_or_pdf_bytes, dpi=200)
+
+            texts = [pytesseract.image_to_string(p) for p in pages]
+            full_text = '\n\f\n'.join(texts)
+
+            # Heuristic confidence - tesseract has a per-word conf in
+            # image_to_data, but a length proxy is fine for routing
+            # decisions. Surrounding whitespace (including the page
+            # separators of blank pages) is not counted, so an empty scan
+            # scores 0. Future: use pytesseract.image_to_data for a real
+            # average word-level confidence.
+            conf = min(1.0, len(full_text.strip()) / 1000.0)
+            return OCRResult(
+                raw_text=full_text,
+                confidence=conf,
+                pages=len(pages),
+                backend='tesseract',
+            )
+        except Exception as e:
+            _logger.warning("Tesseract OCR failed: %s", e)
+            return OCRResult(
+                raw_text='', confidence=0.0, pages=0,
+                backend='tesseract', error=str(e),
+            )