feat(fusion_accounting_ocr): pluggable OCR for vendor bills

Replaces Enterprise's account_invoice_extract with a Fusion-native pipeline:

Stage 1 (text extraction): Tesseract OCRs the bill attachment via
pytesseract + pdf2image. Pluggable OCRProvider adapter pattern allows
future Mindee / Google Document AI / Ollama-vision backends.

Stage 2 (field parsing): The fusion_accounting_ai LLMProvider reads the
raw OCR text and returns structured invoice fields (vendor, invoice
number, dates, amounts, line items) as JSON.

Draft invoice fields are auto-populated only where the field is still empty
(user-entered data is never overwritten). Vendors are matched by name
against res.partner records with supplier_rank > 0.

Adds:
- account.move.ocr_state (selection: not_requested/pending/processing/
  done/failed/manual)
- account.move.ocr_raw_text, ocr_extracted_data (Json), ocr_backend,
  ocr_confidence
- fusion.ocr.log (audit trail per OCR run)
- res.company.fusion_ocr_enabled / fusion_ocr_default_backend / auto_run
- /fusion/ocr/request_for_invoice JSON-RPC endpoint

Backend availability detected at runtime via OCRProvider.is_available()
classmethods. Tesseract 5.3.4 + pytesseract 0.3.13 + pdf2image 1.17.0
are installed in the container.

Tests: 13 (TesseractAdapter availability + image OCR; flow tests for
draft autofill, no-attachment guard, customer-invoice guard, ref-not-
overwritten; field parser empty/clean-json/markdown-fence/bad-JSON/
provider-exception). All pass on westin-v19 OrbStack VM.

Made-with: Cursor
This commit is contained in:
gsinghpal
2026-04-20 00:32:50 -04:00
parent a730942d24
commit 125f48377a
24 changed files with 952 additions and 0 deletions

View File

@@ -0,0 +1,3 @@
from . import ocr_providers
from . import attachment_to_image
from . import invoice_field_parser

View File

@@ -0,0 +1,43 @@
"""Helper: turn an ir.attachment into a list of PIL.Image pages.
Kept separate from the adapters so future backends (Ollama-vision, Mindee)
that want PIL images directly don't have to re-implement the PDF rendering.
"""
import base64
import io
import logging
_logger = logging.getLogger(__name__)
def attachment_to_pages(attachment):
    """Decode an ir.attachment into a list of PIL.Image pages.

    PDFs (detected by mimetype or by the ``%PDF`` signature) are rendered
    at 200 dpi through pdf2image, one image per page. Anything else is
    opened with PIL as a single image.

    :param attachment: record exposing ``datas`` (base64 payload),
        ``mimetype`` and ``id``; may be empty or ``None``.
    :return: list of ``PIL.Image`` pages. Returns ``[]`` on failure
        (caller should treat as no pages).
    """
    # Cheap guard first: with nothing to decode there is no reason to
    # import (or warn about) the rendering dependencies.
    if not attachment or not attachment.datas:
        return []
    try:
        from PIL import Image
        from pdf2image import convert_from_bytes
    except ImportError as e:
        _logger.warning("attachment_to_pages requires PIL + pdf2image: %s", e)
        return []
    try:
        data = base64.b64decode(attachment.datas)
    except Exception as e:
        _logger.warning("Could not decode attachment %s: %s", attachment.id, e)
        return []
    mimetype = attachment.mimetype or ''
    is_pdf = mimetype == 'application/pdf' or data[:4] == b'%PDF'
    try:
        if is_pdf:
            return convert_from_bytes(data, dpi=200)
        image = Image.open(io.BytesIO(data))
        # Image.open() only reads the header. Force the pixel decode here so
        # a truncated/corrupt file is reported by this function instead of
        # blowing up later in the caller.
        image.load()
        return [image]
    except Exception as e:
        _logger.warning("Could not render attachment %s: %s", attachment.id, e)
        return []

View File

@@ -0,0 +1,150 @@
"""Stage-2 of the OCR pipeline: parse raw OCR text into structured invoice
fields via the configured LLM provider.
Mirrors the pattern in fusion_accounting_followup/services/followup_text_generator.py:
look up an adapter by ir.config_parameter, fall back gracefully when no
provider is configured, and never let an LLM hiccup nuke the OCR result.
"""
import json
import logging
_logger = logging.getLogger(__name__)
# System message handed to the LLM provider as ``system=``: it fixes the
# assistant's role and the "strict JSON, null when unknown" output contract.
SYSTEM_PROMPT = (
"You are an invoice field extraction assistant. You read raw OCR text "
"from vendor bills and return a strict JSON object with the requested "
"fields. You never include commentary or markdown fences. When a field "
"cannot be determined from the text you return null for that field."
)
# User message template, filled in with ``USER_PROMPT.format(text=...)``.
# The doubled braces ({{ and }}) are str.format escapes that render as the
# literal JSON braces of the schema; ``{text}`` is the only substitution slot.
USER_PROMPT = """Given the raw OCR text of a vendor bill, return a JSON object
with these fields (use null when unclear):
{{
"vendor_name": <string, the seller/vendor company name>,
"invoice_number": <string, the bill or invoice reference number>,
"invoice_date": <string, ISO format YYYY-MM-DD>,
"due_date": <string or null, ISO format YYYY-MM-DD>,
"currency": <string, ISO 4217 code like CAD/USD/EUR>,
"subtotal": <number or null>,
"tax_total": <number or null>,
"total": <number, the grand total amount due>,
"line_items": [
{{"description": <string>, "quantity": <number or null>,
"unit_price": <number or null>, "amount": <number or null>}}
]
}}
Return ONLY valid JSON, no commentary, no markdown fences.
Raw OCR text:
---
{text}
---
"""
def parse_invoice_fields(env, raw_text: str, *, provider=None) -> dict:
    """Use the configured LLM provider to extract structured invoice fields.

    :param env: environment used to look up the provider when ``provider``
        is not supplied.
    :param raw_text: raw OCR text of the bill; only the first 12000
        characters are sent to the model.
    :param provider: optional object exposing
        ``complete(system=, messages=, max_tokens=, temperature=)``; its
        return value may be a dict with a ``content`` key or the content
        itself.
    :return: dict with the keys of ``_empty_result()``. On any failure (no
        provider, bad JSON, network error, etc.) returns an all-null result
        so the OCR raw text is still preserved for the AP user. Never raises.
    """
    if not raw_text or not raw_text.strip():
        return _empty_result()
    if provider is None:
        provider = _get_provider(env)
    if provider is None:
        _logger.info(
            "No LLM provider configured for OCR field parsing; "
            "raw OCR text preserved, fields left empty."
        )
        return _empty_result()
    try:
        # Hard cap on the OCR text embedded in the prompt (presumably to
        # bound prompt size for very long documents).
        user = USER_PROMPT.format(text=raw_text[:12000])
        response = provider.complete(
            system=SYSTEM_PROMPT,
            messages=[{'role': 'user', 'content': user}],
            max_tokens=1000,
            temperature=0.1,
        )
        content = response.get('content') if isinstance(response, dict) else response
        if not content:
            return _empty_result()
        parsed = _decode_llm_json(content)
        if not isinstance(parsed, dict):
            # Valid JSON but not an object (e.g. a bare list or string):
            # there are no named fields to read.
            _logger.warning(
                "LLM returned a JSON %s instead of an object for OCR field parsing",
                type(parsed).__name__,
            )
            return _empty_result()
        # Start from the canonical empty schema so both code paths always
        # expose the same keys. Only existing keys are reassigned, so the
        # dict does not change size while being iterated.
        result = _empty_result()
        for key in result:
            if key != 'line_items':
                result[key] = parsed.get(key)
        items = parsed.get('line_items')
        # Callers expect a list; drop anything else the model may invent.
        result['line_items'] = items if isinstance(items, list) else []
        return result
    except json.JSONDecodeError as e:
        _logger.warning("LLM returned non-JSON for OCR field parsing: %s", e)
        return _empty_result()
    except Exception as e:
        # Top-level boundary: an LLM/provider hiccup must never propagate.
        _logger.warning("OCR field parsing failed: %s", e)
        return _empty_result()


def _decode_llm_json(content):
    """Decode the JSON value contained in an LLM reply.

    Tolerates a leading markdown fence (with or without a ``json`` tag) and,
    when the reply as a whole is not valid JSON, prose around the first JSON
    object. Raises ``json.JSONDecodeError`` when nothing can be decoded.
    """
    text = content.strip()
    # LLMs sometimes wrap JSON in ```json ... ``` despite instructions.
    if text.startswith('```'):
        # Keep only what sits between the first and the second fence.
        text = text.split('```', 2)[1]
        if text[:4].lower() == 'json':
            text = text[4:]
    text = text.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Fallback for replies such as "Here is the JSON: {...} Hope it helps":
        # decode one JSON value starting at the first brace and ignore the rest.
        start = text.find('{')
        if start == -1:
            raise
        value, _end = json.JSONDecoder().raw_decode(text, start)
        return value
def _empty_result():
    """Return a fresh all-null result in the invoice-field schema.

    Every scalar field is ``None`` and ``line_items`` is a new empty list,
    so callers may mutate the returned dict freely.
    """
    scalar_fields = (
        'vendor_name',
        'invoice_number',
        'invoice_date',
        'due_date',
        'currency',
        'subtotal',
        'tax_total',
        'total',
    )
    blank = dict.fromkeys(scalar_fields)
    blank['line_items'] = []
    return blank
def _get_provider(env):
    """Look up the LLM adapter via ir.config_parameter.

    Honours a feature-specific override
    (``fusion_accounting.provider.ocr_field_parsing``) before falling back
    to the suite-wide default (``fusion_accounting.provider.default``).

    Only the adapter that matches the configured name is imported, so a
    missing or broken module for one provider cannot disable the other.

    :param env: environment giving access to ``ir.config_parameter``.
    :return: an adapter instance, or None when no adapter is
        configured/importable or the configured name is not recognised.
    """
    params = env['ir.config_parameter'].sudo()
    name = (
        params.get_param('fusion_accounting.provider.ocr_field_parsing')
        or params.get_param('fusion_accounting.provider.default')
    )
    if not name:
        return None
    try:
        if name.startswith('openai'):
            from odoo.addons.fusion_accounting_ai.services.adapters.openai_adapter import OpenAIAdapter
            return OpenAIAdapter(env)
        if name.startswith('claude'):
            from odoo.addons.fusion_accounting_ai.services.adapters.claude import ClaudeAdapter
            return ClaudeAdapter(env)
    except ImportError as e:
        _logger.warning("OCR field parser could not import adapter for %s: %s", name, e)
        return None
    except Exception as e:
        _logger.warning("OCR field parser could not instantiate %s: %s", name, e)
        return None
    # A name is configured but matches no known adapter prefix.
    _logger.warning("OCR field parser: unknown LLM provider %r", name)
    return None

View File

@@ -0,0 +1,3 @@
from . import base
from . import tesseract_adapter
from . import manual_adapter

View File

@@ -0,0 +1,40 @@
"""OCRProvider contract - every backend must conform.
Mirrors the LLMProvider pattern in fusion_accounting_ai. Future adapters
(Mindee, Google Document AI, Ollama-vision) drop in alongside the default
tesseract adapter without touching account.move.
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
@dataclass
class OCRResult:
    """Outcome of a single OCRProvider.extract() call.

    Every field has a default, so ``OCRResult()`` is a valid empty result.
    """

    # Extracted text; defaults to the empty string.
    raw_text: str = ''
    confidence: float = 0.0  # 0.0-1.0
    # Page count reported by the backend.
    pages: int = 0
    # Identifier of the backend that produced this result.
    backend: str = ''
    # Error message; the empty-string default means no error was recorded.
    error: str = ''
    # Free-form extras (presumably backend-specific; no adapter in view sets it).
    # default_factory gives each instance its own dict.
    metadata: dict = field(default_factory=dict)
class OCRProvider(ABC):
    """Abstract OCR backend. Subclasses implement extract().

    Because ``extract`` is abstract, this class cannot be instantiated
    directly; a concrete adapter must override it.
    """

    # Short backend identifier; 'base' is the placeholder on the abstract class.
    name: str = 'base'

    @abstractmethod
    def extract(self, image_or_pdf_bytes: bytes, *, mimetype: str = 'application/pdf') -> OCRResult:
        """Extract text from raw bytes.

        ``mimetype`` hints whether to PDF-render (poppler) or image-decode
        (PIL) the bytes. Implementations should still inspect the byte
        signature for safety.

        :param image_or_pdf_bytes: raw content of the document to OCR.
        :param mimetype: keyword-only hint; defaults to ``'application/pdf'``.
        :return: an ``OCRResult``.
        """
        ...

    @classmethod
    def is_available(cls) -> bool:
        """Return True if the backend's runtime deps are present.

        The base implementation always returns True; backends with optional
        dependencies override it.
        """
        return True

View File

@@ -0,0 +1,13 @@
"""Manual fallback adapter - no real OCR, just marks the document as
'awaiting manual entry'. Used when no real OCR backend is available
or when the user explicitly disables OCR.
"""
from .base import OCRProvider, OCRResult
class ManualAdapter(OCRProvider):
    """No-op backend: performs no OCR and always yields an empty result."""

    name = 'manual'

    def extract(self, image_or_pdf_bytes, *, mimetype='application/pdf'):
        """Ignore the input and return an empty result tagged 'manual'."""
        placeholder = OCRResult(
            backend='manual',
            pages=0,
            confidence=0.0,
            raw_text='',
        )
        return placeholder

View File

@@ -0,0 +1,71 @@
"""Tesseract OCR adapter.
Uses the system tesseract binary via pytesseract, with poppler-backed
PDF rendering via pdf2image. Inside the container these are pre-installed:
- tesseract-ocr 5.3.4
- pytesseract 0.3.13
- pdf2image 1.17.0
- poppler-utils
"""
import io
import logging
from .base import OCRProvider, OCRResult
_logger = logging.getLogger(__name__)
class TesseractAdapter(OCRProvider):
    """OCR backend built on pytesseract, with pdf2image for PDF rendering."""

    name = 'tesseract'

    @classmethod
    def is_available(cls) -> bool:
        """Return True when pytesseract, pdf2image and PIL import cleanly
        and pytesseract can report the tesseract version."""
        try:
            import pytesseract
            from pdf2image import convert_from_bytes  # noqa: F401
            from PIL import Image  # noqa: F401
            pytesseract.get_tesseract_version()
            return True
        except Exception as e:
            _logger.debug("TesseractAdapter not available: %s", e)
            return False

    def extract(self, image_or_pdf_bytes, *, mimetype='application/pdf'):
        """OCR a PDF or image and return the text of all pages.

        The byte signature decides the decoding path; ``mimetype`` is only
        a hint. Page texts are joined with a form-feed separator.

        :param image_or_pdf_bytes: raw document bytes.
        :param mimetype: keyword-only hint; defaults to ``'application/pdf'``.
        :return: ``OCRResult``. This method does not raise: any failure
            (missing dependency, unreadable input, tesseract error) is
            reported through ``OCRResult.error`` with empty text.
        """
        if not image_or_pdf_bytes:
            return OCRResult(
                raw_text='', confidence=0.0, pages=0,
                backend='tesseract', error='empty input',
            )
        try:
            # Imported lazily *inside* the guard so a missing dependency is
            # reported through OCRResult.error like every other failure.
            import pytesseract
            from pdf2image import convert_from_bytes
            from PIL import Image

            if image_or_pdf_bytes[:4] == b'%PDF':
                pages = convert_from_bytes(image_or_pdf_bytes, dpi=200)
            elif mimetype == 'application/pdf':
                # Declared (or defaulted) as PDF but without the signature:
                # most likely an image passed without a mimetype hint. Try
                # the image decoder first, then fall back to poppler for
                # PDFs whose header is not at offset 0.
                try:
                    pages = [Image.open(io.BytesIO(image_or_pdf_bytes))]
                except Exception:
                    pages = convert_from_bytes(image_or_pdf_bytes, dpi=200)
            else:
                pages = [Image.open(io.BytesIO(image_or_pdf_bytes))]

            texts = [pytesseract.image_to_string(page) for page in pages]
            full_text = '\n\f\n'.join(texts)
            # Heuristic confidence - tesseract has a per-word conf in
            # image_to_data, but a length proxy is fine for routing
            # decisions: 1000+ characters saturates at 1.0. Future: use
            # pytesseract.image_to_data for a real average word-level
            # confidence.
            conf = min(1.0, len(full_text) / 1000.0)
            return OCRResult(
                raw_text=full_text,
                confidence=conf,
                pages=len(pages),
                backend='tesseract',
            )
        except Exception as e:
            _logger.warning("Tesseract OCR failed: %s", e)
            return OCRResult(
                raw_text='', confidence=0.0, pages=0,
                backend='tesseract', error=str(e),
            )