Files
Odoo-Modules/fusion_accounting_ocr/services/invoice_field_parser.py
gsinghpal 125f48377a feat(fusion_accounting_ocr): pluggable OCR for vendor bills
Replaces Enterprise's account_invoice_extract with a Fusion-native pipeline:

Stage 1 (text extraction): Tesseract OCRs the bill attachment via
pytesseract + pdf2image. Pluggable OCRProvider adapter pattern allows
future Mindee / Google Document AI / Ollama-vision backends.

Stage 2 (field parsing): The fusion_accounting_ai LLMProvider reads the
raw OCR text and returns structured invoice fields (vendor, invoice
number, dates, amounts, line items) as JSON.

Draft invoice fields are auto-populated for empty-only fields (never
overwriting user-entered data). Vendor matching by name against
res.partner with supplier_rank > 0.

Adds:
- account.move.ocr_state (selection: not_requested/pending/processing/
  done/failed/manual)
- account.move.ocr_raw_text, ocr_extracted_data (Json), ocr_backend,
  ocr_confidence
- fusion.ocr.log (audit trail per OCR run)
- res.company.fusion_ocr_enabled / fusion_ocr_default_backend / auto_run
- /fusion/ocr/request_for_invoice JSON-RPC endpoint

Backend availability detected at runtime via OCRProvider.is_available()
classmethods. Tesseract 5.3.4 + pytesseract 0.3.13 + pdf2image 1.17.0
are installed in the container.

Tests: 13 (TesseractAdapter availability + image OCR; flow tests for
draft autofill, no-attachment guard, customer-invoice guard, ref-not-
overwritten; field parser empty/clean-json/markdown-fence/bad-JSON/
provider-exception). All pass on westin-v19 OrbStack VM.

Made-with: Cursor
2026-04-20 00:32:50 -04:00

151 lines
5.1 KiB
Python

"""Stage-2 of the OCR pipeline: parse raw OCR text into structured invoice
fields via the configured LLM provider.
Mirrors the pattern in fusion_accounting_followup/services/followup_text_generator.py:
look up an adapter by ir.config_parameter, fall back gracefully when no
provider is configured, and never let an LLM hiccup nuke the OCR result.
"""
import json
import logging
_logger = logging.getLogger(__name__)
# System-role instruction passed as ``system=`` to the provider's
# ``complete()`` call in parse_invoice_fields(). It asks for bare JSON with
# null for unknown fields; the parser still strips markdown fences because
# models do not always comply.
SYSTEM_PROMPT = (
"You are an invoice field extraction assistant. You read raw OCR text "
"from vendor bills and return a strict JSON object with the requested "
"fields. You never include commentary or markdown fences. When a field "
"cannot be determined from the text you return null for that field."
)
# User-message template for the field-parsing request. It is rendered with
# ``USER_PROMPT.format(text=...)``, so ``{text}`` is the only placeholder and
# every literal JSON brace in the schema is doubled (``{{`` / ``}}``) to
# survive str.format. The keys listed here are the same keys that
# parse_invoice_fields() reads back from the model's reply.
USER_PROMPT = """Given the raw OCR text of a vendor bill, return a JSON object
with these fields (use null when unclear):
{{
"vendor_name": <string, the seller/vendor company name>,
"invoice_number": <string, the bill or invoice reference number>,
"invoice_date": <string, ISO format YYYY-MM-DD>,
"due_date": <string or null, ISO format YYYY-MM-DD>,
"currency": <string, ISO 4217 code like CAD/USD/EUR>,
"subtotal": <number or null>,
"tax_total": <number or null>,
"total": <number, the grand total amount due>,
"line_items": [
{{"description": <string>, "quantity": <number or null>,
"unit_price": <number or null>, "amount": <number or null>}}
]
}}
Return ONLY valid JSON, no commentary, no markdown fences.
Raw OCR text:
---
{text}
---
"""
def parse_invoice_fields(env, raw_text: str, *, provider=None) -> dict:
    """Use the configured LLM provider to extract structured invoice fields.

    Args:
        env: Odoo environment; only used to look up a provider when
            ``provider`` is not supplied.
        raw_text: Raw OCR text of the bill. Only the first 12000 characters
            are sent to the model.
        provider: Optional adapter object exposing
            ``complete(system=, messages=, max_tokens=, temperature=)``.
            When omitted, the adapter is resolved via ``_get_provider(env)``.

    Returns:
        A dict with the keys ``vendor_name``, ``invoice_number``,
        ``invoice_date``, ``due_date``, ``currency``, ``subtotal``,
        ``tax_total``, ``total`` and ``line_items`` (always a list).
        On any failure (blank text, no provider, non-JSON or non-object
        reply, provider exception, ...) the all-null result from
        ``_empty_result()`` is returned so the raw OCR text is still
        preserved for the AP user. This function never raises.
    """
    if not raw_text or not raw_text.strip():
        return _empty_result()
    if provider is None:
        provider = _get_provider(env)
    if provider is None:
        _logger.info(
            "No LLM provider configured for OCR field parsing; "
            "raw OCR text preserved, fields left empty."
        )
        return _empty_result()
    try:
        # Cap the prompt size; very long OCR dumps are cut off here.
        truncated = raw_text[:12000]
        user = USER_PROMPT.format(text=truncated)
        response = provider.complete(
            system=SYSTEM_PROMPT,
            messages=[{'role': 'user', 'content': user}],
            max_tokens=1000,
            temperature=0.1,
        )
        # Adapters may return either a dict with a 'content' key or the
        # reply text itself.
        content = response.get('content') if isinstance(response, dict) else response
        if not content:
            return _empty_result()
        parsed = _extract_json_object(content)
        if not isinstance(parsed, dict):
            # Valid JSON, but a list/string/number/null carries no fields.
            _logger.warning(
                "LLM returned JSON of type %s instead of an object for "
                "OCR field parsing",
                type(parsed).__name__,
            )
            return _empty_result()
        line_items = parsed.get('line_items')
        if not isinstance(line_items, list):
            # Keep the documented contract: line_items is always a list.
            line_items = []
        return {
            'vendor_name': parsed.get('vendor_name'),
            'invoice_number': parsed.get('invoice_number'),
            'invoice_date': parsed.get('invoice_date'),
            'due_date': parsed.get('due_date'),
            'currency': parsed.get('currency'),
            'subtotal': parsed.get('subtotal'),
            'tax_total': parsed.get('tax_total'),
            'total': parsed.get('total'),
            'line_items': line_items,
        }
    except json.JSONDecodeError as e:
        _logger.warning("LLM returned non-JSON for OCR field parsing: %s", e)
        return _empty_result()
    except Exception as e:
        # Deliberate catch-all boundary: an LLM/provider failure must never
        # discard the OCR result.
        _logger.warning("OCR field parsing failed: %s", e)
        return _empty_result()


def _extract_json_object(content):
    """Decode the JSON document embedded in an LLM reply.

    Handles the common ways models deviate from "JSON only": a surrounding
    markdown code fence (with an optional, case-insensitive ``json`` tag),
    explanatory text before the object, and trailing commentary after it.

    Raises:
        json.JSONDecodeError: when no decodable JSON document is found.
    """
    text = content.strip()
    if text.startswith('```'):
        # Keep only what sits between the opening fence and the next fence
        # (or the end of the reply when the closing fence is missing).
        text = text.split('```', 2)[1]
        if text[:4].lower() == 'json':
            text = text[4:]
        text = text.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the first '{' and let raw_decode stop at the end of
        # that object, ignoring whatever the model appended afterwards.
        start = text.find('{')
        if start == -1:
            raise
        parsed, _end = json.JSONDecoder().raw_decode(text[start:])
        return parsed
def _empty_result():
    """Return a fresh all-null field dict with an empty ``line_items`` list.

    A new dict (and a new list) is built on every call, so callers may
    mutate the result without affecting later calls.
    """
    scalar_fields = (
        'vendor_name',
        'invoice_number',
        'invoice_date',
        'due_date',
        'currency',
        'subtotal',
        'tax_total',
        'total',
    )
    blank = dict.fromkeys(scalar_fields)
    blank['line_items'] = []
    return blank
def _get_provider(env):
    """Look up the LLM adapter via ir.config_parameter.

    Honours a feature-specific override
    (``fusion_accounting.provider.ocr_field_parsing``) before falling back
    to the suite-wide default (``fusion_accounting.provider.default``).

    Only the adapter module matching the configured name is imported, so a
    missing or broken module for one backend does not disable the other.

    Args:
        env: Odoo environment used to read the parameters and passed to the
            adapter constructor.

    Returns:
        An adapter instance, or None when no provider is configured, the
        name is not recognised, or the adapter cannot be imported or
        instantiated. Failures are logged, never raised.
    """
    param = env['ir.config_parameter'].sudo()
    name = (
        param.get_param('fusion_accounting.provider.ocr_field_parsing')
        or param.get_param('fusion_accounting.provider.default')
    )
    if not name:
        return None
    try:
        if name.startswith('openai'):
            from odoo.addons.fusion_accounting_ai.services.adapters.openai_adapter import OpenAIAdapter
            return OpenAIAdapter(env)
        if name.startswith('claude'):
            from odoo.addons.fusion_accounting_ai.services.adapters.claude import ClaudeAdapter
            return ClaudeAdapter(env)
    except ImportError as e:
        _logger.warning(
            "OCR field parser could not import the adapter for %s: %s", name, e
        )
        return None
    except Exception as e:
        _logger.warning("OCR field parser could not instantiate %s: %s", name, e)
        return None
    # A provider is configured but matches no known adapter prefix.
    _logger.warning(
        "OCR field parser does not recognise LLM provider %r; "
        "expected a name starting with 'openai' or 'claude'.",
        name,
    )
    return None