"""Stage-2 of the OCR pipeline: parse raw OCR text into structured invoice fields via the configured LLM provider. Mirrors the pattern in fusion_accounting_followup/services/followup_text_generator.py: look up an adapter by ir.config_parameter, fall back gracefully when no provider is configured, and never let an LLM hiccup nuke the OCR result. """ import json import logging _logger = logging.getLogger(__name__) SYSTEM_PROMPT = ( "You are an invoice field extraction assistant. You read raw OCR text " "from vendor bills and return a strict JSON object with the requested " "fields. You never include commentary or markdown fences. When a field " "cannot be determined from the text you return null for that field." ) USER_PROMPT = """Given the raw OCR text of a vendor bill, return a JSON object with these fields (use null when unclear): {{ "vendor_name": , "invoice_number": , "invoice_date": , "due_date": , "currency": , "subtotal": , "tax_total": , "total": , "line_items": [ {{"description": , "quantity": , "unit_price": , "amount": }} ] }} Return ONLY valid JSON, no commentary, no markdown fences. Raw OCR text: --- {text} --- """ def parse_invoice_fields(env, raw_text: str, *, provider=None) -> dict: """Use the configured LLM provider to extract structured invoice fields. Returns a dict with the schema above. On any failure (no provider, bad JSON, network error, etc.) returns an all-null result so the OCR raw text is still preserved for the AP user. """ if not raw_text or not raw_text.strip(): return _empty_result() if provider is None: provider = _get_provider(env) if provider is None: _logger.info( "No LLM provider configured for OCR field parsing; " "raw OCR text preserved, fields left empty." ) return _empty_result() try: truncated = raw_text[:12000] user = USER_PROMPT.format(text=truncated) response = provider.complete( system=SYSTEM_PROMPT, messages=[{'role': 'user', 'content': user}], max_tokens=1000, temperature=0.1, ) content = response.get('content') if isinstance(response, dict) else response if not content: return _empty_result() # LLMs sometimes wrap JSON in ```json ... ``` despite instructions. content = content.strip() if content.startswith('```'): content = content.split('```', 2)[1] if content.startswith('json'): content = content[4:] content = content.rsplit('```', 1)[0] parsed = json.loads(content.strip()) return { 'vendor_name': parsed.get('vendor_name'), 'invoice_number': parsed.get('invoice_number'), 'invoice_date': parsed.get('invoice_date'), 'due_date': parsed.get('due_date'), 'currency': parsed.get('currency'), 'subtotal': parsed.get('subtotal'), 'tax_total': parsed.get('tax_total'), 'total': parsed.get('total'), 'line_items': parsed.get('line_items') or [], } except json.JSONDecodeError as e: _logger.warning("LLM returned non-JSON for OCR field parsing: %s", e) return _empty_result() except Exception as e: _logger.warning("OCR field parsing failed: %s", e) return _empty_result() def _empty_result(): return { 'vendor_name': None, 'invoice_number': None, 'invoice_date': None, 'due_date': None, 'currency': None, 'subtotal': None, 'tax_total': None, 'total': None, 'line_items': [], } def _get_provider(env): """Look up the LLM adapter via ir.config_parameter. Honours a feature-specific override (``fusion_accounting.provider.ocr_field_parsing``) before falling back to the suite-wide default (``fusion_accounting.provider.default``). Returns None when no adapter is configured/importable. """ param = env['ir.config_parameter'].sudo() name = param.get_param('fusion_accounting.provider.ocr_field_parsing') if not name: name = param.get_param('fusion_accounting.provider.default') if not name: return None try: from odoo.addons.fusion_accounting_ai.services.adapters.openai_adapter import OpenAIAdapter from odoo.addons.fusion_accounting_ai.services.adapters.claude import ClaudeAdapter except ImportError: return None try: if name.startswith('openai'): return OpenAIAdapter(env) if name.startswith('claude'): return ClaudeAdapter(env) except Exception as e: _logger.warning("OCR field parser could not instantiate %s: %s", name, e) return None return None