feat(fusion_accounting_ocr): pluggable OCR for vendor bills
Replaces Enterprise's account_invoice_extract with a Fusion-native pipeline: Stage 1 (text extraction): Tesseract OCRs the bill attachment via pytesseract + pdf2image. Pluggable OCRProvider adapter pattern allows future Mindee / Google Document AI / Ollama-vision backends. Stage 2 (field parsing): The fusion_accounting_ai LLMProvider reads the raw OCR text and returns structured invoice fields (vendor, invoice number, dates, amounts, line items) as JSON. Draft invoice fields are auto-populated only where the field is still empty (never overwriting user-entered data). Vendors are matched by name against res.partner with supplier_rank > 0. Adds: - account.move.ocr_state (selection: not_requested/pending/processing/done/failed/manual) - account.move.ocr_raw_text, ocr_extracted_data (Json), ocr_backend, ocr_confidence - fusion.ocr.log (audit trail per OCR run) - res.company.fusion_ocr_enabled / fusion_ocr_default_backend / auto_run - /fusion/ocr/request_for_invoice JSON-RPC endpoint Backend availability is detected at runtime via OCRProvider.is_available() classmethods. Tesseract 5.3.4 + pytesseract 0.3.13 + pdf2image 1.17.0 are installed in the container. Tests: 13 (TesseractAdapter availability + image OCR; flow tests for draft autofill, no-attachment guard, customer-invoice guard, ref-not-overwritten; field parser empty/clean-json/markdown-fence/bad-JSON/provider-exception). All pass on westin-v19 OrbStack VM. Made-with: Cursor
This commit is contained in:
3
fusion_accounting_ocr/services/__init__.py
Normal file
3
fusion_accounting_ocr/services/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from . import ocr_providers
|
||||
from . import attachment_to_image
|
||||
from . import invoice_field_parser
|
||||
43
fusion_accounting_ocr/services/attachment_to_image.py
Normal file
43
fusion_accounting_ocr/services/attachment_to_image.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""Helper: turn an ir.attachment into a list of PIL.Image pages.
|
||||
|
||||
Kept separate from the adapters so future backends (Ollama-vision, Mindee)
|
||||
that want PIL images directly don't have to re-implement the PDF rendering.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import io
|
||||
import logging
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def attachment_to_pages(attachment):
    """Decode an ir.attachment into a list of PIL.Image pages.

    PDFs (recognised by the ``application/pdf`` mimetype or the ``%PDF``
    byte signature) are rendered at 200 dpi through pdf2image; any other
    payload is opened as a single image.

    :param attachment: object exposing ``datas`` (base64 payload),
        ``mimetype`` and ``id``. May be falsy.
    :return: list of PIL images; ``[]`` on failure (caller should treat
        as no pages).
    """
    try:
        from PIL import Image
        from pdf2image import convert_from_bytes
    except ImportError as e:
        _logger.warning("attachment_to_pages requires PIL + pdf2image: %s", e)
        return []

    if not attachment or not attachment.datas:
        return []

    try:
        data = base64.b64decode(attachment.datas)
    except Exception as e:
        _logger.warning("Could not decode attachment %s: %s", attachment.id, e)
        return []

    mimetype = attachment.mimetype or ''
    is_pdf = mimetype == 'application/pdf' or data[:4] == b'%PDF'
    try:
        if is_pdf:
            return convert_from_bytes(data, dpi=200)
        image = Image.open(io.BytesIO(data))
        # Image.open() only reads the header. Force the pixel data to be
        # decoded here so a corrupt or truncated file is reported as
        # "no pages" instead of blowing up later in the caller.
        image.load()
        return [image]
    except Exception as e:
        _logger.warning("Could not render attachment %s: %s", attachment.id, e)
        return []
|
||||
150
fusion_accounting_ocr/services/invoice_field_parser.py
Normal file
150
fusion_accounting_ocr/services/invoice_field_parser.py
Normal file
@@ -0,0 +1,150 @@
|
||||
"""Stage-2 of the OCR pipeline: parse raw OCR text into structured invoice
|
||||
fields via the configured LLM provider.
|
||||
|
||||
Mirrors the pattern in fusion_accounting_followup/services/followup_text_generator.py:
|
||||
look up an adapter by ir.config_parameter, fall back gracefully when no
|
||||
provider is configured, and never let an LLM hiccup nuke the OCR result.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# System message sent with every field-extraction request.
SYSTEM_PROMPT = (
    "You are an invoice field extraction assistant. "
    "You read raw OCR text from vendor bills and return a strict JSON object "
    "with the requested fields. "
    "You never include commentary or markdown fences. "
    "When a field cannot be determined from the text you return null for "
    "that field."
)

# User message template. It is rendered with ``str.format(text=...)``, which
# is why the literal JSON braces are doubled.
USER_PROMPT = """Given the raw OCR text of a vendor bill, return a JSON object
with these fields (use null when unclear):

{{
  "vendor_name": <string, the seller/vendor company name>,
  "invoice_number": <string, the bill or invoice reference number>,
  "invoice_date": <string, ISO format YYYY-MM-DD>,
  "due_date": <string or null, ISO format YYYY-MM-DD>,
  "currency": <string, ISO 4217 code like CAD/USD/EUR>,
  "subtotal": <number or null>,
  "tax_total": <number or null>,
  "total": <number, the grand total amount due>,
  "line_items": [
    {{"description": <string>, "quantity": <number or null>,
      "unit_price": <number or null>, "amount": <number or null>}}
  ]
}}

Return ONLY valid JSON, no commentary, no markdown fences.

Raw OCR text:
---
{text}
---
"""
|
||||
|
||||
|
||||
def parse_invoice_fields(env, raw_text: str, *, provider=None) -> dict:
    """Use the configured LLM provider to extract structured invoice fields.

    :param env: environment handed to ``_get_provider`` when no explicit
        ``provider`` is given.
    :param raw_text: raw OCR text of the bill; only the first 12000
        characters are sent to the model.
    :param provider: optional adapter exposing
        ``complete(system=, messages=, max_tokens=, temperature=)`` that
        returns either a string or a dict with a ``'content'`` entry.
    :return: dict with the same keys as ``_empty_result()``. On any failure
        (no provider, bad JSON, network error, etc.) an all-null result is
        returned so the OCR raw text is still preserved for the AP user.
    """
    if not raw_text or not raw_text.strip():
        return _empty_result()

    if provider is None:
        provider = _get_provider(env)
    if provider is None:
        _logger.info(
            "No LLM provider configured for OCR field parsing; "
            "raw OCR text preserved, fields left empty."
        )
        return _empty_result()

    try:
        user = USER_PROMPT.format(text=raw_text[:12000])
        # NOTE(review): 1000 completion tokens may truncate the reply for
        # bills with many line items - confirm against real documents.
        response = provider.complete(
            system=SYSTEM_PROMPT,
            messages=[{'role': 'user', 'content': user}],
            max_tokens=1000,
            temperature=0.1,
        )
        content = response.get('content') if isinstance(response, dict) else response
        if not content:
            return _empty_result()
        parsed = _load_llm_json(content)
    except json.JSONDecodeError as e:
        _logger.warning("LLM returned non-JSON for OCR field parsing: %s", e)
        return _empty_result()
    except Exception as e:
        _logger.warning("OCR field parsing failed: %s", e)
        return _empty_result()

    if not isinstance(parsed, dict):
        # Valid JSON, but a list/string/number carries no named fields.
        _logger.warning(
            "LLM returned a JSON %s instead of an object for OCR field parsing",
            type(parsed).__name__,
        )
        return _empty_result()

    # Start from the all-null schema so the key set always matches it and
    # unexpected keys in the model's reply are dropped.
    result = _empty_result()
    for key in result:
        if key != 'line_items':
            result[key] = parsed.get(key)
    line_items = parsed.get('line_items')
    if isinstance(line_items, list):
        result['line_items'] = line_items
    return result


def _load_llm_json(content):
    """Decode the JSON document embedded in an LLM reply.

    Tolerates a surrounding markdown fence (with or without a ``json`` tag in
    any letter case) and, when the reply does not parse as-is, commentary
    before or after the outermost ``{...}`` object.

    :raises json.JSONDecodeError: when no JSON can be decoded.
    """
    text = content.strip()
    if text.startswith('```'):
        # Keep what sits between the opening fence and the next fence (or
        # the end of the reply when the closing fence is missing).
        text = text.split('```', 2)[1]
        if text[:4].lower() == 'json':
            text = text[4:]
    text = text.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find('{'), text.rfind('}')
        if start == -1 or end <= start:
            raise
        # Second chance: the model added prose around the object.
        return json.loads(text[start:end + 1])
|
||||
|
||||
|
||||
def _empty_result():
    """Return a fresh all-null result: scalar fields ``None``, no line items."""
    scalar_fields = (
        'vendor_name',
        'invoice_number',
        'invoice_date',
        'due_date',
        'currency',
        'subtotal',
        'tax_total',
        'total',
    )
    blank = dict.fromkeys(scalar_fields)
    # A new list on every call, so callers can mutate it safely.
    blank['line_items'] = []
    return blank
|
||||
|
||||
|
||||
def _get_provider(env):
    """Look up the LLM adapter via ir.config_parameter.

    Honours a feature-specific override
    (``fusion_accounting.provider.ocr_field_parsing``) before falling back
    to the suite-wide default (``fusion_accounting.provider.default``).

    Only the adapter that the configured name selects is imported, so a
    missing or broken adapter module cannot disable the other provider.

    :param env: environment used to read ``ir.config_parameter``; it is also
        passed to the adapter constructor.
    :return: an adapter instance, or None when nothing is configured, the
        name is not recognised, or the adapter cannot be imported or
        instantiated.
    """
    params = env['ir.config_parameter'].sudo()
    name = (
        params.get_param('fusion_accounting.provider.ocr_field_parsing')
        or params.get_param('fusion_accounting.provider.default')
    )
    if not name:
        return None

    try:
        if name.startswith('openai'):
            from odoo.addons.fusion_accounting_ai.services.adapters.openai_adapter import OpenAIAdapter
            return OpenAIAdapter(env)
        if name.startswith('claude'):
            from odoo.addons.fusion_accounting_ai.services.adapters.claude import ClaudeAdapter
            return ClaudeAdapter(env)
    except ImportError as e:
        _logger.warning("OCR field parser could not import the adapter for %s: %s", name, e)
        return None
    except Exception as e:
        _logger.warning("OCR field parser could not instantiate %s: %s", name, e)
        return None

    _logger.warning("OCR field parser does not know LLM provider %r", name)
    return None
|
||||
3
fusion_accounting_ocr/services/ocr_providers/__init__.py
Normal file
3
fusion_accounting_ocr/services/ocr_providers/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from . import base
|
||||
from . import tesseract_adapter
|
||||
from . import manual_adapter
|
||||
40
fusion_accounting_ocr/services/ocr_providers/base.py
Normal file
40
fusion_accounting_ocr/services/ocr_providers/base.py
Normal file
@@ -0,0 +1,40 @@
|
||||
"""OCRProvider contract - every backend must conform.
|
||||
|
||||
Mirrors the LLMProvider pattern in fusion_accounting_ai. Future adapters
|
||||
(Mindee, Google Document AI, Ollama-vision) drop in alongside the default
|
||||
tesseract adapter without touching account.move.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class OCRResult:
    """Outcome of one OCR run, as returned by ``OCRProvider.extract``."""

    # Extracted text; empty when nothing was read.
    raw_text: str = ''
    # Score on a 0.0-1.0 scale.
    confidence: float = 0.0
    # Number of pages processed.
    pages: int = 0
    # Name of the backend that produced the result.
    backend: str = ''
    # Error message; empty when the run succeeded.
    error: str = ''
    # Free-form extra data; every instance gets its own dict.
    metadata: dict = field(default_factory=dict)
|
||||
|
||||
|
||||
class OCRProvider(ABC):
    """Contract shared by all OCR backends.

    Concrete backends set ``name`` and implement :meth:`extract`; they may
    override :meth:`is_available` to probe for their runtime dependencies.
    """

    # Backend identifier.
    name: str = 'base'

    @classmethod
    def is_available(cls) -> bool:
        """Tell whether the backend's runtime deps are present (base: always True)."""
        return True

    @abstractmethod
    def extract(self, image_or_pdf_bytes: bytes, *, mimetype: str = 'application/pdf') -> OCRResult:
        """Extract text from raw document bytes.

        ``mimetype`` is a hint for choosing between PDF rendering (poppler)
        and image decoding (PIL). Implementations should still inspect the
        byte signature for safety.
        """
|
||||
@@ -0,0 +1,13 @@
|
||||
"""Manual fallback adapter - no real OCR, just marks the document as
|
||||
'awaiting manual entry'. Used when no real OCR backend is available
|
||||
or when the user explicitly disables OCR.
|
||||
"""
|
||||
|
||||
from .base import OCRProvider, OCRResult
|
||||
|
||||
|
||||
class ManualAdapter(OCRProvider):
    """Fallback backend that performs no OCR at all."""

    name = 'manual'

    def extract(self, image_or_pdf_bytes, *, mimetype='application/pdf'):
        """Ignore the input and return an empty result tagged ``'manual'``."""
        return OCRResult(
            raw_text='',
            confidence=0.0,
            pages=0,
            backend='manual',
        )
|
||||
@@ -0,0 +1,71 @@
|
||||
"""Tesseract OCR adapter.
|
||||
|
||||
Uses the system tesseract binary via pytesseract, with poppler-backed
|
||||
PDF rendering via pdf2image. Inside the container these are pre-installed:
|
||||
- tesseract-ocr 5.3.4
|
||||
- pytesseract 0.3.13
|
||||
- pdf2image 1.17.0
|
||||
- poppler-utils
|
||||
"""
|
||||
|
||||
import io
|
||||
import logging
|
||||
|
||||
from .base import OCRProvider, OCRResult
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TesseractAdapter(OCRProvider):
    """OCR backend built on pytesseract, pdf2image and PIL."""

    name = 'tesseract'

    @classmethod
    def is_available(cls) -> bool:
        """Return True when the Python libraries import and tesseract answers.

        Any failure (missing package, ``get_tesseract_version()`` raising)
        is logged at debug level and reported as unavailable.
        """
        try:
            import pytesseract
            from pdf2image import convert_from_bytes  # noqa: F401
            from PIL import Image  # noqa: F401
            pytesseract.get_tesseract_version()
            return True
        except Exception as e:
            _logger.debug("TesseractAdapter not available: %s", e)
            return False

    def extract(self, image_or_pdf_bytes, *, mimetype='application/pdf'):
        """OCR a PDF or image and return the text of all pages.

        :param image_or_pdf_bytes: raw document bytes.
        :param mimetype: hint only; the byte signature takes precedence
            (see :meth:`_render_pages`).
        :return: ``OCRResult`` whose ``raw_text`` joins the pages with
            ``'\\n\\f\\n'``. This method does not raise: any failure,
            including missing libraries, yields a result with ``error`` set
            and empty text.
        """
        try:
            # Imported inside the guard so a missing dependency produces an
            # error result instead of an ImportError for the caller.
            import pytesseract

            pages = self._render_pages(image_or_pdf_bytes, mimetype)
            full_text = '\n\f\n'.join(
                pytesseract.image_to_string(page) for page in pages
            )

            # Heuristic confidence - tesseract has a per-word conf in
            # image_to_data, but a length proxy is fine for routing
            # decisions. Future: use pytesseract.image_to_data for a real
            # average word-level confidence.
            conf = min(1.0, len(full_text) / 1000.0)
            return OCRResult(
                raw_text=full_text,
                confidence=conf,
                pages=len(pages),
                backend='tesseract',
            )
        except Exception as e:
            _logger.warning("Tesseract OCR failed: %s", e)
            return OCRResult(
                raw_text='', confidence=0.0, pages=0,
                backend='tesseract', error=str(e),
            )

    @staticmethod
    def _render_pages(data, mimetype):
        """Turn raw bytes into a list of PIL images, one per page.

        Bytes starting with ``%PDF`` are always rendered as a PDF at 200 dpi.
        Anything else is first opened as an image, so image bytes passed
        with the default ``application/pdf`` mimetype still work. Only when
        that fails and the mimetype claims PDF is PDF rendering attempted.
        """
        from pdf2image import convert_from_bytes
        from PIL import Image

        if data[:4] == b'%PDF':
            return convert_from_bytes(data, dpi=200)
        try:
            return [Image.open(io.BytesIO(data))]
        except Exception:
            if mimetype != 'application/pdf':
                raise
            return convert_from_bytes(data, dpi=200)
|
||||
Reference in New Issue
Block a user