Replaces Enterprise's account_invoice_extract with a Fusion-native pipeline: Stage 1 (text extraction): Tesseract OCRs the bill attachment via pytesseract + pdf2image. Pluggable OCRProvider adapter pattern allows future Mindee / Google Document AI / Ollama-vision backends. Stage 2 (field parsing): The fusion_accounting_ai LLMProvider reads the raw OCR text and returns structured invoice fields (vendor, invoice number, dates, amounts, line items) as JSON. Draft invoice fields are auto-populated for empty-only fields (never overwriting user-entered data). Vendor matching by name against res.partner with supplier_rank > 0. Adds: - account.move.ocr_state (selection: not_requested/pending/processing/done/failed/manual) - account.move.ocr_raw_text, ocr_extracted_data (Json), ocr_backend, ocr_confidence - fusion.ocr.log (audit trail per OCR run) - res.company.fusion_ocr_enabled / fusion_ocr_default_backend / auto_run - /fusion/ocr/request_for_invoice JSON-RPC endpoint Backend availability detected at runtime via OCRProvider.is_available() classmethods. Tesseract 5.3.4 + pytesseract 0.3.13 + pdf2image 1.17.0 are installed in the container. Tests: 13 (TesseractAdapter availability + image OCR; flow tests for draft autofill, no-attachment guard, customer-invoice guard, ref-not-overwritten; field parser empty/clean-json/markdown-fence/bad-JSON/provider-exception). All pass on westin-v19 OrbStack VM. Made-with: Cursor
181 lines
6.4 KiB
Python
181 lines
6.4 KiB
Python
"""account.move OCR extension.
|
|
|
|
Adds an OCR pipeline triggered manually (or, optionally, automatically when
|
|
a PDF/image is attached). Stage 1 is tesseract text extraction; stage 2 is
|
|
LLM field parsing through the existing fusion_accounting_ai adapter stack.
|
|
"""
|
|
|
|
import base64
|
|
import logging
|
|
|
|
from odoo import _, fields, models
|
|
from odoo.exceptions import UserError
|
|
|
|
from ..services.ocr_providers.tesseract_adapter import TesseractAdapter
|
|
from ..services.ocr_providers.manual_adapter import ManualAdapter
|
|
from ..services.invoice_field_parser import parse_invoice_fields
|
|
|
|
# Module-level logger, named after this module's import path.
_logger = logging.getLogger(__name__)


# Attachment MIME types considered when looking for a document to OCR.
SUPPORTED_MIMETYPES = (
    'application/pdf',
    'image/png',
    'image/jpeg',
    'image/jpg',
)
|
|
|
|
|
|
class AccountMove(models.Model):
    """OCR extension of ``account.move`` for vendor bills.

    Stage 1 extracts raw text from the most recent PDF/image attachment
    through an OCR provider; stage 2 passes that text to
    ``parse_invoice_fields`` and uses the structured result to pre-fill
    fields that are still empty on a draft bill.
    """

    _inherit = 'account.move'

    # Status of the OCR request for this move.
    ocr_state = fields.Selection(
        [
            ('not_requested', 'Not Requested'),
            ('pending', 'Pending'),
            ('processing', 'Processing'),
            ('done', 'Done'),
            ('failed', 'Failed'),
            ('manual', 'Manual Entry'),
        ],
        string='OCR State',
        default='not_requested',
        copy=False,
        tracking=True,
    )

    ocr_raw_text = fields.Text(
        string='OCR Raw Text', readonly=True, copy=False,
        help="Raw text extracted by the OCR backend.",
    )
    ocr_extracted_data = fields.Json(
        string='OCR Extracted Fields', readonly=True, copy=False,
        help="Structured invoice fields parsed from the OCR text by the LLM.",
    )
    ocr_backend = fields.Char(string='OCR Backend Used', readonly=True, copy=False)
    ocr_confidence = fields.Float(string='OCR Confidence', readonly=True, copy=False)
    ocr_log_ids = fields.One2many('fusion.ocr.log', 'move_id', string='OCR Runs')

    def action_request_ocr(self):
        """Run OCR on the most recent supported attachment of each move.

        Every move is validated before any OCR work starts, so a single
        invalid record aborts the call before time is spent on the others.

        :return: ``True`` once every move has been processed.
        :raises UserError: if a move is not a vendor bill/refund, or has no
            attachment whose MIME type is in ``SUPPORTED_MIMETYPES``.
        """
        Attachment = self.env['ir.attachment'].sudo()
        jobs = []
        for move in self:
            if move.move_type not in ('in_invoice', 'in_refund'):
                raise UserError(_("OCR currently supports vendor bills only."))
            attachment = Attachment.search(
                [
                    ('res_model', '=', 'account.move'),
                    ('res_id', '=', move.id),
                    ('mimetype', 'in', SUPPORTED_MIMETYPES),
                ],
                # ``id desc`` breaks ties between attachments that share the
                # same creation timestamp.
                order='create_date desc, id desc',
                limit=1,
            )
            if not attachment:
                raise UserError(
                    _("No PDF or image attachment found on %s", move.name or move.id)
                )
            jobs.append((move, attachment))

        for move, attachment in jobs:
            move._fusion_run_ocr(attachment)
        return True

    def _fusion_run_ocr(self, attachment):
        """Extract text from ``attachment``, parse it and pre-fill the move.

        Outcomes are recorded in ``ocr_state`` and posted to the chatter;
        errors are logged and reported rather than propagated.

        :param attachment: ``ir.attachment`` record holding the document.
        :return: ``True`` when the run completed, ``False`` when it failed or
            fell back to manual entry.
        """
        self.ensure_one()
        self.ocr_state = 'processing'

        company = self.company_id
        # The company-level setting may be absent from the registry; default
        # to tesseract in that case.
        backend_name = (
            company.fusion_ocr_default_backend
            if 'fusion_ocr_default_backend' in company._fields
            else 'tesseract'
        )
        provider = self._fusion_get_ocr_provider(backend_name)
        if not provider:
            # Defensive: the resolver in this class always returns an adapter,
            # but an override could return a falsy value.
            self.ocr_state = 'manual'
            self.message_post(
                body=_("No OCR backend available; falling back to manual entry.")
            )
            return False

        # NOTE(review): a database-level error inside this block would leave
        # the transaction aborted, so the writes in the handler would fail as
        # well -- consider wrapping the run in a savepoint.
        try:
            # Under a ``bin_size`` context ``datas`` yields a human-readable
            # size instead of the content, so that flag is disabled here.
            encoded = attachment.with_context(bin_size=False).datas
            data = base64.b64decode(encoded) if encoded else b''
            if not data:
                self.ocr_state = 'failed'
                self.message_post(
                    body=_("OCR failed: attachment %s has no content.", attachment.name)
                )
                return False

            result = provider.extract(
                data, mimetype=attachment.mimetype or 'application/pdf'
            )

            self.write({
                'ocr_raw_text': result.raw_text,
                'ocr_backend': result.backend,
                'ocr_confidence': result.confidence,
            })
            self.env['fusion.ocr.log'].sudo().create({
                'move_id': self.id,
                'backend': result.backend,
                'confidence': result.confidence,
                'raw_text_length': len(result.raw_text or ''),
                'pages': result.pages,
                'error': result.error,
            })

            # Only a run that produced no text AND reported an error counts
            # as failed; text alongside an error is still parsed.
            if not result.raw_text and result.error:
                self.ocr_state = 'failed'
                self.message_post(body=_("OCR failed: %s", result.error))
                return False

            parsed = parse_invoice_fields(self.env, result.raw_text)
            self.write({
                'ocr_extracted_data': parsed,
                'ocr_state': 'done',
            })

            self._fusion_apply_ocr_fields(parsed)
            self.message_post(
                # Confidence is multiplied by 100 for display, i.e. it is
                # treated as a 0-1 fraction here.
                body=_(
                    "OCR complete: %s confidence %.0f%%",
                    result.backend, (result.confidence or 0) * 100,
                )
            )
            return True
        except Exception as e:
            # Boundary handler: an OCR problem must not block the user's
            # action, so it is logged with traceback and surfaced in chatter.
            _logger.exception("OCR run failed for move %s", self.id)
            self.ocr_state = 'failed'
            self.message_post(body=_("OCR error: %s", e))
            return False

    def _fusion_get_ocr_provider(self, backend_name):
        """Return the OCR adapter to use for ``backend_name``.

        ``'manual'`` always yields a ``ManualAdapter``. Any other value
        (``'tesseract'``, an unset value, or a backend with no adapter yet)
        yields a ``TesseractAdapter`` when tesseract is available and a
        ``ManualAdapter`` otherwise.
        """
        if backend_name == 'manual':
            return ManualAdapter()
        # Future adapters (mindee, google_doc_ai, ollama_vision) plug in
        # here. Fall back to whichever adapter is actually usable.
        if TesseractAdapter.is_available():
            return TesseractAdapter()
        return ManualAdapter()

    def _fusion_apply_ocr_fields(self, parsed):
        """Apply parsed fields to a draft invoice without overwriting any
        user-entered data. No-op on posted/cancelled invoices.

        :param parsed: dict of parsed values; the keys read are
            ``invoice_date``, ``due_date``, ``invoice_number`` and
            ``vendor_name``. Anything that is not a dict is ignored.
        """
        self.ensure_one()
        if self.state != 'draft' or not isinstance(parsed, dict):
            return

        vals = {}

        # Validate each date on its own: a malformed value is skipped here
        # instead of making the final write fail and dropping the other
        # fields with it.
        date_targets = (
            ('invoice_date', 'invoice_date'),
            ('due_date', 'invoice_date_due'),
        )
        for key, field_name in date_targets:
            raw_value = parsed.get(key)
            if not raw_value or self[field_name]:
                continue
            try:
                if not isinstance(raw_value, str):
                    raise TypeError("expected a date string")
                vals[field_name] = fields.Date.to_date(raw_value.strip())
            except (TypeError, ValueError):
                _logger.info(
                    "Ignoring unparseable OCR %s %r on move %s",
                    key, raw_value, self.id,
                )

        invoice_number = parsed.get('invoice_number')
        if invoice_number and not self.ref:
            vals['ref'] = invoice_number

        # Vendor: best-effort name match against existing supplier partners.
        # Never auto-create a partner; AP user confirms ambiguous matches.
        vendor_name = parsed.get('vendor_name')
        if isinstance(vendor_name, str):
            vendor_name = vendor_name.strip()
        else:
            vendor_name = ''
        if vendor_name and not self.partner_id:
            # ``=ilike`` treats % and _ as wildcards; escape them so text
            # read off a document can only match a partner name literally.
            pattern = (
                vendor_name.replace('\\', '\\\\')
                .replace('%', '\\%')
                .replace('_', '\\_')
            )
            partner = self.env['res.partner'].sudo().search(
                [
                    ('name', '=ilike', pattern),
                    ('supplier_rank', '>', 0),
                    # The search runs as superuser, so restrict it explicitly
                    # to shared partners or partners of this move's company.
                    ('company_id', 'in', [False, self.company_id.id]),
                ],
                limit=1,
            )
            if partner:
                vals['partner_id'] = partner.id

        if vals:
            self.write(vals)
|