"""account.move OCR extension. Adds an OCR pipeline triggered manually (or, optionally, automatically when a PDF/image is attached). Stage 1 is tesseract text extraction; stage 2 is LLM field parsing through the existing fusion_accounting_ai adapter stack. """ import base64 import logging from odoo import _, fields, models from odoo.exceptions import UserError from ..services.ocr_providers.tesseract_adapter import TesseractAdapter from ..services.ocr_providers.manual_adapter import ManualAdapter from ..services.invoice_field_parser import parse_invoice_fields _logger = logging.getLogger(__name__) SUPPORTED_MIMETYPES = ( 'application/pdf', 'image/png', 'image/jpeg', 'image/jpg', ) class AccountMove(models.Model): _inherit = 'account.move' ocr_state = fields.Selection( [ ('not_requested', 'Not Requested'), ('pending', 'Pending'), ('processing', 'Processing'), ('done', 'Done'), ('failed', 'Failed'), ('manual', 'Manual Entry'), ], default='not_requested', copy=False, tracking=True, ) ocr_raw_text = fields.Text( string='OCR Raw Text', readonly=True, copy=False, help="Raw text extracted by the OCR backend.", ) ocr_extracted_data = fields.Json( string='OCR Extracted Fields', readonly=True, copy=False, help="Structured invoice fields parsed from the OCR text by the LLM.", ) ocr_backend = fields.Char(string='OCR Backend Used', readonly=True, copy=False) ocr_confidence = fields.Float(string='OCR Confidence', readonly=True, copy=False) ocr_log_ids = fields.One2many('fusion.ocr.log', 'move_id', string='OCR Runs') def action_request_ocr(self): """Run OCR on the most recent supported attachment of each move.""" for move in self: if move.move_type not in ('in_invoice', 'in_refund'): raise UserError(_("OCR currently supports vendor bills only.")) attachment = self.env['ir.attachment'].sudo().search( [ ('res_model', '=', 'account.move'), ('res_id', '=', move.id), ('mimetype', 'in', SUPPORTED_MIMETYPES), ], order='create_date desc', limit=1, ) if not attachment: raise UserError( _("No PDF or image attachment found on %s") % (move.name or move.id) ) move._fusion_run_ocr(attachment) return True def _fusion_run_ocr(self, attachment): self.ensure_one() self.ocr_state = 'processing' backend_name = ( self.company_id.fusion_ocr_default_backend if 'fusion_ocr_default_backend' in self.company_id._fields else 'tesseract' ) provider = self._fusion_get_ocr_provider(backend_name) if not provider: self.ocr_state = 'manual' self.message_post( body=_("No OCR backend available; falling back to manual entry.") ) return False try: data = base64.b64decode(attachment.datas) result = provider.extract( data, mimetype=attachment.mimetype or 'application/pdf' ) self.write({ 'ocr_raw_text': result.raw_text, 'ocr_backend': result.backend, 'ocr_confidence': result.confidence, }) self.env['fusion.ocr.log'].sudo().create({ 'move_id': self.id, 'backend': result.backend, 'confidence': result.confidence, 'raw_text_length': len(result.raw_text or ''), 'pages': result.pages, 'error': result.error, }) if not result.raw_text and result.error: self.ocr_state = 'failed' self.message_post(body=_("OCR failed: %s") % result.error) return False parsed = parse_invoice_fields(self.env, result.raw_text) self.ocr_extracted_data = parsed self.ocr_state = 'done' self._fusion_apply_ocr_fields(parsed) self.message_post( body=_("OCR complete: %s confidence %.0f%%") % ( result.backend, (result.confidence or 0) * 100, ) ) return True except Exception as e: _logger.exception("OCR run failed for move %s", self.id) self.ocr_state = 'failed' self.message_post(body=_("OCR error: %s") % e) return False def _fusion_get_ocr_provider(self, backend_name): if backend_name == 'tesseract' and TesseractAdapter.is_available(): return TesseractAdapter() if backend_name == 'manual': return ManualAdapter() # Future adapters (mindee, google_doc_ai, ollama_vision) plug in # here. Fall back to whichever adapter is actually usable. if TesseractAdapter.is_available(): return TesseractAdapter() return ManualAdapter() def _fusion_apply_ocr_fields(self, parsed): """Apply parsed fields to a draft invoice without overwriting any user-entered data. No-op on posted/cancelled invoices.""" if self.state != 'draft': return vals = {} if parsed.get('invoice_date') and not self.invoice_date: try: vals['invoice_date'] = parsed['invoice_date'] except Exception: pass if parsed.get('due_date') and not self.invoice_date_due: try: vals['invoice_date_due'] = parsed['due_date'] except Exception: pass if parsed.get('invoice_number') and not self.ref: vals['ref'] = parsed['invoice_number'] # Vendor: best-effort name match against existing supplier partners. # Never auto-create a partner; AP user confirms ambiguous matches. if parsed.get('vendor_name') and not self.partner_id: partner = self.env['res.partner'].sudo().search( [ ('name', '=ilike', parsed['vendor_name']), ('supplier_rank', '>', 0), ], limit=1, ) if partner: vals['partner_id'] = partner.id if vals: self.write(vals)