feat(fusion_accounting_ocr): pluggable OCR for vendor bills
Replaces Enterprise's account_invoice_extract with a Fusion-native pipeline: Stage 1 (text extraction): Tesseract OCRs the bill attachment via pytesseract + pdf2image. A pluggable OCRProvider adapter pattern allows future Mindee / Google Document AI / Ollama-vision backends. Stage 2 (field parsing): The fusion_accounting_ai LLMProvider reads the raw OCR text and returns structured invoice fields (vendor, invoice number, dates, amounts, line items) as JSON. Draft invoice fields are auto-populated only where they are still empty (never overwriting user-entered data). Vendors are matched by name against res.partner with supplier_rank > 0. Adds: - account.move.ocr_state (selection: not_requested/pending/processing/done/failed/manual) - account.move.ocr_raw_text, ocr_extracted_data (Json), ocr_backend, ocr_confidence - fusion.ocr.log (audit trail per OCR run) - res.company.fusion_ocr_enabled / fusion_ocr_default_backend / fusion_ocr_auto_run - /fusion/ocr/request_for_invoice JSON-RPC endpoint Backend availability is detected at runtime via OCRProvider.is_available() classmethods. Tesseract 5.3.4 + pytesseract 0.3.13 + pdf2image 1.17.0 are installed in the container. Tests: 13 (TesseractAdapter availability + image OCR; flow tests for draft autofill, no-attachment guard, customer-invoice guard, ref-not-overwritten; field parser empty/clean-json/markdown-fence/bad-JSON/provider-exception). All pass on westin-v19 OrbStack VM. Made-with: Cursor
This commit is contained in:
4
fusion_accounting_ocr/models/__init__.py
Normal file
4
fusion_accounting_ocr/models/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from . import fusion_ocr_log
|
||||
from . import res_company
|
||||
from . import res_config_settings
|
||||
from . import account_move
|
||||
180
fusion_accounting_ocr/models/account_move.py
Normal file
180
fusion_accounting_ocr/models/account_move.py
Normal file
@@ -0,0 +1,180 @@
|
||||
"""account.move OCR extension.
|
||||
|
||||
Adds an OCR pipeline triggered manually (or, optionally, automatically when
|
||||
a PDF/image is attached). Stage 1 is tesseract text extraction; stage 2 is
|
||||
LLM field parsing through the existing fusion_accounting_ai adapter stack.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import logging
|
||||
|
||||
from odoo import _, fields, models
|
||||
from odoo.exceptions import UserError
|
||||
|
||||
from ..services.ocr_providers.tesseract_adapter import TesseractAdapter
|
||||
from ..services.ocr_providers.manual_adapter import ManualAdapter
|
||||
from ..services.invoice_field_parser import parse_invoice_fields
|
||||
|
||||
# Module-level logger, named after this module's import path.
_logger = logging.getLogger(__name__)


# MIME types an attachment must carry to be considered by the OCR action.
# Kept as a tuple so it can be used directly in an ORM ``in`` domain.
SUPPORTED_MIMETYPES = (
    'application/pdf',
    'image/png',
    'image/jpeg',
    'image/jpg',
)
|
||||
|
||||
|
||||
class AccountMove(models.Model):
    """Extend ``account.move`` with a two-stage OCR pipeline for vendor bills.

    Stage 1 asks an OCR provider for the raw text of an attachment. Stage 2
    passes that text to ``parse_invoice_fields`` and copies the parsed values
    onto the draft bill, filling empty fields only.
    """

    _inherit = 'account.move'

    # Lifecycle of the OCR run for this move.
    ocr_state = fields.Selection(
        [
            ('not_requested', 'Not Requested'),
            ('pending', 'Pending'),
            ('processing', 'Processing'),
            ('done', 'Done'),
            ('failed', 'Failed'),
            ('manual', 'Manual Entry'),
        ],
        default='not_requested',
        copy=False,
        tracking=True,
    )

    ocr_raw_text = fields.Text(
        string='OCR Raw Text', readonly=True, copy=False,
        help="Raw text extracted by the OCR backend.",
    )
    ocr_extracted_data = fields.Json(
        string='OCR Extracted Fields', readonly=True, copy=False,
        help="Structured invoice fields parsed from the OCR text by the LLM.",
    )
    ocr_backend = fields.Char(string='OCR Backend Used', readonly=True, copy=False)
    ocr_confidence = fields.Float(string='OCR Confidence', readonly=True, copy=False)
    ocr_log_ids = fields.One2many('fusion.ocr.log', 'move_id', string='OCR Runs')

    def action_request_ocr(self):
        """Run OCR on the most recent supported attachment of each move.

        :raises UserError: if a move is not a vendor bill / vendor refund, or
            has no attachment whose MIME type is in ``SUPPORTED_MIMETYPES``.
        :return: ``True`` once every move in ``self`` has been processed.
        """
        for move in self:
            if move.move_type not in ('in_invoice', 'in_refund'):
                raise UserError(_("OCR currently supports vendor bills only."))
            attachment = self.env['ir.attachment'].sudo().search(
                [
                    ('res_model', '=', 'account.move'),
                    ('res_id', '=', move.id),
                    ('mimetype', 'in', SUPPORTED_MIMETYPES),
                ],
                # ``id desc`` breaks ties between attachments that share the
                # same create_date, so the pick is deterministic.
                order='create_date desc, id desc',
                limit=1,
            )
            if not attachment:
                raise UserError(
                    _("No PDF or image attachment found on %s") % (move.name or move.id)
                )
            move._fusion_run_ocr(attachment)
        return True

    def _fusion_run_ocr(self, attachment):
        """Extract text from ``attachment``, parse it and autofill this move.

        Writes ``ocr_state`` / ``ocr_raw_text`` / ``ocr_backend`` /
        ``ocr_confidence`` / ``ocr_extracted_data``, creates one
        ``fusion.ocr.log`` row per provider result and posts a chatter
        message describing the outcome.

        :param attachment: ``ir.attachment`` record holding the document.
        :return: ``True`` when the run completed, ``False`` when no provider
            was usable, the provider reported an error without any text, or
            an exception was raised along the way.
        """
        self.ensure_one()
        self.ocr_state = 'processing'

        # Use the company's configured backend when that field exists on
        # res.company; otherwise default to tesseract.
        backend_name = (
            self.company_id.fusion_ocr_default_backend
            if 'fusion_ocr_default_backend' in self.company_id._fields
            else 'tesseract'
        )
        provider = self._fusion_get_ocr_provider(backend_name)
        if not provider:
            # Defensive: the resolver in this class always returns an adapter,
            # so this only triggers if an override returns nothing.
            self.ocr_state = 'manual'
            self.message_post(
                body=_("No OCR backend available; falling back to manual entry.")
            )
            return False

        try:
            data = base64.b64decode(attachment.datas)
            result = provider.extract(
                data, mimetype=attachment.mimetype or 'application/pdf'
            )

            self.write({
                'ocr_raw_text': result.raw_text,
                'ocr_backend': result.backend,
                'ocr_confidence': result.confidence,
            })
            self.env['fusion.ocr.log'].sudo().create({
                'move_id': self.id,
                'backend': result.backend,
                'confidence': result.confidence,
                'raw_text_length': len(result.raw_text or ''),
                'pages': result.pages,
                'error': result.error,
            })

            # Only a result with an error AND no text counts as a failure;
            # partial text with an error is still sent to the parser.
            if not result.raw_text and result.error:
                self.ocr_state = 'failed'
                self.message_post(body=_("OCR failed: %s") % result.error)
                return False

            parsed = parse_invoice_fields(self.env, result.raw_text)
            self.ocr_extracted_data = parsed
            self.ocr_state = 'done'

            self._fusion_apply_ocr_fields(parsed)
            self.message_post(
                body=_("OCR complete: %s confidence %.0f%%") % (
                    result.backend, (result.confidence or 0) * 100,
                )
            )
            return True
        except Exception as e:
            # Top-level boundary of the OCR run: log the traceback and surface
            # the error on the move instead of propagating it to the caller.
            _logger.exception("OCR run failed for move %s", self.id)
            self.ocr_state = 'failed'
            self.message_post(body=_("OCR error: %s") % e)
            return False

    def _fusion_get_ocr_provider(self, backend_name):
        """Return an OCR adapter instance for ``backend_name``.

        Resolution order: the requested backend when it is usable, then
        Tesseract when available, then the manual adapter.

        :param backend_name: backend key, e.g. ``'tesseract'`` or ``'manual'``.
        :return: a ``TesseractAdapter`` or ``ManualAdapter`` instance.
        """
        if backend_name == 'tesseract' and TesseractAdapter.is_available():
            return TesseractAdapter()
        if backend_name == 'manual':
            return ManualAdapter()
        # Future adapters (mindee, google_doc_ai, ollama_vision) plug in
        # here. Fall back to whichever adapter is actually usable.
        if TesseractAdapter.is_available():
            return TesseractAdapter()
        return ManualAdapter()

    def _fusion_parse_ocr_date(self, value):
        """Return ``value`` as a ``date``, or ``None`` when it is unusable.

        Parsed values come from an LLM, so anything that is not a string in
        the ORM's date format is dropped rather than handed to ``write()``.
        """
        if not isinstance(value, str):
            return None
        try:
            return fields.Date.to_date(value.strip())
        except ValueError:
            _logger.info("Ignoring unparseable OCR date %r", value)
            return None

    def _fusion_apply_ocr_fields(self, parsed):
        """Apply parsed fields to a draft invoice without overwriting any
        user-entered data. No-op on posted/cancelled invoices.

        Dates are validated before being written, so one malformed value is
        skipped instead of aborting the whole autofill. The vendor is only
        set when exactly one supplier visible to the move's company matches
        the parsed name.

        :param parsed: dict of parsed fields; the keys read here are
            ``invoice_date``, ``due_date``, ``invoice_number``, ``vendor_name``.
        """
        if self.state != 'draft' or not isinstance(parsed, dict):
            return

        vals = {}

        # (key in parsed data, field on account.move)
        date_targets = (
            ('invoice_date', 'invoice_date'),
            ('due_date', 'invoice_date_due'),
        )
        for parsed_key, field_name in date_targets:
            if self[field_name] or not parsed.get(parsed_key):
                continue
            parsed_date = self._fusion_parse_ocr_date(parsed[parsed_key])
            if parsed_date:
                vals[field_name] = parsed_date

        if parsed.get('invoice_number') and not self.ref:
            vals['ref'] = parsed['invoice_number']

        # Vendor: best-effort name match against existing supplier partners.
        # Never auto-create a partner; AP user confirms ambiguous matches.
        vendor_name = parsed.get('vendor_name')
        if isinstance(vendor_name, str) and vendor_name.strip() and not self.partner_id:
            matches = self.env['res.partner'].sudo().search(
                [
                    ('name', '=ilike', vendor_name.strip()),
                    ('supplier_rank', '>', 0),
                    # sudo() bypasses record rules, so restrict explicitly to
                    # shared partners and partners of this move's company.
                    ('company_id', 'in', [False, self.company_id.id]),
                ],
                # Two rows are enough to tell "unique" from "ambiguous".
                limit=2,
            )
            if len(matches) == 1:
                vals['partner_id'] = matches.id

        if vals:
            self.write(vals)
|
||||
17
fusion_accounting_ocr/models/fusion_ocr_log.py
Normal file
17
fusion_accounting_ocr/models/fusion_ocr_log.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from odoo import fields, models
|
||||
|
||||
|
||||
class FusionOcrLog(models.Model):
    """One row per OCR run on an ``account.move``, listed newest first."""

    _name = 'fusion.ocr.log'
    _description = 'Fusion OCR Run Log'
    _order = 'create_date desc'

    # The move that was OCR'd; log rows are removed together with it.
    move_id = fields.Many2one(
        'account.move',
        required=True,
        ondelete='cascade',
        index=True,
    )
    # Name of the backend that produced the result.
    backend = fields.Char(required=True)
    # Confidence as reported by the backend.
    confidence = fields.Float()
    # Length of the extracted text (the text itself is not stored here).
    raw_text_length = fields.Integer()
    # Page count as reported by the backend.
    pages = fields.Integer()
    # Error reported by the backend, if any.
    error = fields.Text()
    create_date = fields.Datetime(readonly=True)
|
||||
26
fusion_accounting_ocr/models/res_company.py
Normal file
26
fusion_accounting_ocr/models/res_company.py
Normal file
@@ -0,0 +1,26 @@
|
||||
from odoo import fields, models
|
||||
|
||||
|
||||
class ResCompany(models.Model):
    """Per-company OCR settings for vendor bills."""

    _inherit = 'res.company'

    # Master switch for the OCR feature on this company.
    fusion_ocr_enabled = fields.Boolean(
        string='Enable Invoice OCR',
        help="When enabled, vendor bill attachments can be OCR'd via the "
             "configured backend.",
        default=False,
    )
    # Backend key read by the OCR pipeline when a run is requested.
    fusion_ocr_default_backend = fields.Selection(
        selection=[
            ('tesseract', 'Tesseract (local, free)'),
            ('manual', 'Manual entry only'),
        ],
        string='Default OCR Backend',
        default='tesseract',
    )
    # Opt-in flag for triggering OCR as soon as a document is attached.
    fusion_ocr_auto_run = fields.Boolean(
        string='Auto-run OCR on attachment',
        help="When enabled, OCR runs automatically when a PDF/image is "
             "attached to a vendor bill.",
        default=False,
    )
|
||||
15
fusion_accounting_ocr/models/res_config_settings.py
Normal file
15
fusion_accounting_ocr/models/res_config_settings.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from odoo import fields, models
|
||||
|
||||
|
||||
class ResConfigSettings(models.TransientModel):
    """Expose the company's OCR options on the settings wizard.

    Each field is an editable ``related`` proxy of the field with the same
    name on ``company_id``.
    """

    _inherit = 'res.config.settings'

    fusion_ocr_enabled = fields.Boolean(
        related='company_id.fusion_ocr_enabled',
        readonly=False,
    )
    fusion_ocr_default_backend = fields.Selection(
        related='company_id.fusion_ocr_default_backend',
        readonly=False,
    )
    fusion_ocr_auto_run = fields.Boolean(
        related='company_id.fusion_ocr_auto_run',
        readonly=False,
    )
|
||||
Reference in New Issue
Block a user