feat(fusion_accounting_ocr): pluggable OCR for vendor bills

Replaces Enterprise's account_invoice_extract with a Fusion-native pipeline. Stage 1 (text extraction): Tesseract OCRs the bill attachment via pytesseract + pdf2image. A pluggable OCRProvider adapter pattern allows future Mindee / Google Document AI / Ollama-vision backends. Stage 2 (field parsing): the fusion_accounting_ai LLMProvider reads the raw OCR text and returns structured invoice fields (vendor, invoice number, dates, amounts, line items) as JSON. Draft invoice fields are auto-populated only where they are still empty (never overwriting user-entered data). Vendors are matched by name against res.partner with supplier_rank > 0. Adds: - account.move.ocr_state (selection: not_requested/pending/processing/done/failed/manual) - account.move.ocr_raw_text, ocr_extracted_data (Json), ocr_backend, ocr_confidence - fusion.ocr.log (audit trail per OCR run) - res.company.fusion_ocr_enabled / fusion_ocr_default_backend / fusion_ocr_auto_run - /fusion/ocr/request_for_invoice JSON-RPC endpoint. Backend availability is detected at runtime via OCRProvider.is_available() classmethods. Tesseract 5.3.4 + pytesseract 0.3.13 + pdf2image 1.17.0 are installed in the container. Tests: 13 (TesseractAdapter availability + image OCR; flow tests for draft autofill, no-attachment guard, customer-invoice guard, ref-not-overwritten; field parser empty/clean-json/markdown-fence/bad-JSON/provider-exception). All pass on westin-v19 OrbStack VM. Made-with: Cursor
2026-04-20 00:32:50 -04:00
parent a730942d24
commit 125f48377a
24 changed files with 952 additions and 0 deletions
--- a/fusion_accounting_ocr/init.py
+++ b/fusion_accounting_ocr/init.py
@@ -0,0 +1,2 @@
 from . import models
 from . import controllers
--- a/fusion_accounting_ocr/manifest.py
+++ b/fusion_accounting_ocr/manifest.py
@@ -0,0 +1,39 @@
 # Odoo module manifest for the fusion_accounting_ocr addon.
 {
    'name': 'Fusion Accounting — Invoice OCR',
    'version': '19.0.1.0.0',
    'category': 'Accounting/Accounting',
    'summary': 'OCR for vendor bills via tesseract + LLM-driven field extraction.',
    'description': """
 Fusion Accounting — Invoice OCR
 ================================
 Replaces Enterprise's account_invoice_extract with a Fusion-native pipeline:
 1. Tesseract OCRs the bill attachment (PDF or image) into raw text
 2. The fusion_accounting_ai LLMProvider parses the raw text into structured
   fields (vendor, invoice number, dates, amounts, line items)
 3. Draft invoice fields are populated for the AP user to confirm
 Pluggable backend architecture: future Mindee, Google Document AI, or
 Ollama-vision adapters can be dropped in alongside the default tesseract
 adapter.
 """,
    'icon': '/fusion_accounting_ocr/static/description/icon.png',
    'author': 'Westin / Fusion Suite',
    # Addons that must be installed first; fusion_accounting_ai supplies the
    # LLM adapters used for stage-2 field parsing.
    'depends': [
        'fusion_accounting_core',
        'fusion_accounting_ai',
        'account',
    ],
    # Python packages required at install time.
    # NOTE(review): 'PIL' is the import name; the PyPI distribution is
    # 'Pillow' - confirm which form the target Odoo version expects here.
    'external_dependencies': {
        'python': ['pytesseract', 'pdf2image', 'PIL'],
    },
    # Data files loaded on install/upgrade, in this order.
    'data': [
        'security/ir.model.access.csv',
        'views/account_move_views.xml',
        'views/res_config_settings_views.xml',
    ],
    'auto_install': False,
    'installable': True,
    'application': False,
    'license': 'LGPL-3',
 }
--- a/fusion_accounting_ocr/controllers/init.py
+++ b/fusion_accounting_ocr/controllers/init.py
@@ -0,0 +1 @@
 from . import ocr_controller
--- a/fusion_accounting_ocr/controllers/ocr_controller.py
+++ b/fusion_accounting_ocr/controllers/ocr_controller.py
@@ -0,0 +1,21 @@
 from odoo import http
 from odoo.http import request
 class FusionOcrController(http.Controller):
    """JSON-RPC entry points for the Fusion invoice OCR pipeline."""

    @http.route('/fusion/ocr/request_for_invoice', type='jsonrpc', auth='user')
    def request_for_invoice(self, move_id):
        """Run OCR on one journal entry and report the outcome.

        :param move_id: id of the ``account.move`` to process (anything
            ``int()`` accepts).
        :return: ``{'status': 'ok', ...}`` with the OCR state, backend,
            confidence and extracted fields, or
            ``{'status': 'error', 'message': ...}`` when the request is
            rejected with a ``UserError`` (e.g. not a vendor bill, no
            attachment).
        """
        # Local import: keeps this block self-contained without touching the
        # module-level import list.
        from odoo.exceptions import UserError

        move = request.env['account.move'].browse(int(move_id))
        # Access problems are raised (not swallowed) so the caller gets the
        # regular access-denied response.
        move.check_access('write')
        try:
            move.action_request_ocr()
        except UserError as e:
            # Only expected, user-facing failures are turned into an error
            # payload. Anything else propagates, so it is not reported as a
            # normal response and its message is not exposed to the client.
            return {'status': 'error', 'message': str(e)}
        return {
            'status': 'ok',
            'state': move.ocr_state,
            'backend': move.ocr_backend,
            'confidence': move.ocr_confidence,
            'extracted': move.ocr_extracted_data,
        }
--- a/fusion_accounting_ocr/models/init.py
+++ b/fusion_accounting_ocr/models/init.py
@@ -0,0 +1,4 @@
 from . import fusion_ocr_log
 from . import res_company
 from . import res_config_settings
 from . import account_move
--- a/fusion_accounting_ocr/models/account_move.py
+++ b/fusion_accounting_ocr/models/account_move.py
@@ -0,0 +1,180 @@
 """account.move OCR extension.
 Adds an OCR pipeline triggered manually (or, optionally, automatically when
 a PDF/image is attached). Stage 1 is tesseract text extraction; stage 2 is
 LLM field parsing through the existing fusion_accounting_ai adapter stack.
 """
 import base64
 import logging
 from odoo import _, fields, models
 from odoo.exceptions import UserError
 from ..services.ocr_providers.tesseract_adapter import TesseractAdapter
 from ..services.ocr_providers.manual_adapter import ManualAdapter
 from ..services.invoice_field_parser import parse_invoice_fields
 # Module-level logger, named after this module.
 _logger = logging.getLogger(__name__)
 # Attachment MIME types that action_request_ocr() searches for when picking
 # the document to OCR.
 SUPPORTED_MIMETYPES = (
    'application/pdf', 'image/png', 'image/jpeg', 'image/jpg',
 )
 class AccountMove(models.Model):
    """Extend account.move with OCR state, OCR results and the OCR pipeline.

    Stage 1 extracts raw text from an attachment through an OCR provider;
    stage 2 passes that text to ``parse_invoice_fields`` and copies the
    parsed values onto still-empty fields of a draft vendor bill.
    """

    _inherit = 'account.move'

    ocr_state = fields.Selection(
        [
            ('not_requested', 'Not Requested'),
            ('pending', 'Pending'),
            ('processing', 'Processing'),
            ('done', 'Done'),
            ('failed', 'Failed'),
            ('manual', 'Manual Entry'),
        ],
        default='not_requested',
        copy=False,
        tracking=True,
    )
    ocr_raw_text = fields.Text(
        string='OCR Raw Text', readonly=True, copy=False,
        help="Raw text extracted by the OCR backend.",
    )
    ocr_extracted_data = fields.Json(
        string='OCR Extracted Fields', readonly=True, copy=False,
        help="Structured invoice fields parsed from the OCR text by the LLM.",
    )
    ocr_backend = fields.Char(string='OCR Backend Used', readonly=True, copy=False)
    ocr_confidence = fields.Float(string='OCR Confidence', readonly=True, copy=False)
    ocr_log_ids = fields.One2many('fusion.ocr.log', 'move_id', string='OCR Runs')

    def action_request_ocr(self):
        """Run OCR on the most recent supported attachment of each move.

        :raises UserError: when a move is not a vendor bill/refund, or has
            no attachment whose MIME type is in ``SUPPORTED_MIMETYPES``.
        :return: True
        """
        for move in self:
            if move.move_type not in ('in_invoice', 'in_refund'):
                raise UserError(_("OCR currently supports vendor bills only."))
            attachment = self.env['ir.attachment'].sudo().search(
                [
                    ('res_model', '=', 'account.move'),
                    ('res_id', '=', move.id),
                    ('mimetype', 'in', SUPPORTED_MIMETYPES),
                ],
                # ``id`` breaks ties between attachments that share the same
                # create_date, so "most recent" is deterministic.
                order='create_date desc, id desc',
                limit=1,
            )
            if not attachment:
                raise UserError(
                    _("No PDF or image attachment found on %s") % (move.name or move.id)
                )
            move._fusion_run_ocr(attachment)
        return True

    def _fusion_run_ocr(self, attachment):
        """Run both OCR stages for this move on ``attachment``.

        Updates ``ocr_state`` and the other ``ocr_*`` fields, creates a
        ``fusion.ocr.log`` row for every extraction attempt and posts a
        chatter message describing the outcome. Errors are logged and turn
        the state to ``failed`` instead of being raised.

        :param attachment: ``ir.attachment`` record holding the document.
        :return: True when text was extracted and parsed, False otherwise
            (manual fallback, OCR failure or unexpected error).
        """
        self.ensure_one()
        self.ocr_state = 'processing'
        backend_name = (
            self.company_id.fusion_ocr_default_backend
            if 'fusion_ocr_default_backend' in self.company_id._fields
            else 'tesseract'
        )
        provider = self._fusion_get_ocr_provider(backend_name)
        # _fusion_get_ocr_provider() answers with a ManualAdapter rather than
        # None when nothing usable is available, so the manual adapter has to
        # be handled here; otherwise its empty result would be recorded as a
        # completed OCR run.
        if not provider or isinstance(provider, ManualAdapter):
            self.ocr_state = 'manual'
            self.message_post(
                body=_("No OCR backend available; falling back to manual entry.")
            )
            return False
        try:
            data = base64.b64decode(attachment.datas)
            result = provider.extract(
                data, mimetype=attachment.mimetype or 'application/pdf'
            )
            self.write({
                'ocr_raw_text': result.raw_text,
                'ocr_backend': result.backend,
                'ocr_confidence': result.confidence,
            })
            # Audit trail: one log row per extraction attempt, successful or not.
            self.env['fusion.ocr.log'].sudo().create({
                'move_id': self.id,
                'backend': result.backend,
                'confidence': result.confidence,
                'raw_text_length': len(result.raw_text or ''),
                'pages': result.pages,
                'error': result.error,
            })
            if not result.raw_text and result.error:
                self.ocr_state = 'failed'
                self.message_post(body=_("OCR failed: %s") % result.error)
                return False
            parsed = parse_invoice_fields(self.env, result.raw_text)
            self.ocr_extracted_data = parsed
            self.ocr_state = 'done'
            self._fusion_apply_ocr_fields(parsed)
            self.message_post(
                body=_("OCR complete: %s confidence %.0f%%") % (
                    result.backend, (result.confidence or 0) * 100,
                )
            )
            return True
        except Exception as e:
            # Best-effort pipeline: an OCR problem must never block the user
            # from working on the bill, so record the failure and carry on.
            _logger.exception("OCR run failed for move %s", self.id)
            self.ocr_state = 'failed'
            self.message_post(body=_("OCR error: %s") % e)
            return False

    def _fusion_get_ocr_provider(self, backend_name):
        """Return an OCR adapter instance for ``backend_name``.

        ``'manual'`` always yields a ``ManualAdapter``. Every other value
        (including unknown names) yields a ``TesseractAdapter`` when
        tesseract is usable, and a ``ManualAdapter`` otherwise.
        """
        if backend_name == 'manual':
            return ManualAdapter()
        # Future adapters (mindee, google_doc_ai, ollama_vision) plug in
        # here. Fall back to whichever adapter is actually usable.
        if TesseractAdapter.is_available():
            return TesseractAdapter()
        return ManualAdapter()

    @staticmethod
    def _fusion_parse_ocr_date(value):
        """Return ``value`` as a date, or None when it is not a usable date string."""
        if not isinstance(value, str):
            return None
        try:
            return fields.Date.to_date(value.strip())
        except ValueError:
            return None

    def _fusion_apply_ocr_fields(self, parsed):
        """Apply parsed fields to a draft invoice without overwriting any
        user-entered data. No-op on posted/cancelled invoices.

        :param parsed: dict produced by ``parse_invoice_fields``; only
            ``invoice_date``, ``due_date``, ``invoice_number`` and
            ``vendor_name`` are used here.
        """
        if self.state != 'draft' or not parsed:
            return
        vals = {}
        # Dates come from an LLM: validate them here so a malformed value is
        # skipped instead of making the whole write() fail.
        invoice_date = self._fusion_parse_ocr_date(parsed.get('invoice_date'))
        if invoice_date and not self.invoice_date:
            vals['invoice_date'] = invoice_date
        due_date = self._fusion_parse_ocr_date(parsed.get('due_date'))
        if due_date and not self.invoice_date_due:
            vals['invoice_date_due'] = due_date
        invoice_number = parsed.get('invoice_number')
        if invoice_number and not self.ref:
            # The parser may hand back a bare number; ref is a text field.
            vals['ref'] = str(invoice_number).strip()
        # Vendor: best-effort name match against existing supplier partners.
        # Never auto-create a partner; AP user confirms ambiguous matches.
        vendor_name = parsed.get('vendor_name')
        if isinstance(vendor_name, str) and vendor_name.strip() and not self.partner_id:
            partners = self.env['res.partner'].sudo().search(
                [
                    ('name', '=ilike', vendor_name.strip()),
                    ('supplier_rank', '>', 0),
                ],
                # Two rows are enough to tell a unique match from an
                # ambiguous one.
                limit=2,
            )
            if len(partners) == 1:
                vals['partner_id'] = partners.id
        if vals:
            self.write(vals)
--- a/fusion_accounting_ocr/models/fusion_ocr_log.py
+++ b/fusion_accounting_ocr/models/fusion_ocr_log.py
@@ -0,0 +1,17 @@
 from odoo import fields, models
 class FusionOcrLog(models.Model):
    """Log entry describing one OCR run on a journal entry."""

    _name = 'fusion.ocr.log'
    _description = 'Fusion OCR Run Log'
    # Newest entries first.
    _order = 'create_date desc'

    # Journal entry the run belongs to; log rows are removed with it.
    move_id = fields.Many2one(
        comodel_name='account.move',
        index=True,
        ondelete='cascade',
        required=True,
    )
    # Name of the OCR backend that produced this run.
    backend = fields.Char(required=True)
    confidence = fields.Float()
    # Length of the extracted text, not the text itself.
    raw_text_length = fields.Integer()
    pages = fields.Integer()
    # Error message reported by the backend, if any.
    error = fields.Text()
    create_date = fields.Datetime(readonly=True)
--- a/fusion_accounting_ocr/models/res_company.py
+++ b/fusion_accounting_ocr/models/res_company.py
@@ -0,0 +1,26 @@
 from odoo import fields, models
 class ResCompany(models.Model):
    """Company-level switches for the invoice OCR feature."""

    _inherit = 'res.company'

    fusion_ocr_enabled = fields.Boolean(
        default=False,
        string='Enable Invoice OCR',
        help=(
            "When enabled, vendor bill attachments can be OCR'd "
            "via the configured backend."
        ),
    )
    fusion_ocr_default_backend = fields.Selection(
        selection=[
            ('tesseract', 'Tesseract (local, free)'),
            ('manual', 'Manual entry only'),
        ],
        string='Default OCR Backend',
        default='tesseract',
    )
    fusion_ocr_auto_run = fields.Boolean(
        default=False,
        string='Auto-run OCR on attachment',
        help=(
            "When enabled, OCR runs automatically when a PDF/image "
            "is attached to a vendor bill."
        ),
    )
--- a/fusion_accounting_ocr/models/res_config_settings.py
+++ b/fusion_accounting_ocr/models/res_config_settings.py
@@ -0,0 +1,15 @@
 from odoo import fields, models
 class ResConfigSettings(models.TransientModel):
    """Editable related fields mirroring the company-level OCR options."""

    _inherit = 'res.config.settings'

    fusion_ocr_enabled = fields.Boolean(
        readonly=False,
        related='company_id.fusion_ocr_enabled',
    )
    fusion_ocr_default_backend = fields.Selection(
        readonly=False,
        related='company_id.fusion_ocr_default_backend',
    )
    fusion_ocr_auto_run = fields.Boolean(
        readonly=False,
        related='company_id.fusion_ocr_auto_run',
    )
--- a/fusion_accounting_ocr/security/ir.model.access.csv
+++ b/fusion_accounting_ocr/security/ir.model.access.csv
@@ -0,0 +1,3 @@
 id,name,model_id:id,group_id:id,perm_read,perm_write,perm_create,perm_unlink
 access_fusion_ocr_log_user,fusion.ocr.log.user,model_fusion_ocr_log,base.group_user,1,0,0,0
 access_fusion_ocr_log_manager,fusion.ocr.log.manager,model_fusion_ocr_log,account.group_account_manager,1,1,1,1
--- a/fusion_accounting_ocr/services/init.py
+++ b/fusion_accounting_ocr/services/init.py
@@ -0,0 +1,3 @@
 from . import ocr_providers
 from . import attachment_to_image
 from . import invoice_field_parser
--- a/fusion_accounting_ocr/services/attachment_to_image.py
+++ b/fusion_accounting_ocr/services/attachment_to_image.py
@@ -0,0 +1,43 @@
 """Helper: turn an ir.attachment into a list of PIL.Image pages.
 Kept separate from the adapters so future backends (Ollama-vision, Mindee)
 that want PIL images directly don't have to re-implement the PDF rendering.
 """
 import base64
 import io
 import logging
 _logger = logging.getLogger(__name__)
 def attachment_to_pages(attachment):
    """Render an ir.attachment as a list of PIL.Image pages.

    A PDF (by MIME type or by its ``%PDF`` signature) is rasterised page by
    page at 200 dpi; anything else is opened as a single image. Every
    failure - missing libraries, empty attachment, undecodable or
    unrenderable data - is logged as a warning (where applicable) and
    reported as ``[]``.
    """
    # The imaging libraries are optional at import time: resolve them lazily.
    try:
        from PIL import Image
        from pdf2image import convert_from_bytes
    except ImportError as e:
        _logger.warning("attachment_to_pages requires PIL + pdf2image: %s", e)
        return []

    if not (attachment and attachment.datas):
        return []

    try:
        payload = base64.b64decode(attachment.datas)
    except Exception as e:
        _logger.warning("Could not decode attachment %s: %s", attachment.id, e)
        return []

    declared_type = attachment.mimetype or ''
    looks_like_pdf = declared_type == 'application/pdf' or payload[:4] == b'%PDF'
    try:
        if not looks_like_pdf:
            return [Image.open(io.BytesIO(payload))]
        return convert_from_bytes(payload, dpi=200)
    except Exception as e:
        _logger.warning("Could not render attachment %s: %s", attachment.id, e)
        return []
--- a/fusion_accounting_ocr/services/invoice_field_parser.py
+++ b/fusion_accounting_ocr/services/invoice_field_parser.py
@@ -0,0 +1,150 @@
 """Stage-2 of the OCR pipeline: parse raw OCR text into structured invoice
 fields via the configured LLM provider.
 Mirrors the pattern in fusion_accounting_followup/services/followup_text_generator.py:
 look up an adapter by ir.config_parameter, fall back gracefully when no
 provider is configured, and never let an LLM hiccup nuke the OCR result.
 """
 import json
 import logging
 _logger = logging.getLogger(__name__)
 # System message for the field-extraction LLM call: fixes the assistant's
 # role and the "strict JSON, null when unknown" output contract.
 SYSTEM_PROMPT = (
    "You are an invoice field extraction assistant. You read raw OCR text "
    "from vendor bills and return a strict JSON object with the requested "
    "fields. You never include commentary or markdown fences. When a field "
    "cannot be determined from the text you return null for that field."
 )
 # User message template. It is a str.format() template: ``{text}`` is the
 # placeholder for the OCR text, and the literal braces of the JSON schema are
 # doubled (``{{`` / ``}}``) so that format() leaves them in place.
 USER_PROMPT = """Given the raw OCR text of a vendor bill, return a JSON object
 with these fields (use null when unclear):
 {{
  "vendor_name": <string, the seller/vendor company name>,
  "invoice_number": <string, the bill or invoice reference number>,
  "invoice_date": <string, ISO format YYYY-MM-DD>,
  "due_date": <string or null, ISO format YYYY-MM-DD>,
  "currency": <string, ISO 4217 code like CAD/USD/EUR>,
  "subtotal": <number or null>,
  "tax_total": <number or null>,
  "total": <number, the grand total amount due>,
  "line_items": [
    {{"description": <string>, "quantity": <number or null>,
      "unit_price": <number or null>, "amount": <number or null>}}
  ]
 }}
 Return ONLY valid JSON, no commentary, no markdown fences.
 Raw OCR text:
 ---
 {text}
 ---
 """
 def parse_invoice_fields(env, raw_text: str, *, provider=None) -> dict:
    """Use the configured LLM provider to extract structured invoice fields.

    :param env: Odoo environment, used to look up the provider when none is
        passed in.
    :param raw_text: raw OCR text of the bill; only the first 12000
        characters are sent to the provider.
    :param provider: optional adapter exposing ``complete(system=...,
        messages=..., max_tokens=..., temperature=...)``.
    :return: a dict with the keys of ``_empty_result()``. On any failure
        (no provider, unusable reply, bad JSON, network error, etc.) the
        all-null result is returned so the OCR raw text is still preserved
        for the AP user. This function never raises.
    """
    if not raw_text or not raw_text.strip():
        return _empty_result()
    if provider is None:
        provider = _get_provider(env)
    if provider is None:
        _logger.info(
            "No LLM provider configured for OCR field parsing; "
            "raw OCR text preserved, fields left empty."
        )
        return _empty_result()
    try:
        response = provider.complete(
            system=SYSTEM_PROMPT,
            messages=[{
                'role': 'user',
                'content': USER_PROMPT.format(text=raw_text[:12000]),
            }],
            max_tokens=1000,
            temperature=0.1,
        )
        content = response.get('content') if isinstance(response, dict) else response
        if not content:
            return _empty_result()
        if not isinstance(content, str):
            _logger.warning(
                "OCR field parsing got a non-text LLM reply (%s)",
                type(content).__name__,
            )
            return _empty_result()
        content = content.strip()
        # LLMs sometimes wrap JSON in ```json ... ``` despite instructions.
        if content.startswith('```'):
            # Keep what sits between the opening fence and the next fence
            # (or the end of the reply when the closing fence is missing).
            content = content.split('```', 2)[1]
            if content[:4].lower() == 'json':
                content = content[4:]
            content = content.strip()
        # ... or surround the object with a sentence of commentary.
        if not content.startswith('{'):
            start, end = content.find('{'), content.rfind('}')
            if -1 < start < end:
                content = content[start:end + 1]
        parsed = json.loads(content)
        if not isinstance(parsed, dict):
            _logger.warning(
                "LLM returned JSON of type %s instead of an object for OCR "
                "field parsing", type(parsed).__name__,
            )
            return _empty_result()
        # Start from the canonical schema so both code paths always return
        # exactly the same keys.
        result = _empty_result()
        for key in result:
            if key != 'line_items':
                result[key] = parsed.get(key)
        line_items = parsed.get('line_items')
        if isinstance(line_items, list):
            result['line_items'] = line_items
        return result
    except json.JSONDecodeError as e:
        _logger.warning("LLM returned non-JSON for OCR field parsing: %s", e)
        return _empty_result()
    except Exception as e:
        # Deliberate catch-all: an LLM hiccup must never break the OCR run.
        _logger.warning("OCR field parsing failed: %s", e)
        return _empty_result()
 def _empty_result():
    """Return a fresh all-null result: every scalar field None, no line items."""
    blank = dict.fromkeys((
        'vendor_name',
        'invoice_number',
        'invoice_date',
        'due_date',
        'currency',
        'subtotal',
        'tax_total',
        'total',
    ))
    # A new list on every call so callers never share line items.
    blank['line_items'] = []
    return blank
 def _get_provider(env):
    """Look up the LLM adapter via ir.config_parameter.

    Honours a feature-specific override
    (``fusion_accounting.provider.ocr_field_parsing``) before falling back
    to the suite-wide default (``fusion_accounting.provider.default``).
    Names starting with ``openai`` or ``claude`` are recognised.

    :param env: Odoo environment.
    :return: an adapter instance, or None when no adapter is configured,
        the name is not recognised, or the adapter cannot be imported or
        instantiated.
    """
    param = env['ir.config_parameter'].sudo()
    name = param.get_param('fusion_accounting.provider.ocr_field_parsing')
    if not name:
        name = param.get_param('fusion_accounting.provider.default')
    if not name:
        return None
    try:
        # Import only the adapter that was asked for, so a problem with one
        # adapter module cannot disable the other.
        if name.startswith('openai'):
            from odoo.addons.fusion_accounting_ai.services.adapters.openai_adapter import OpenAIAdapter
            return OpenAIAdapter(env)
        if name.startswith('claude'):
            from odoo.addons.fusion_accounting_ai.services.adapters.claude import ClaudeAdapter
            return ClaudeAdapter(env)
    except ImportError as e:
        _logger.warning("OCR field parser could not import adapter %s: %s", name, e)
        return None
    except Exception as e:
        _logger.warning("OCR field parser could not instantiate %s: %s", name, e)
        return None
    _logger.warning("OCR field parser does not know LLM provider %s", name)
    return None
--- a/fusion_accounting_ocr/services/ocr_providers/init.py
+++ b/fusion_accounting_ocr/services/ocr_providers/init.py
@@ -0,0 +1,3 @@
 from . import base
 from . import tesseract_adapter
 from . import manual_adapter
--- a/fusion_accounting_ocr/services/ocr_providers/base.py
+++ b/fusion_accounting_ocr/services/ocr_providers/base.py
@@ -0,0 +1,40 @@
 """OCRProvider contract - every backend must conform.
 Mirrors the LLMProvider pattern in fusion_accounting_ai. Future adapters
 (Mindee, Google Document AI, Ollama-vision) drop in alongside the default
 tesseract adapter without touching account.move.
 """
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
@dataclass
class OCRResult:
    """Outcome of one OCR extraction, as returned by OCRProvider.extract()."""

    # Extracted text; empty when nothing was read.
    raw_text: str = ""
    # Score in the 0.0–1.0 range.
    confidence: float = 0.0
    # Number of pages processed.
    pages: int = 0
    # Name of the backend that produced the result.
    backend: str = ""
    # Error message; empty when the run succeeded.
    error: str = ""
    # Free-form extra data; a fresh dict per instance.
    metadata: dict = field(default_factory=dict)
 class OCRProvider(ABC):
    """Base class for OCR backends; concrete adapters implement extract()."""

    # Short identifier of the backend.
    name: str = 'base'

    @classmethod
    def is_available(cls) -> bool:
        """Tell whether the backend's runtime dependencies are present.

        The base implementation always answers True.
        """
        return True

    @abstractmethod
    def extract(self, image_or_pdf_bytes: bytes, *, mimetype: str = 'application/pdf') -> OCRResult:
        """Extract text from the raw bytes of a document.

        ``mimetype`` is a hint on whether the bytes should be rendered as a
        PDF (poppler) or decoded as an image (PIL). Implementations should
        still look at the byte signature for safety.
        """
--- a/fusion_accounting_ocr/services/ocr_providers/manual_adapter.py
+++ b/fusion_accounting_ocr/services/ocr_providers/manual_adapter.py
@@ -0,0 +1,13 @@
 """Manual fallback adapter - no real OCR, just marks the document as
 'awaiting manual entry'. Used when no real OCR backend is available
 or when the user explicitly disables OCR.
 """
 from .base import OCRProvider, OCRResult
 class ManualAdapter(OCRProvider):
    """Placeholder backend that performs no OCR at all."""

    name = 'manual'

    def extract(self, image_or_pdf_bytes, *, mimetype='application/pdf'):
        """Ignore the input and return an empty result tagged 'manual'."""
        return OCRResult(
            backend='manual',
            raw_text='',
            confidence=0.0,
            pages=0,
        )
--- a/fusion_accounting_ocr/services/ocr_providers/tesseract_adapter.py
+++ b/fusion_accounting_ocr/services/ocr_providers/tesseract_adapter.py
@@ -0,0 +1,71 @@
 """Tesseract OCR adapter.
 Uses the system tesseract binary via pytesseract, with poppler-backed
 PDF rendering via pdf2image. Inside the container these are pre-installed:
 - tesseract-ocr 5.3.4
 - pytesseract 0.3.13
 - pdf2image 1.17.0
 - poppler-utils
 """
 import io
 import logging
 from .base import OCRProvider, OCRResult
 _logger = logging.getLogger(__name__)
 class TesseractAdapter(OCRProvider):
    """OCR backend that runs tesseract through pytesseract."""

    name = 'tesseract'

    @classmethod
    def is_available(cls) -> bool:
        """Return True when pytesseract, pdf2image and PIL import and
        pytesseract can report a tesseract version."""
        try:
            import pytesseract
            from pdf2image import convert_from_bytes  # noqa: F401
            from PIL import Image  # noqa: F401
            pytesseract.get_tesseract_version()
            return True
        except Exception as e:
            _logger.debug("TesseractAdapter not available: %s", e)
            return False

    def extract(self, image_or_pdf_bytes, *, mimetype='application/pdf'):
        """OCR a PDF or image and return the text of all its pages.

        :param image_or_pdf_bytes: raw document bytes.
        :param mimetype: hint for PDF vs image handling; bytes starting with
            ``%PDF`` are treated as a PDF whatever the hint says.
        :return: ``OCRResult`` with the page texts joined by a form feed
            between newlines. Failures of any kind - including missing
            dependencies - are reported through ``OCRResult.error`` rather
            than raised.
        """
        if not image_or_pdf_bytes:
            return OCRResult(
                raw_text='', confidence=0.0, pages=0,
                backend='tesseract', error='No document data to OCR.',
            )
        try:
            # Imported inside the try so that a missing dependency is
            # reported like any other OCR failure.
            import pytesseract
            from pdf2image import convert_from_bytes
            from PIL import Image

            is_pdf = (
                mimetype == 'application/pdf'
                or (image_or_pdf_bytes[:4] == b'%PDF')
            )
            if is_pdf:
                pages = convert_from_bytes(image_or_pdf_bytes, dpi=200)
            else:
                pages = [Image.open(io.BytesIO(image_or_pdf_bytes))]
            full_text = '\n\f\n'.join(
                pytesseract.image_to_string(page) for page in pages
            )
            # Heuristic confidence - tesseract has a per-word conf in
            # image_to_data, but a length proxy is fine for routing
            # decisions. Future: use pytesseract.image_to_data for a real
            # average word-level confidence.
            conf = min(1.0, len(full_text) / 1000.0)
            return OCRResult(
                raw_text=full_text,
                confidence=conf,
                pages=len(pages),
                backend='tesseract',
            )
        except Exception as e:
            _logger.warning("Tesseract OCR failed: %s", e)
            return OCRResult(
                raw_text='', confidence=0.0, pages=0,
                backend='tesseract', error=str(e),
            )
--- a/fusion_accounting_ocr/static/description/icon.png
+++ b/fusion_accounting_ocr/static/description/icon.png
--- a/fusion_accounting_ocr/tests/init.py
+++ b/fusion_accounting_ocr/tests/init.py
@@ -0,0 +1,3 @@
 from . import test_tesseract_adapter
 from . import test_invoice_ocr_flow
 from . import test_field_parser
--- a/fusion_accounting_ocr/tests/test_field_parser.py
+++ b/fusion_accounting_ocr/tests/test_field_parser.py
@@ -0,0 +1,74 @@
 from unittest.mock import MagicMock
 from odoo.tests import tagged
 from odoo.tests.common import TransactionCase
 from odoo.addons.fusion_accounting_ocr.services.invoice_field_parser import (
    parse_invoice_fields,
 )
@tagged('post_install', '-at_install')
class TestFieldParser(TransactionCase):
    """Checks parse_invoice_fields() against stubbed LLM providers."""

    # Well-formed reply, exactly as a compliant provider would send it.
    CLEAN_REPLY = (
        '{"vendor_name": "Acme Co", "invoice_number": "INV-1",'
        ' "invoice_date": "2026-04-20", "due_date": null,'
        ' "currency": "CAD", "subtotal": 90.0, "tax_total": 10.0,'
        ' "total": 100.0, "line_items": ['
        '{"description": "Widget", "quantity": 1, "unit_price": 90.0,'
        ' "amount": 90.0}]}'
    )
    # Same kind of payload, wrapped in a markdown code fence.
    FENCED_REPLY = (
        '```json\n'
        '{"vendor_name": "Beta Ltd", "invoice_number": "B-2",'
        ' "invoice_date": null, "due_date": null, "currency": null,'
        ' "subtotal": null, "tax_total": null, "total": 5.5,'
        ' "line_items": []}\n'
        '```'
    )

    def _parse_reply(self, reply):
        """Parse 'raw text' with a provider whose reply content is ``reply``."""
        stub = MagicMock()
        stub.complete.return_value = {'content': reply}
        return parse_invoice_fields(self.env, 'raw text', provider=stub)

    def _assert_blank(self, outcome):
        """Assert the all-null shape: no total and no line items."""
        self.assertIsNone(outcome['total'])
        self.assertEqual(outcome['line_items'], [])

    def test_parser_handles_empty_text(self):
        self._assert_blank(parse_invoice_fields(self.env, ''))

    def test_parser_handles_no_provider_gracefully(self):
        # With no LLM provider configured the parser must hand back the
        # result dict instead of crashing.
        outcome = parse_invoice_fields(self.env, 'INVOICE 12345 Total $100')
        for key in ('total', 'line_items'):
            self.assertIn(key, outcome)
        self.assertIsInstance(outcome['line_items'], list)

    def test_parser_consumes_clean_json(self):
        outcome = self._parse_reply(self.CLEAN_REPLY)
        self.assertEqual(outcome['vendor_name'], 'Acme Co')
        self.assertEqual(outcome['invoice_number'], 'INV-1')
        self.assertEqual(outcome['total'], 100.0)
        items = outcome['line_items']
        self.assertEqual(len(items), 1)
        self.assertEqual(items[0]['description'], 'Widget')

    def test_parser_strips_markdown_fences(self):
        outcome = self._parse_reply(self.FENCED_REPLY)
        self.assertEqual(outcome['vendor_name'], 'Beta Ltd')
        self.assertEqual(outcome['total'], 5.5)

    def test_parser_returns_empty_on_invalid_json(self):
        self._assert_blank(self._parse_reply('not json at all'))

    def test_parser_returns_empty_on_provider_exception(self):
        failing = MagicMock()
        failing.complete.side_effect = RuntimeError('boom')
        self._assert_blank(
            parse_invoice_fields(self.env, 'raw text', provider=failing)
        )
--- a/fusion_accounting_ocr/tests/test_invoice_ocr_flow.py
+++ b/fusion_accounting_ocr/tests/test_invoice_ocr_flow.py
@@ -0,0 +1,117 @@
 import base64
 import io
 from unittest.mock import patch
 from PIL import Image, ImageDraw
 from odoo.exceptions import UserError
 from odoo.tests import tagged
 from odoo.tests.common import TransactionCase
@tagged('post_install', '-at_install')
class TestInvoiceOcrFlow(TransactionCase):
    """Flow tests for account.move.action_request_ocr() on a vendor bill."""

    # Name under which account_move imported the stage-2 parser; patched so
    # that no real LLM call is made.
    PARSER_TARGET = (
        'odoo.addons.fusion_accounting_ocr.models.account_move.parse_invoice_fields'
    )

    def setUp(self):
        super().setUp()
        self.partner = self.env['res.partner'].create({
            'name': 'Test Vendor',
            'supplier_rank': 1,
        })
        self.move = self.env['account.move'].create({
            'move_type': 'in_invoice',
            'partner_id': self.partner.id,
        })

    def _attach_png(self, filename, image):
        """Attach ``image`` to the test bill as a PNG attachment."""
        stream = io.BytesIO()
        image.save(stream, format='PNG')
        return self.env['ir.attachment'].create({
            'name': filename,
            'datas': base64.b64encode(stream.getvalue()),
            'res_model': 'account.move',
            'res_id': self.move.id,
            'mimetype': 'image/png',
        })

    def _request_ocr_with_parsed(self, **known):
        """Run OCR on the test bill with the parser stubbed.

        The stub returns an all-null result overridden by ``known``.
        """
        parsed = {
            'vendor_name': None,
            'invoice_number': None,
            'invoice_date': None,
            'due_date': None,
            'currency': None,
            'subtotal': None,
            'tax_total': None,
            'total': None,
            'line_items': [],
        }
        parsed.update(known)
        with patch(self.PARSER_TARGET, return_value=parsed):
            self.move.action_request_ocr()

    def test_ocr_state_default(self):
        self.assertEqual(self.move.ocr_state, 'not_requested')

    def test_action_request_ocr_no_attachment_raises(self):
        with self.assertRaises(UserError):
            self.move.action_request_ocr()

    def test_action_request_ocr_with_image(self):
        # A TTF font makes the rendered text easier for tesseract; fall back
        # to PIL's default font when it is not installed.
        try:
            from PIL import ImageFont
            font = ImageFont.truetype(
                '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 36,
            )
        except Exception:
            font = None
        picture = Image.new('RGB', (800, 120), color='white')
        ImageDraw.Draw(picture).text(
            (20, 30), "TOTAL $50.00 INV-9999", fill='black', font=font,
        )
        self._attach_png('test_invoice.png', picture)

        self._request_ocr_with_parsed(invoice_number='INV-9999', total=50.0)

        move = self.move
        self.assertEqual(move.ocr_state, 'done')
        self.assertEqual(move.ocr_backend, 'tesseract')
        self.assertGreater(move.ocr_confidence, 0)
        self.assertIsNotNone(move.ocr_extracted_data)
        # The parsed invoice number ends up in the bill's ref field.
        self.assertEqual(move.ref, 'INV-9999')
        # Exactly one OCR log row was written.
        self.assertEqual(len(move.ocr_log_ids), 1)
        log = move.ocr_log_ids
        self.assertEqual(log.backend, 'tesseract')
        self.assertGreater(log.raw_text_length, 0)

    def test_apply_does_not_overwrite_user_entered_ref(self):
        self.move.ref = 'USER-SET-REF'
        picture = Image.new('RGB', (400, 80), color='white')
        ImageDraw.Draw(picture).text((10, 30), "INV-7777", fill='black')
        self._attach_png('t.png', picture)

        self._request_ocr_with_parsed(invoice_number='INV-7777')

        # A reference typed in by the user must survive the OCR run.
        self.assertEqual(self.move.ref, 'USER-SET-REF')

    def test_only_vendor_bills_supported(self):
        customer_invoice = self.env['account.move'].create({
            'move_type': 'out_invoice',
            'partner_id': self.partner.id,
        })
        with self.assertRaises(UserError):
            customer_invoice.action_request_ocr()
--- a/fusion_accounting_ocr/tests/test_tesseract_adapter.py
+++ b/fusion_accounting_ocr/tests/test_tesseract_adapter.py
@@ -0,0 +1,47 @@
 import io
 from PIL import Image, ImageDraw
 from odoo.tests import tagged
 from odoo.tests.common import TransactionCase
 from odoo.addons.fusion_accounting_ocr.services.ocr_providers.tesseract_adapter import (
    TesseractAdapter,
 )
@tagged('post_install', '-at_install')
 class TestTesseractAdapter(TransactionCase):
    def test_is_available(self):
        """The Tesseract adapter reports itself as available.

        Relies on the test container shipping tesseract, pytesseract and
        pdf2image pre-installed.
        """
        backend_ready = TesseractAdapter.is_available()
        self.assertTrue(backend_ready)
    def test_extract_simple_text_image(self):
        """OCR a generated PNG and check the result metadata and digits.

        The image is rendered with a large TrueType font when one can be
        loaded, which helps Tesseract; otherwise the default bitmap font is
        used.
        """
        sample_text = "INVOICE 12345 Total $100"
        canvas = Image.new('RGB', (800, 120), color='white')
        pen = ImageDraw.Draw(canvas)

        # Best-effort font lookup: any failure falls back to the default font.
        try:
            from PIL import ImageFont
            font = ImageFont.truetype(
                '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 36,
            )
        except Exception:
            font = None
        pen.text((20, 30), sample_text, fill='black', font=font)

        stream = io.BytesIO()
        canvas.save(stream, format='PNG')

        result = TesseractAdapter().extract(stream.getvalue(), mimetype='image/png')

        expected_attrs = (
            ('backend', 'tesseract'),
            ('error', ''),
            ('pages', 1),
        )
        for attr_name, expected in expected_attrs:
            self.assertEqual(getattr(result, attr_name), expected)
        self.assertGreater(len(result.raw_text), 0)

        # Tesseract should pick up the digits at minimum; ignore any spaces
        # it inserts between characters.
        compact_text = result.raw_text.replace(' ', '')
        self.assertIn('12345', compact_text)
--- a/fusion_accounting_ocr/views/account_move_views.xml
+++ b/fusion_accounting_ocr/views/account_move_views.xml
@@ -0,0 +1,45 @@
 <?xml version="1.0" encoding="utf-8"?>
 <odoo>
    <!-- Extends the standard account.move form with the Fusion OCR buttons
         (in the header) and a read-only "Fusion OCR" result group (in the
         sheet). Everything is hidden unless the move is a vendor bill or
         vendor refund. -->
    <record id="view_move_form_inherit_fusion_ocr" model="ir.ui.view">
        <field name="name">account.move.form.inherit.fusion_ocr</field>
        <field name="model">account.move</field>
        <field name="inherit_id" ref="account.view_move_form"/>
        <field name="arch" type="xml">
            <xpath expr="//header" position="inside">
                <!-- First-time request: hidden while processing and in every
                     state where the "Re-run OCR" button below is shown, so the
                     two buttons (which call the same method) never appear
                     side by side. -->
                <button name="action_request_ocr"
                        type="object"
                        string="Request OCR"
                        class="oe_highlight"
                        invisible="move_type not in ('in_invoice', 'in_refund') or ocr_state in ('processing', 'done', 'failed', 'manual')"/>
                <!-- Repeat request: only offered once a previous run ended in
                     done, failed or manual. -->
                <button name="action_request_ocr"
                        type="object"
                        string="Re-run OCR"
                        invisible="move_type not in ('in_invoice', 'in_refund') or ocr_state not in ('done', 'failed', 'manual')"/>
            </xpath>
            <xpath expr="//sheet" position="inside">
                <!-- Result panel: shown only after OCR has been requested at
                     least once. -->
                <group string="Fusion OCR"
                       invisible="move_type not in ('in_invoice', 'in_refund') or ocr_state == 'not_requested'">
                    <group>
                        <field name="ocr_state" widget="badge"
                               decoration-success="ocr_state == 'done'"
                               decoration-info="ocr_state == 'processing'"
                               decoration-warning="ocr_state == 'manual'"
                               decoration-danger="ocr_state == 'failed'"/>
                        <field name="ocr_backend" readonly="1"/>
                        <!-- NOTE(review): the percentage widget presumably
                             expects a 0-1 ratio; confirm ocr_confidence is
                             stored on that scale. -->
                        <field name="ocr_confidence" readonly="1" widget="percentage"/>
                    </group>
                    <group>
                        <field name="ocr_extracted_data" readonly="1" widget="text"/>
                    </group>
                    <field name="ocr_raw_text" readonly="1" nolabel="1"
                           placeholder="Raw OCR text..."/>
                </group>
            </xpath>
        </field>
    </record>
 </odoo>
--- a/fusion_accounting_ocr/views/res_config_settings_views.xml
+++ b/fusion_accounting_ocr/views/res_config_settings_views.xml
@@ -0,0 +1,35 @@
 <?xml version="1.0" encoding="utf-8"?>
 <odoo>
    <!-- Extends the accounting settings form with a "Fusion Invoice OCR"
         block holding the enable switch, the default backend selector and
         the auto-run toggle. -->
    <record id="res_config_settings_view_form_inherit_fusion_ocr" model="ir.ui.view">
        <field name="name">res.config.settings.view.form.inherit.fusion_ocr</field>
        <field name="model">res.config.settings</field>
        <field name="inherit_id" ref="account.res_config_settings_view_form"/>
        <field name="arch" type="xml">
            <!-- Insert the new block right after the vendor bills block. -->
            <xpath expr="//block[@id='account_vendor_bills']" position="after">
                <block title="Fusion Invoice OCR" id="fusion_ocr_section">
                    <setting id="fusion_ocr_enabled_setting"
                             string="Enable Invoice OCR"
                             help="OCR vendor bill attachments via the configured backend.">
                        <field name="fusion_ocr_enabled"/>
                        <!-- Sub-options are hidden until OCR is enabled. -->
                        <div class="content-group" invisible="not fusion_ocr_enabled">
                            <div class="mt16">
                                <label for="fusion_ocr_default_backend"
                                       string="Default OCR Backend" class="o_light_label"/>
                                <field name="fusion_ocr_default_backend"/>
                            </div>
                            <div class="mt16">
                                <field name="fusion_ocr_auto_run"/>
                                <label for="fusion_ocr_auto_run"
                                       string="Auto-run OCR on attachment"/>
                            </div>
                        </div>
                    </setting>
                </block>
            </xpath>
        </field>
    </record>
 </odoo>
		`@@ -0,0 +1,2 @@`
							`from . import models`
							`from . import controllers`