Initial commit
This commit is contained in:
614
fusion_accounts/models/ai_bill_extractor.py
Normal file
614
fusion_accounts/models/ai_bill_extractor.py
Normal file
@@ -0,0 +1,614 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright 2026 Nexa Systems Inc.
|
||||
# License OPL-1 (Odoo Proprietary License v1.0)
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
|
||||
from odoo import models
|
||||
|
||||
# Module-level logger; messages are tagged with this module's dotted path.
_logger = logging.getLogger(__name__)

# System prompt sent verbatim to the OpenAI chat-completions endpoint.
# NOTE: the JSON key names spelled out here ("vendor_name", "invoice_number",
# "lines", ...) are exactly what apply_extracted_data() reads back — keep the
# prompt and that method in sync if either changes.
EXTRACTION_PROMPT = """You are an accounts payable assistant. Extract billing information from the attached invoice/bill document and email.

IMPORTANT RULES:
- The PDF attachment is the PRIMARY source of truth. Always prefer data from the PDF over the email body.
- "vendor_name" = the company that ISSUED the invoice/bill (the seller/supplier name on the document), NOT the email sender.
- "invoice_number" = the Invoice Number, Bill Number, Reference Number, or Sales Order Number printed on the document.
- "invoice_date" = the date the invoice was issued (not the email date).
- "due_date" = the payment due date on the invoice.
- For line items, extract each product/service line with description, quantity, unit price, and line total.

Return ONLY valid JSON with this exact structure (use null for missing values):
{
  "vendor_name": "string - the company name that issued the bill",
  "invoice_number": "string - invoice/bill/reference number",
  "invoice_date": "YYYY-MM-DD",
  "due_date": "YYYY-MM-DD",
  "currency": "CAD or USD",
  "subtotal": 0.00,
  "tax_amount": 0.00,
  "total_amount": 0.00,
  "po_reference": "string or null - any PO reference on the document",
  "lines": [
    {
      "description": "string",
      "quantity": 1.0,
      "unit_price": 0.00,
      "amount": 0.00
    }
  ]
}

If you cannot determine a value, use null. For lines, include as many as you can find.
Do NOT include any text outside the JSON object."""
|
||||
|
||||
|
||||
class AIBillExtractor(models.AbstractModel):
    """Stateless service model: extracts vendor-bill data from inbound
    emails and their attachments via the OpenAI chat-completions API,
    then applies the result to draft ``account.move`` records.

    All tunables (API key, model, page limit, on/off switch) live in
    ``ir.config_parameter`` so they can be changed without a restart.
    """
    _name = 'fusion.accounts.ai.extractor'
    _description = 'AI Bill Data Extractor'

    def _get_api_key(self):
        """Return the OpenAI API key from settings ('' when unset)."""
        return self.env['ir.config_parameter'].sudo().get_param(
            'fusion_accounts.openai_api_key', ''
        )

    def _get_ai_model(self):
        """Return the configured AI model name (default 'gpt-4o-mini')."""
        return self.env['ir.config_parameter'].sudo().get_param(
            'fusion_accounts.ai_model', 'gpt-4o-mini'
        )

    def _get_max_pages(self):
        """Return the max number of PDF pages to process (default 2).

        Falls back to 2 when the stored parameter is missing or not an
        integer, so a bad admin entry can never break extraction.
        """
        try:
            return int(self.env['ir.config_parameter'].sudo().get_param(
                'fusion_accounts.ai_max_pages', '2'
            ))
        except (ValueError, TypeError):
            return 2

    def _is_ai_enabled(self):
        """Return True when AI extraction is enabled (default enabled).

        The parameter stores the literal string 'True'; any other value
        (including 'true' or '1') disables extraction.
        """
        return self.env['ir.config_parameter'].sudo().get_param(
            'fusion_accounts.ai_enabled', 'True'
        ) == 'True'
||||
|
||||
    def extract_bill_data_from_raw(self, email_body, raw_attachments=None):
        """Extract bill data using raw attachments from msg_dict.

        Raw attachments come as a list that can contain:
        - tuples: (filename, content_bytes, info_dict)
        - ir.attachment records (if already created)

        Args:
            email_body: HTML email body
            raw_attachments: list from msg_dict['attachments']

        Returns:
            dict with extracted data, or empty dict on failure
        """
        # Guard clauses: feature switch, credentials, and HTTP client.
        if not self._is_ai_enabled():
            _logger.info("AI extraction is disabled")
            return {}

        api_key = self._get_api_key()
        if not api_key:
            _logger.warning("No OpenAI API key configured")
            return {}

        try:
            import requests as req_lib
        except ImportError:
            _logger.error("requests library not available")
            return {}

        clean_body = self._strip_html(email_body or '')
        content_parts = []      # OpenAI multimodal "content" array, built below
        has_pdf_content = False  # True once any document/image part was added

        # Process raw attachments from msg_dict.  Each entry may be one of
        # three shapes; normalize all of them to (fname, content, mimetype).
        if raw_attachments:
            for att in raw_attachments[:3]:  # cap at 3 attachments per email
                fname = ''
                content = None

                if hasattr(att, 'datas'):
                    # ir.attachment record: datas is base64-encoded.
                    fname = att.name or ''
                    content = base64.b64decode(att.datas) if att.datas else None
                    mimetype = att.mimetype or ''
                elif hasattr(att, 'fname') and hasattr(att, 'content'):
                    # Odoo Attachment namedtuple (fname, content, info);
                    # content is already raw bytes.
                    fname = att.fname or ''
                    content = att.content if isinstance(att.content, bytes) else None
                    mimetype = getattr(att, 'info', {}).get('content_type', '') if hasattr(att, 'info') and att.info else ''
                elif isinstance(att, (tuple, list)) and len(att) >= 2:
                    # Plain (filename, content_bytes, ...) tuple.
                    fname = att[0] or ''
                    content = att[1] if isinstance(att[1], bytes) else None
                    mimetype = ''
                else:
                    continue  # unrecognized shape: skip rather than crash

                # Determine mimetype from the filename extension if not set.
                if not mimetype:
                    if fname.lower().endswith('.pdf'):
                        mimetype = 'application/pdf'
                    elif fname.lower().endswith(('.png', '.jpg', '.jpeg')):
                        mimetype = 'image/' + fname.rsplit('.', 1)[-1].lower()

                if not content:
                    continue  # nothing decodable for this attachment

                _logger.info("Processing attachment: %s (%d bytes)", fname, len(content))

                if fname.lower().endswith('.pdf') or mimetype == 'application/pdf':
                    # Preferred path: render PDF pages to PNGs for the
                    # vision model; falls back to plain text extraction
                    # when rendering (PyMuPDF) is unavailable or fails.
                    pdf_images = self._pdf_bytes_to_images(content)
                    if pdf_images:
                        has_pdf_content = True
                        for img_data in pdf_images:
                            content_parts.append({
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{img_data}",
                                    "detail": "high",
                                }
                            })
                    else:
                        # Fallback: text extraction (truncated to bound tokens).
                        pdf_text = self._pdf_bytes_to_text(content)
                        if pdf_text:
                            has_pdf_content = True
                            content_parts.append({
                                "type": "text",
                                "text": f"INVOICE/BILL DOCUMENT:\n{pdf_text[:8000]}"
                            })
                elif mimetype.startswith('image/'):
                    # Image attachments go straight to the vision model.
                    has_pdf_content = True
                    img_b64 = base64.b64encode(content).decode()
                    content_parts.append({
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mimetype};base64,{img_b64}",
                            "detail": "high",
                        }
                    })

        # Email body is only secondary context: it gets a larger budget when
        # it is the sole source, a smaller one when a document was attached.
        if clean_body and not has_pdf_content:
            content_parts.append({
                "type": "text",
                "text": f"EMAIL BODY (no invoice attachment):\n{clean_body[:5000]}"
            })
        elif clean_body and has_pdf_content:
            content_parts.append({
                "type": "text",
                "text": f"ADDITIONAL CONTEXT FROM EMAIL:\n{clean_body[:2000]}"
            })

        if not content_parts:
            _logger.info("No content to extract from")
            return {}

        # Call OpenAI API with a system prompt plus the multimodal user parts.
        model = self._get_ai_model()
        messages = [
            {"role": "system", "content": EXTRACTION_PROMPT},
            {"role": "user", "content": content_parts},
        ]

        try:
            response = req_lib.post(
                'https://api.openai.com/v1/chat/completions',
                headers={
                    'Authorization': f'Bearer {api_key}',
                    'Content-Type': 'application/json',
                },
                json={
                    'model': model,
                    'messages': messages,
                    'max_tokens': 2000,
                    # Low temperature: we want deterministic data extraction.
                    'temperature': 0.1,
                },
                timeout=60,
            )
            response.raise_for_status()
            result = response.json()
            content = result['choices'][0]['message']['content']

            # Strip an optional ```json ... ``` markdown fence the model
            # sometimes wraps around the payload despite the prompt.
            content = content.strip()
            if content.startswith('```'):
                lines = content.split('\n')
                content = '\n'.join(lines[1:-1] if lines[-1].strip() == '```' else lines[1:])
                content = content.strip()

            if not content:
                _logger.warning("AI returned empty response")
                return {}

            extracted = json.loads(content)
            _logger.info("AI extraction successful: %s", json.dumps(extracted, indent=2)[:500])
            return extracted

        except Exception as e:
            # Deliberate best-effort boundary: any network/HTTP/JSON failure
            # is logged and reported as "nothing extracted".
            _logger.error("AI extraction failed: %s", e)
            return {}
|
||||
|
||||
def _pdf_bytes_to_images(self, pdf_bytes):
|
||||
"""Convert raw PDF bytes to base64 PNG images."""
|
||||
max_pages = self._get_max_pages()
|
||||
images = []
|
||||
try:
|
||||
import fitz
|
||||
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
||||
for page_num in range(min(len(doc), max_pages)):
|
||||
page = doc[page_num]
|
||||
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
|
||||
img_data = base64.b64encode(pix.tobytes("png")).decode()
|
||||
images.append(img_data)
|
||||
_logger.info("Converted PDF page %d to image (%d bytes)", page_num + 1, len(img_data))
|
||||
doc.close()
|
||||
except ImportError:
|
||||
_logger.warning("PyMuPDF not available")
|
||||
except Exception as e:
|
||||
_logger.warning("PDF to image failed: %s", e)
|
||||
return images
|
||||
|
||||
def _pdf_bytes_to_text(self, pdf_bytes):
|
||||
"""Extract text from raw PDF bytes."""
|
||||
max_pages = self._get_max_pages()
|
||||
try:
|
||||
import fitz
|
||||
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
||||
parts = []
|
||||
for page_num in range(min(len(doc), max_pages)):
|
||||
parts.append(doc[page_num].get_text())
|
||||
doc.close()
|
||||
return '\n'.join(parts)
|
||||
except Exception:
|
||||
return ''
|
||||
|
||||
def extract_bill_data(self, email_body, attachments=None):
|
||||
"""Extract bill data from email body and attachments using OpenAI.
|
||||
|
||||
Args:
|
||||
email_body: Plain text or HTML email body
|
||||
attachments: List of ir.attachment records
|
||||
|
||||
Returns:
|
||||
dict with extracted data, or empty dict on failure
|
||||
"""
|
||||
if not self._is_ai_enabled():
|
||||
_logger.info("AI extraction is disabled")
|
||||
return {}
|
||||
|
||||
api_key = self._get_api_key()
|
||||
if not api_key:
|
||||
_logger.warning("No OpenAI API key configured for Fusion Accounts")
|
||||
return {}
|
||||
|
||||
try:
|
||||
import requests
|
||||
except ImportError:
|
||||
_logger.error("requests library not available")
|
||||
return {}
|
||||
|
||||
# Clean HTML from email body
|
||||
clean_body = self._strip_html(email_body or '')
|
||||
|
||||
# Build messages for OpenAI
|
||||
messages = [
|
||||
{"role": "system", "content": EXTRACTION_PROMPT},
|
||||
]
|
||||
|
||||
# Build content -- PDF attachments FIRST (primary source), email body second
|
||||
content_parts = []
|
||||
has_pdf_content = False
|
||||
|
||||
# Add PDF/image attachments first (these are the invoice documents)
|
||||
if attachments:
|
||||
for attachment in attachments[:3]: # Max 3 attachments
|
||||
if attachment.mimetype == 'application/pdf':
|
||||
# Try image conversion first (best for AI vision)
|
||||
pdf_images = self._pdf_to_images(attachment)
|
||||
if pdf_images:
|
||||
has_pdf_content = True
|
||||
for img_data in pdf_images:
|
||||
content_parts.append({
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/png;base64,{img_data}",
|
||||
"detail": "high",
|
||||
}
|
||||
})
|
||||
else:
|
||||
# Fallback: extract text from PDF
|
||||
pdf_text = self._pdf_to_text(attachment)
|
||||
if pdf_text:
|
||||
has_pdf_content = True
|
||||
content_parts.append({
|
||||
"type": "text",
|
||||
"text": f"INVOICE/BILL DOCUMENT:\n{pdf_text[:8000]}"
|
||||
})
|
||||
elif attachment.mimetype in ('image/png', 'image/jpeg', 'image/jpg'):
|
||||
has_pdf_content = True
|
||||
img_b64 = base64.b64encode(base64.b64decode(attachment.datas)).decode()
|
||||
content_parts.append({
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:{attachment.mimetype};base64,{img_b64}",
|
||||
"detail": "high",
|
||||
}
|
||||
})
|
||||
|
||||
# Add email body as secondary context (only if no PDF content found)
|
||||
if clean_body and not has_pdf_content:
|
||||
content_parts.append({
|
||||
"type": "text",
|
||||
"text": f"EMAIL BODY (no invoice attachment found):\n{clean_body[:5000]}"
|
||||
})
|
||||
elif clean_body and has_pdf_content:
|
||||
content_parts.append({
|
||||
"type": "text",
|
||||
"text": f"ADDITIONAL CONTEXT FROM EMAIL:\n{clean_body[:2000]}"
|
||||
})
|
||||
|
||||
if not content_parts:
|
||||
_logger.info("No content to extract from")
|
||||
return {}
|
||||
|
||||
messages.append({"role": "user", "content": content_parts})
|
||||
|
||||
# Call OpenAI API
|
||||
model = self._get_ai_model()
|
||||
try:
|
||||
response = requests.post(
|
||||
'https://api.openai.com/v1/chat/completions',
|
||||
headers={
|
||||
'Authorization': f'Bearer {api_key}',
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
json={
|
||||
'model': model,
|
||||
'messages': messages,
|
||||
'max_tokens': 2000,
|
||||
'temperature': 0.1,
|
||||
},
|
||||
timeout=60,
|
||||
)
|
||||
response.raise_for_status()
|
||||
result = response.json()
|
||||
content = result['choices'][0]['message']['content']
|
||||
|
||||
# Parse JSON from response -- handle markdown code fences
|
||||
content = content.strip()
|
||||
if content.startswith('```'):
|
||||
# Remove ```json ... ``` wrapper
|
||||
lines = content.split('\n')
|
||||
content = '\n'.join(lines[1:-1] if lines[-1].strip() == '```' else lines[1:])
|
||||
content = content.strip()
|
||||
|
||||
if not content:
|
||||
_logger.warning("AI returned empty response")
|
||||
return {}
|
||||
|
||||
extracted = json.loads(content)
|
||||
_logger.info("AI extraction successful: %s", json.dumps(extracted, indent=2)[:500])
|
||||
return extracted
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
_logger.error("OpenAI API request failed: %s", e)
|
||||
return {}
|
||||
except (json.JSONDecodeError, KeyError, IndexError) as e:
|
||||
_logger.warning("Failed to parse AI response: %s (content: %s)", e, content[:200] if content else 'empty')
|
||||
return {}
|
||||
|
||||
def apply_extracted_data(self, move, extracted_data):
|
||||
"""Apply AI-extracted data to a draft vendor bill.
|
||||
|
||||
The PDF/invoice is the source of truth for:
|
||||
- Vendor name (matched to Odoo contact)
|
||||
- Invoice/bill number (ref)
|
||||
- Invoice date, due date
|
||||
- Line items
|
||||
|
||||
Args:
|
||||
move: account.move record (draft vendor bill)
|
||||
extracted_data: dict from extract_bill_data()
|
||||
"""
|
||||
if not extracted_data:
|
||||
return
|
||||
|
||||
vals = {}
|
||||
|
||||
# --- Vendor matching from AI-extracted vendor name ---
|
||||
# This overrides the email sender match because the PDF
|
||||
# shows the actual billing company (e.g., "Canada Computers Inc.")
|
||||
ai_vendor_name = extracted_data.get('vendor_name')
|
||||
if ai_vendor_name:
|
||||
partner = self._match_vendor_by_name(ai_vendor_name)
|
||||
if partner:
|
||||
vals['partner_id'] = partner.id
|
||||
_logger.info("AI vendor match: '%s' -> %s (id=%d)",
|
||||
ai_vendor_name, partner.name, partner.id)
|
||||
|
||||
# Invoice reference (vendor's invoice/bill/SO number)
|
||||
if extracted_data.get('invoice_number'):
|
||||
vals['ref'] = extracted_data['invoice_number']
|
||||
|
||||
# Invoice date
|
||||
if extracted_data.get('invoice_date'):
|
||||
try:
|
||||
from datetime import datetime
|
||||
vals['invoice_date'] = datetime.strptime(
|
||||
extracted_data['invoice_date'], '%Y-%m-%d'
|
||||
).date()
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
# Due date
|
||||
if extracted_data.get('due_date'):
|
||||
try:
|
||||
from datetime import datetime
|
||||
vals['invoice_date_due'] = datetime.strptime(
|
||||
extracted_data['due_date'], '%Y-%m-%d'
|
||||
).date()
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
if vals:
|
||||
try:
|
||||
move.write(vals)
|
||||
_logger.info("Applied AI data to bill %s: %s", move.id, vals)
|
||||
except Exception as e:
|
||||
_logger.error("Failed to apply AI data to bill %s: %s", move.id, e)
|
||||
|
||||
# Add invoice lines if extracted
|
||||
lines = extracted_data.get('lines', [])
|
||||
if lines and not move.invoice_line_ids:
|
||||
line_vals_list = []
|
||||
for line in lines[:20]: # Max 20 lines
|
||||
line_vals = {
|
||||
'move_id': move.id,
|
||||
'name': line.get('description', 'Extracted line'),
|
||||
'quantity': line.get('quantity', 1.0),
|
||||
'price_unit': line.get('unit_price', 0.0),
|
||||
}
|
||||
line_vals_list.append(line_vals)
|
||||
|
||||
if line_vals_list:
|
||||
try:
|
||||
move.write({
|
||||
'invoice_line_ids': [(0, 0, lv) for lv in line_vals_list]
|
||||
})
|
||||
_logger.info("Added %d AI-extracted lines to bill %s",
|
||||
len(line_vals_list), move.id)
|
||||
except Exception as e:
|
||||
_logger.error("Failed to add lines to bill %s: %s", move.id, e)
|
||||
|
||||
def _match_vendor_by_name(self, vendor_name):
|
||||
"""Match AI-extracted vendor name to an Odoo partner.
|
||||
|
||||
Tries multiple strategies:
|
||||
1. Exact name match
|
||||
2. Commercial company name match
|
||||
3. Partial/contains match (only if single result)
|
||||
|
||||
Returns: res.partner record or False
|
||||
"""
|
||||
if not vendor_name or len(vendor_name) < 3:
|
||||
return False
|
||||
|
||||
Partner = self.env['res.partner'].sudo()
|
||||
vendor_name = vendor_name.strip()
|
||||
|
||||
# Level 1: Exact name match
|
||||
partner = Partner.search([
|
||||
('name', '=ilike', vendor_name),
|
||||
('supplier_rank', '>', 0),
|
||||
], limit=1)
|
||||
if partner:
|
||||
return partner
|
||||
|
||||
# Level 2: Exact name match without supplier_rank filter
|
||||
partner = Partner.search([
|
||||
('name', '=ilike', vendor_name),
|
||||
], limit=1)
|
||||
if partner:
|
||||
return partner
|
||||
|
||||
# Level 3: Commercial company name match
|
||||
partner = Partner.search([
|
||||
('commercial_company_name', '=ilike', vendor_name),
|
||||
], limit=1)
|
||||
if partner:
|
||||
return partner
|
||||
|
||||
# Level 4: Contains match (only accept single result to avoid false positives)
|
||||
partners = Partner.search([
|
||||
'|',
|
||||
('name', 'ilike', vendor_name),
|
||||
('commercial_company_name', 'ilike', vendor_name),
|
||||
])
|
||||
if len(partners) == 1:
|
||||
return partners
|
||||
|
||||
# Level 5: Try without common suffixes (Inc, Ltd, Corp, etc.)
|
||||
clean_name = vendor_name
|
||||
for suffix in [' Inc', ' Inc.', ' Ltd', ' Ltd.', ' Corp', ' Corp.',
|
||||
' Co', ' Co.', ' LLC', ' Company', ' Limited']:
|
||||
if clean_name.lower().endswith(suffix.lower()):
|
||||
clean_name = clean_name[:len(clean_name) - len(suffix)].strip()
|
||||
break
|
||||
|
||||
if clean_name != vendor_name and len(clean_name) >= 3:
|
||||
partners = Partner.search([
|
||||
'|',
|
||||
('name', 'ilike', clean_name),
|
||||
('commercial_company_name', 'ilike', clean_name),
|
||||
])
|
||||
if len(partners) == 1:
|
||||
return partners
|
||||
|
||||
_logger.info("No vendor match for AI-extracted name: '%s'", vendor_name)
|
||||
return False
|
||||
|
||||
def _strip_html(self, html):
|
||||
"""Strip HTML tags from text."""
|
||||
clean = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL)
|
||||
clean = re.sub(r'<script[^>]*>.*?</script>', '', clean, flags=re.DOTALL)
|
||||
clean = re.sub(r'<[^>]+>', ' ', clean)
|
||||
clean = re.sub(r'\s+', ' ', clean).strip()
|
||||
return clean
|
||||
|
||||
def _pdf_to_images(self, attachment):
|
||||
"""Convert PDF attachment pages to base64 PNG images using PyMuPDF."""
|
||||
max_pages = self._get_max_pages()
|
||||
images = []
|
||||
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
pdf_data = base64.b64decode(attachment.datas)
|
||||
doc = fitz.open(stream=pdf_data, filetype="pdf")
|
||||
for page_num in range(min(len(doc), max_pages)):
|
||||
page = doc[page_num]
|
||||
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x zoom for readability
|
||||
img_data = base64.b64encode(pix.tobytes("png")).decode()
|
||||
images.append(img_data)
|
||||
_logger.info("Converted PDF page %d to image (%d bytes)", page_num + 1, len(img_data))
|
||||
doc.close()
|
||||
except ImportError:
|
||||
_logger.warning("PyMuPDF not available, will try text extraction fallback")
|
||||
except Exception as e:
|
||||
_logger.warning("PDF to image conversion failed: %s", e)
|
||||
|
||||
return images
|
||||
|
||||
def _pdf_to_text(self, attachment):
|
||||
"""Extract text content from PDF as fallback when image conversion fails."""
|
||||
max_pages = self._get_max_pages()
|
||||
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
pdf_data = base64.b64decode(attachment.datas)
|
||||
doc = fitz.open(stream=pdf_data, filetype="pdf")
|
||||
text_parts = []
|
||||
for page_num in range(min(len(doc), max_pages)):
|
||||
page = doc[page_num]
|
||||
text_parts.append(page.get_text())
|
||||
doc.close()
|
||||
full_text = '\n'.join(text_parts)
|
||||
if full_text.strip():
|
||||
_logger.info("Extracted %d chars of text from PDF", len(full_text))
|
||||
return full_text
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception as e:
|
||||
_logger.warning("PDF text extraction failed: %s", e)
|
||||
|
||||
return ''
|
||||
Reference in New Issue
Block a user