# -*- coding: utf-8 -*-
# Copyright 2026 Nexa Systems Inc.
# License OPL-1 (Odoo Proprietary License v1.0)

import base64
import json
import logging
import re
from datetime import datetime

from odoo import models

_logger = logging.getLogger(__name__)

EXTRACTION_PROMPT = """You are an accounts payable assistant. Extract billing information from the attached invoice/bill document and email.

IMPORTANT RULES:
- The PDF attachment is the PRIMARY source of truth. Always prefer data from the PDF over the email body.
- "vendor_name" = the company that ISSUED the invoice/bill (the seller/supplier name on the document), NOT the email sender.
- "invoice_number" = the Invoice Number, Bill Number, Reference Number, or Sales Order Number printed on the document.
- "invoice_date" = the date the invoice was issued (not the email date).
- "due_date" = the payment due date on the invoice.
- For line items, extract each product/service line with description, quantity, unit price, and line total.

Return ONLY valid JSON with this exact structure (use null for missing values):
{
  "vendor_name": "string - the company name that issued the bill",
  "invoice_number": "string - invoice/bill/reference number",
  "invoice_date": "YYYY-MM-DD",
  "due_date": "YYYY-MM-DD",
  "currency": "CAD or USD",
  "subtotal": 0.00,
  "tax_amount": 0.00,
  "total_amount": 0.00,
  "po_reference": "string or null - any PO reference on the document",
  "lines": [
    {
      "description": "string",
      "quantity": 1.0,
      "unit_price": 0.00,
      "amount": 0.00
    }
  ]
}

If you cannot determine a value, use null. For lines, include as many as you can find.
Do NOT include any text outside the JSON object."""


class AIBillExtractor(models.AbstractModel):
    """Extract vendor-bill data from incoming emails via the OpenAI API.

    Two entry points build the same multimodal request:

    * :meth:`extract_bill_data_from_raw` -- raw attachments from ``msg_dict``
      (tuples / namedtuples / ir.attachment records), used during mail intake.
    * :meth:`extract_bill_data` -- already-created ``ir.attachment`` records.

    :meth:`apply_extracted_data` then writes the extracted values onto a
    draft ``account.move``.
    """

    _name = 'fusion.accounts.ai.extractor'
    _description = 'AI Bill Data Extractor'

    # ------------------------------------------------------------------
    # Configuration helpers
    # ------------------------------------------------------------------

    def _get_api_key(self):
        """Return the OpenAI API key from system parameters ('' if unset)."""
        return self.env['ir.config_parameter'].sudo().get_param(
            'fusion_accounts.openai_api_key', ''
        )

    def _get_ai_model(self):
        """Return the configured AI model name (default: gpt-4o-mini)."""
        return self.env['ir.config_parameter'].sudo().get_param(
            'fusion_accounts.ai_model', 'gpt-4o-mini'
        )

    def _get_max_pages(self):
        """Return the max number of PDF pages to process (default: 2)."""
        try:
            return int(self.env['ir.config_parameter'].sudo().get_param(
                'fusion_accounts.ai_max_pages', '2'
            ))
        except (ValueError, TypeError):
            return 2

    def _is_ai_enabled(self):
        """Return True when AI extraction is enabled in settings."""
        return self.env['ir.config_parameter'].sudo().get_param(
            'fusion_accounts.ai_enabled', 'True'
        ) == 'True'

    # ------------------------------------------------------------------
    # Shared extraction plumbing
    # ------------------------------------------------------------------

    def _precheck(self):
        """Run the common guards for both extraction entry points.

        Returns:
            (api_key, requests_module) when extraction can proceed,
            (None, None) when it cannot (disabled, no key, or the
            ``requests`` library is missing). Each failure is logged.
        """
        if not self._is_ai_enabled():
            _logger.info("AI extraction is disabled")
            return None, None
        api_key = self._get_api_key()
        if not api_key:
            _logger.warning("No OpenAI API key configured")
            return None, None
        try:
            import requests
        except ImportError:
            _logger.error("requests library not available")
            return None, None
        return api_key, requests

    @staticmethod
    def _image_part(mimetype, b64_data):
        """Build an OpenAI vision content part from base64 image data."""
        return {
            "type": "image_url",
            "image_url": {
                "url": f"data:{mimetype};base64,{b64_data}",
                "detail": "high",
            },
        }

    def _pdf_parts(self, pdf_bytes):
        """Turn raw PDF bytes into content parts for the AI request.

        Prefers rendering pages to images (best for vision models) and
        falls back to plain-text extraction when rendering fails.

        Returns:
            (parts, has_content) -- list of content parts and a flag
            telling whether any usable document content was produced.
        """
        parts = []
        images = self._pdf_bytes_to_images(pdf_bytes)
        if images:
            for img_data in images:
                parts.append(self._image_part('image/png', img_data))
            return parts, True
        pdf_text = self._pdf_bytes_to_text(pdf_bytes)
        if pdf_text:
            parts.append({
                "type": "text",
                "text": f"INVOICE/BILL DOCUMENT:\n{pdf_text[:8000]}",
            })
            return parts, True
        return parts, False

    @staticmethod
    def _body_parts(clean_body, has_pdf_content):
        """Build the email-body content part (secondary context only)."""
        if not clean_body:
            return []
        if has_pdf_content:
            # The PDF is the primary source; trim the body harder.
            return [{
                "type": "text",
                "text": f"ADDITIONAL CONTEXT FROM EMAIL:\n{clean_body[:2000]}",
            }]
        return [{
            "type": "text",
            "text": f"EMAIL BODY (no invoice attachment found):\n{clean_body[:5000]}",
        }]

    @staticmethod
    def _strip_code_fence(content):
        """Remove a surrounding markdown ```...``` fence, if present."""
        if content.startswith('```'):
            lines = content.split('\n')
            # Drop the opening ```json line; drop the closing ``` if there.
            content = '\n'.join(
                lines[1:-1] if lines[-1].strip() == '```' else lines[1:]
            ).strip()
        return content

    def _call_openai(self, requests_mod, api_key, content_parts):
        """POST the extraction request to OpenAI and parse the JSON reply.

        Args:
            requests_mod: the imported ``requests`` module
            api_key: OpenAI API key
            content_parts: user-message content parts (images/text)
        Returns:
            dict with extracted data, or {} on any failure (logged).
        """
        messages = [
            {"role": "system", "content": EXTRACTION_PROMPT},
            {"role": "user", "content": content_parts},
        ]
        # Initialized up-front so the parse-error log below can never hit
        # an unbound local when the failure happens before assignment.
        content = ''
        try:
            response = requests_mod.post(
                'https://api.openai.com/v1/chat/completions',
                headers={
                    'Authorization': f'Bearer {api_key}',
                    'Content-Type': 'application/json',
                },
                json={
                    'model': self._get_ai_model(),
                    'messages': messages,
                    'max_tokens': 2000,
                    'temperature': 0.1,  # low temperature for extraction fidelity
                },
                timeout=60,
            )
            response.raise_for_status()
            result = response.json()
            content = result['choices'][0]['message']['content'].strip()
            content = self._strip_code_fence(content)
            if not content:
                _logger.warning("AI returned empty response")
                return {}
            extracted = json.loads(content)
            _logger.info("AI extraction successful: %s",
                         json.dumps(extracted, indent=2)[:500])
            return extracted
        except requests_mod.exceptions.RequestException as e:
            _logger.error("OpenAI API request failed: %s", e)
            return {}
        except Exception as e:
            _logger.warning("Failed to parse AI response: %s (content: %s)",
                            e, content[:200] if content else 'empty')
            return {}

    # ------------------------------------------------------------------
    # Public extraction entry points
    # ------------------------------------------------------------------

    @staticmethod
    def _guess_mimetype(fname):
        """Infer a mimetype from the filename extension ('' when unknown)."""
        lower = fname.lower()
        if lower.endswith('.pdf'):
            return 'application/pdf'
        if lower.endswith(('.png', '.jpg', '.jpeg')):
            return 'image/' + lower.rsplit('.', 1)[-1]
        return ''

    @staticmethod
    def _unpack_raw_attachment(att):
        """Normalize one raw attachment into (fname, content_bytes, mimetype).

        Supports ir.attachment records, Odoo Attachment namedtuples
        (fname, content, info) and plain (filename, content, ...) tuples.
        Returns content=None when the attachment cannot be used.
        """
        if hasattr(att, 'datas'):
            # ir.attachment record
            content = base64.b64decode(att.datas) if att.datas else None
            return att.name or '', content, att.mimetype or ''
        if hasattr(att, 'fname') and hasattr(att, 'content'):
            # Odoo Attachment namedtuple (fname, content, info)
            content = att.content if isinstance(att.content, bytes) else None
            info = getattr(att, 'info', None)
            mimetype = info.get('content_type', '') if info else ''
            return att.fname or '', content, mimetype
        if isinstance(att, (tuple, list)) and len(att) >= 2:
            # (filename, content_bytes, ...) tuple
            content = att[1] if isinstance(att[1], bytes) else None
            return att[0] or '', content, ''
        return '', None, ''

    def extract_bill_data_from_raw(self, email_body, raw_attachments=None):
        """Extract bill data using raw attachments from msg_dict.

        Raw attachments come as a list that can contain:
        - tuples: (filename, content_bytes, info_dict)
        - ir.attachment records (if already created)

        Args:
            email_body: HTML email body
            raw_attachments: list from msg_dict['attachments']
        Returns:
            dict with extracted data, or empty dict on failure
        """
        api_key, requests_mod = self._precheck()
        if not api_key:
            return {}

        clean_body = self._strip_html(email_body or '')
        content_parts = []
        has_pdf_content = False

        for att in (raw_attachments or [])[:3]:  # max 3 attachments
            fname, content, mimetype = self._unpack_raw_attachment(att)
            if not content:
                continue
            if not mimetype:
                mimetype = self._guess_mimetype(fname)
            _logger.info("Processing attachment: %s (%d bytes)",
                         fname, len(content))
            if fname.lower().endswith('.pdf') or mimetype == 'application/pdf':
                parts, found = self._pdf_parts(content)
                content_parts.extend(parts)
                has_pdf_content = has_pdf_content or found
            elif mimetype.startswith('image/'):
                has_pdf_content = True
                content_parts.append(self._image_part(
                    mimetype, base64.b64encode(content).decode()))

        content_parts.extend(self._body_parts(clean_body, has_pdf_content))
        if not content_parts:
            _logger.info("No content to extract from")
            return {}
        return self._call_openai(requests_mod, api_key, content_parts)

    def extract_bill_data(self, email_body, attachments=None):
        """Extract bill data from email body and attachments using OpenAI.

        Args:
            email_body: Plain text or HTML email body
            attachments: List of ir.attachment records
        Returns:
            dict with extracted data, or empty dict on failure
        """
        api_key, requests_mod = self._precheck()
        if not api_key:
            return {}

        clean_body = self._strip_html(email_body or '')
        content_parts = []
        has_pdf_content = False

        # PDF/image attachments FIRST (primary source), email body second.
        for attachment in (attachments or [])[:3]:  # max 3 attachments
            if attachment.mimetype == 'application/pdf':
                if not attachment.datas:
                    continue
                parts, found = self._pdf_parts(
                    base64.b64decode(attachment.datas))
                content_parts.extend(parts)
                has_pdf_content = has_pdf_content or found
            elif attachment.mimetype in ('image/png', 'image/jpeg', 'image/jpg'):
                if not attachment.datas:
                    continue
                has_pdf_content = True
                # Decode/re-encode to normalize the base64 payload.
                img_b64 = base64.b64encode(
                    base64.b64decode(attachment.datas)).decode()
                content_parts.append(
                    self._image_part(attachment.mimetype, img_b64))

        content_parts.extend(self._body_parts(clean_body, has_pdf_content))
        if not content_parts:
            _logger.info("No content to extract from")
            return {}
        return self._call_openai(requests_mod, api_key, content_parts)

    # ------------------------------------------------------------------
    # Applying extracted data to the bill
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_iso_date(value):
        """Parse a 'YYYY-MM-DD' string into a date, or None when invalid."""
        if not value:
            return None
        try:
            return datetime.strptime(value, '%Y-%m-%d').date()
        except (ValueError, TypeError):
            return None

    def apply_extracted_data(self, move, extracted_data):
        """Apply AI-extracted data to a draft vendor bill.

        The PDF/invoice is the source of truth for:
        - Vendor name (matched to Odoo contact)
        - Invoice/bill number (ref)
        - Invoice date, due date
        - Line items

        Args:
            move: account.move record (draft vendor bill)
            extracted_data: dict from extract_bill_data()
        """
        if not extracted_data:
            return

        vals = {}

        # --- Vendor matching from AI-extracted vendor name ---
        # This overrides the email sender match because the PDF
        # shows the actual billing company (e.g., "Canada Computers Inc.")
        ai_vendor_name = extracted_data.get('vendor_name')
        if ai_vendor_name:
            partner = self._match_vendor_by_name(ai_vendor_name)
            if partner:
                vals['partner_id'] = partner.id
                _logger.info("AI vendor match: '%s' -> %s (id=%d)",
                             ai_vendor_name, partner.name, partner.id)

        # Invoice reference (vendor's invoice/bill/SO number)
        if extracted_data.get('invoice_number'):
            vals['ref'] = extracted_data['invoice_number']

        invoice_date = self._parse_iso_date(extracted_data.get('invoice_date'))
        if invoice_date:
            vals['invoice_date'] = invoice_date
        due_date = self._parse_iso_date(extracted_data.get('due_date'))
        if due_date:
            vals['invoice_date_due'] = due_date

        if vals:
            try:
                move.write(vals)
                _logger.info("Applied AI data to bill %s: %s", move.id, vals)
            except Exception as e:
                _logger.error("Failed to apply AI data to bill %s: %s",
                              move.id, e)

        # Add invoice lines if extracted. The model may emit JSON null for
        # 'lines' or for individual fields, so every access must tolerate
        # explicit None values (dict.get defaults only cover missing keys).
        lines = extracted_data.get('lines') or []
        if lines and not move.invoice_line_ids:
            line_vals_list = []
            for line in lines[:20]:  # max 20 lines
                quantity = line.get('quantity')
                unit_price = line.get('unit_price')
                line_vals_list.append({
                    'move_id': move.id,
                    'name': line.get('description') or 'Extracted line',
                    'quantity': 1.0 if quantity is None else quantity,
                    'price_unit': 0.0 if unit_price is None else unit_price,
                })
            if line_vals_list:
                try:
                    move.write({
                        'invoice_line_ids':
                            [(0, 0, lv) for lv in line_vals_list],
                    })
                    _logger.info("Added %d AI-extracted lines to bill %s",
                                 len(line_vals_list), move.id)
                except Exception as e:
                    _logger.error("Failed to add lines to bill %s: %s",
                                  move.id, e)

    # ------------------------------------------------------------------
    # Vendor matching
    # ------------------------------------------------------------------

    def _match_vendor_by_name(self, vendor_name):
        """Match AI-extracted vendor name to an Odoo partner.

        Tries multiple strategies:
        1. Exact name match (suppliers first, then any partner)
        2. Commercial company name match
        3. Partial/contains match (only if single result)
        4. Same contains match with common company suffixes stripped

        Returns:
            res.partner record or False
        """
        if not vendor_name or len(vendor_name) < 3:
            return False

        Partner = self.env['res.partner'].sudo()
        vendor_name = vendor_name.strip()

        # Level 1: Exact name match among suppliers
        partner = Partner.search([
            ('name', '=ilike', vendor_name),
            ('supplier_rank', '>', 0),
        ], limit=1)
        if partner:
            return partner

        # Level 2: Exact name match without supplier_rank filter
        partner = Partner.search([
            ('name', '=ilike', vendor_name),
        ], limit=1)
        if partner:
            return partner

        # Level 3: Commercial company name match
        partner = Partner.search([
            ('commercial_company_name', '=ilike', vendor_name),
        ], limit=1)
        if partner:
            return partner

        # Level 4: Contains match (only accept a single result to avoid
        # false positives)
        partners = Partner.search([
            '|',
            ('name', 'ilike', vendor_name),
            ('commercial_company_name', 'ilike', vendor_name),
        ])
        if len(partners) == 1:
            return partners

        # Level 5: Try without common suffixes (Inc, Ltd, Corp, etc.)
        clean_name = vendor_name
        for suffix in [' Inc', ' Inc.', ' Ltd', ' Ltd.', ' Corp', ' Corp.',
                       ' Co', ' Co.', ' LLC', ' Company', ' Limited']:
            if clean_name.lower().endswith(suffix.lower()):
                clean_name = clean_name[:len(clean_name) - len(suffix)].strip()
                break
        if clean_name != vendor_name and len(clean_name) >= 3:
            partners = Partner.search([
                '|',
                ('name', 'ilike', clean_name),
                ('commercial_company_name', 'ilike', clean_name),
            ])
            if len(partners) == 1:
                return partners

        _logger.info("No vendor match for AI-extracted name: '%s'",
                     vendor_name)
        return False

    # ------------------------------------------------------------------
    # HTML / PDF helpers
    # ------------------------------------------------------------------

    def _strip_html(self, html):
        """Strip HTML tags from text, dropping <style>/<script> bodies.

        Style and script element *content* must be removed entirely
        (it is code, not message text) before the generic tag strip.
        """
        clean = re.sub(r'<style[^>]*>.*?</style>', '', html,
                       flags=re.DOTALL | re.IGNORECASE)
        clean = re.sub(r'<script[^>]*>.*?</script>', '', clean,
                       flags=re.DOTALL | re.IGNORECASE)
        clean = re.sub(r'<[^>]+>', ' ', clean)
        clean = re.sub(r'\s+', ' ', clean).strip()
        return clean

    def _pdf_bytes_to_images(self, pdf_bytes):
        """Convert raw PDF bytes to base64 PNG images using PyMuPDF.

        Processes at most ``_get_max_pages()`` pages. Returns [] when
        PyMuPDF is unavailable or rendering fails (logged).
        """
        max_pages = self._get_max_pages()
        images = []
        try:
            import fitz  # PyMuPDF
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            for page_num in range(min(len(doc), max_pages)):
                page = doc[page_num]
                # 2x zoom renders small invoice text legibly for the model.
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                img_data = base64.b64encode(pix.tobytes("png")).decode()
                images.append(img_data)
                _logger.info("Converted PDF page %d to image (%d bytes)",
                             page_num + 1, len(img_data))
            doc.close()
        except ImportError:
            _logger.warning("PyMuPDF not available")
        except Exception as e:
            _logger.warning("PDF to image failed: %s", e)
        return images

    def _pdf_bytes_to_text(self, pdf_bytes):
        """Extract text from raw PDF bytes; '' on any failure."""
        max_pages = self._get_max_pages()
        try:
            import fitz  # PyMuPDF
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            parts = [doc[page_num].get_text()
                     for page_num in range(min(len(doc), max_pages))]
            doc.close()
            return '\n'.join(parts)
        except Exception:
            return ''

    def _pdf_to_images(self, attachment):
        """Convert a PDF ir.attachment's pages to base64 PNG images."""
        if not attachment.datas:
            return []
        return self._pdf_bytes_to_images(base64.b64decode(attachment.datas))

    def _pdf_to_text(self, attachment):
        """Extract text from a PDF ir.attachment (fallback path)."""
        if not attachment.datas:
            return ''
        full_text = self._pdf_bytes_to_text(base64.b64decode(attachment.datas))
        if full_text.strip():
            _logger.info("Extracted %d chars of text from PDF", len(full_text))
        return full_text