This commit is contained in:
gsinghpal
2026-05-16 13:18:52 -04:00
parent 191a9c82be
commit 9ebf89bde2
1080 changed files with 0 additions and 1197 deletions

View File

@@ -0,0 +1,40 @@
"""OCRProvider contract - every backend must conform.
Mirrors the LLMProvider pattern in fusion_accounting_ai. Future adapters
(Mindee, Google Document AI, Ollama-vision) drop in alongside the default
tesseract adapter without touching account.move.
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
@dataclass
class OCRResult:
    """Value object returned by an OCR backend's ``extract()`` call.

    Every field has a default, so ``OCRResult()`` is a valid "empty" result.
    Field meanings below that are not stated by the code itself are marked
    as assumptions to be confirmed against the concrete adapters.
    """

    # Text recovered from the document (presumably the concatenation of all
    # pages - confirm against the adapters).
    raw_text: str = ''
    # Recognition confidence, expected range 0.0-1.0.
    confidence: float = 0.0
    # Page count; presumably the number of pages processed - TODO confirm.
    pages: int = 0
    # Presumably the ``name`` of the provider that produced this result.
    backend: str = ''
    # Presumably a failure description, empty on success - verify with callers.
    error: str = ''
    # Free-form extras; default_factory gives each instance its own dict
    # instead of one shared mutable default.
    metadata: dict = field(default_factory=dict)
class OCRProvider(ABC):
    """Abstract OCR backend. Subclasses implement extract().

    The class cannot be instantiated directly because ``extract`` is
    abstract; a concrete adapter must override it.
    """

    # Backend identifier; 'base' is the placeholder for the abstract class.
    # Concrete adapters are presumably expected to override it - confirm.
    name: str = 'base'

    @abstractmethod
    def extract(
        self,
        image_or_pdf_bytes: bytes,
        *,
        mimetype: str = 'application/pdf',
    ) -> OCRResult:
        """Extract text from raw bytes.

        ``mimetype`` hints whether to PDF-render (poppler) or image-decode
        (PIL) the bytes. Implementations should still inspect the byte
        signature for safety.

        Args:
            image_or_pdf_bytes: Raw content of the document to OCR.
            mimetype: Keyword-only content-type hint; defaults to
                ``'application/pdf'``.

        Returns:
            An ``OCRResult`` describing the extraction.
        """
        ...

    @classmethod
    def is_available(cls) -> bool:
        """Return True if the backend's runtime deps are present.

        The base implementation unconditionally returns True; it is a
        classmethod, so it can be called without instantiating the backend.
        """
        return True