41 lines
1.1 KiB
Python
41 lines
1.1 KiB
Python
"""OCRProvider contract - every backend must conform.
|
||
|
||
Mirrors the LLMProvider pattern in fusion_accounting_ai. Future adapters
|
||
(Mindee, Google Document AI, Ollama-vision) drop in alongside the default
|
||
tesseract adapter without touching account.move.
|
||
"""
|
||
|
||
from abc import ABC, abstractmethod
|
||
from dataclasses import dataclass, field
|
||
|
||
|
||
@dataclass
class OCRResult:
    """Value object describing the outcome of one OCR run.

    Every field has a default, so ``OCRResult()`` is a valid "empty"
    result and callers may populate only the fields they know.
    """

    # NOTE(review): field meanings below are inferred from the names and
    # the provider contract; confirm against the concrete adapters.
    raw_text: str = ''        # presumably the extracted text
    confidence: float = 0.0   # 0.0–1.0
    pages: int = 0            # presumably the number of pages processed
    backend: str = ''         # presumably the producing provider's name
    error: str = ''           # presumably a failure message; '' by default
    # default_factory gives each instance its own dict rather than a
    # single mutable default shared by all results.
    metadata: dict = field(default_factory=dict)
|
||
|
||
|
||
class OCRProvider(ABC):
    """Abstract OCR backend. Subclasses implement extract().

    The class cannot be instantiated directly because ``extract`` is
    abstract; concrete adapters override it and typically set ``name``.
    """

    # Identifier of the backend; the base class uses the placeholder 'base'.
    name: str = 'base'

    @abstractmethod
    def extract(self, image_or_pdf_bytes: bytes, *, mimetype: str = 'application/pdf') -> OCRResult:
        """Extract text from raw bytes.

        ``mimetype`` hints whether to PDF-render (poppler) or image-decode
        (PIL) the bytes. Implementations should still inspect the byte
        signature for safety.

        Args:
            image_or_pdf_bytes: Raw content of the document to process.
            mimetype: Keyword-only content-type hint; defaults to
                ``'application/pdf'``.

        Returns:
            An ``OCRResult`` describing the extraction outcome.
        """
        ...

    @classmethod
    def is_available(cls) -> bool:
        """Return True if the backend's runtime deps are present.

        The base implementation always returns True; backends with
        optional dependencies are expected to override it.
        """
        return True
|