changes
This commit is contained in:
@@ -0,0 +1,40 @@
"""OCRProvider contract - every backend must conform.

Mirrors the LLMProvider pattern in fusion_accounting_ai. Future adapters
(Mindee, Google Document AI, Ollama-vision) drop in alongside the default
tesseract adapter without touching account.move.
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class OCRResult:
    """Value object returned by an OCR backend's ``extract()`` call.

    Every field has a default, so ``OCRResult()`` is a valid empty result.
    Field order is part of the positional-constructor interface and must
    not be rearranged.
    """

    # Text recognised in the document.
    raw_text: str = ''
    # Recognition confidence on a 0.0–1.0 scale.
    confidence: float = 0.0  # 0.0–1.0
    # Page count; presumably the number of pages processed - TODO confirm.
    pages: int = 0
    # Identifier of the backend that produced this result; presumably the
    # provider's ``name`` - TODO confirm against the adapters.
    backend: str = ''
    # Error description; presumably empty on success - TODO confirm.
    error: str = ''
    # Free-form extra data. ``default_factory`` gives each instance its own
    # dict instead of one shared mutable default.
    metadata: dict = field(default_factory=dict)
|
||||
|
||||
|
||||
class OCRProvider(ABC):
    """Abstract OCR backend. Subclasses implement extract()."""

    # Backend identifier; concrete subclasses are expected to override it.
    name: str = 'base'

    @abstractmethod
    def extract(self, image_or_pdf_bytes: bytes, *, mimetype: str = 'application/pdf') -> OCRResult:
        """Extract text from raw bytes.

        ``mimetype`` hints whether to PDF-render (poppler) or image-decode
        (PIL) the bytes. Implementations should still inspect the byte
        signature for safety.

        Args:
            image_or_pdf_bytes: Raw content of the document to process.
            mimetype: Keyword-only content-type hint; defaults to
                ``'application/pdf'``.

        Returns:
            An ``OCRResult`` describing the extraction outcome.
        """
        ...

    @classmethod
    def is_available(cls) -> bool:
        """Return True if the backend's runtime deps are present.

        The base implementation always returns True; a backend with
        optional dependencies can override this to probe for them.
        """
        return True
|
||||
Reference in New Issue
Block a user