changes
This commit is contained in:
@@ -0,0 +1,3 @@
|
||||
from . import base
|
||||
from . import tesseract_adapter
|
||||
from . import manual_adapter
|
||||
@@ -0,0 +1,40 @@
|
||||
"""OCRProvider contract - every backend must conform.
|
||||
|
||||
Mirrors the LLMProvider pattern in fusion_accounting_ai. Future adapters
|
||||
(Mindee, Google Document AI, Ollama-vision) drop in alongside the default
|
||||
tesseract adapter without touching account.move.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class OCRResult:
    """Outcome of a single OCR run.

    Every field has a default, so ``OCRResult()`` is a valid empty result.
    """

    # Text recovered from the document.
    raw_text: str = ""
    # Score on a 0.0-1.0 scale.
    confidence: float = 0.0
    # Number of pages that were processed.
    pages: int = 0
    # Name of the backend that produced this result.
    backend: str = ""
    # Error message; the empty string means no error was recorded.
    error: str = ""
    # Free-form extras; each instance gets its own fresh dict.
    metadata: dict = field(default_factory=dict)
|
||||
|
||||
|
||||
class OCRProvider(ABC):
    """Base class for OCR backends; concrete adapters override ``extract``."""

    # Backend identifier; subclasses replace it with their own name.
    name: str = 'base'

    @classmethod
    def is_available(cls) -> bool:
        """Report whether the backend's runtime dependencies are present.

        The base implementation always answers ``True``; adapters that
        rely on optional packages or binaries override it.
        """
        return True

    @abstractmethod
    def extract(
        self,
        image_or_pdf_bytes: bytes,
        *,
        mimetype: str = 'application/pdf',
    ) -> OCRResult:
        """Extract text from the raw bytes of an image or PDF.

        Args:
            image_or_pdf_bytes: The document content.
            mimetype: Hint for whether the bytes should be PDF-rendered
                (poppler) or image-decoded (PIL). Implementations should
                still inspect the byte signature for safety.

        Returns:
            An ``OCRResult`` describing the run.
        """
        ...
|
||||
@@ -0,0 +1,13 @@
|
||||
"""Manual fallback adapter - no real OCR, just marks the document as
|
||||
'awaiting manual entry'. Used when no real OCR backend is available
|
||||
or when the user explicitly disables OCR.
|
||||
"""
|
||||
|
||||
from .base import OCRProvider, OCRResult
|
||||
|
||||
|
||||
class ManualAdapter(OCRProvider):
    """Fallback backend that performs no recognition at all."""

    name = 'manual'

    def extract(self, image_or_pdf_bytes, *, mimetype='application/pdf'):
        """Return an empty ``OCRResult`` tagged with the ``'manual'`` backend.

        Both arguments are accepted to satisfy the provider interface and
        are otherwise ignored.
        """
        empty_result = OCRResult(
            backend='manual',
            pages=0,
            confidence=0.0,
            raw_text='',
        )
        return empty_result
|
||||
@@ -0,0 +1,71 @@
|
||||
"""Tesseract OCR adapter.
|
||||
|
||||
Uses the system tesseract binary via pytesseract, with poppler-backed
|
||||
PDF rendering via pdf2image. Inside the container these are pre-installed:
|
||||
- tesseract-ocr 5.3.4
|
||||
- pytesseract 0.3.13
|
||||
- pdf2image 1.17.0
|
||||
- poppler-utils
|
||||
"""
|
||||
|
||||
import io
|
||||
import logging
|
||||
|
||||
from .base import OCRProvider, OCRResult
|
||||
|
||||
# Module-level logger, named after this module's import path.
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TesseractAdapter(OCRProvider):
    """OCR backend that runs tesseract through pytesseract.

    PDF input is rasterised with pdf2image; any other input is opened as an
    image with PIL. The third-party imports are deferred to call time so
    this module can be imported even when they are not installed.
    """

    name = 'tesseract'

    @classmethod
    def is_available(cls) -> bool:
        """Return True if pytesseract, pdf2image, PIL and tesseract all load.

        Any failure (missing package, missing binary, ...) is logged at
        debug level and reported as ``False`` rather than raised.
        """
        try:
            import pytesseract
            from pdf2image import convert_from_bytes  # noqa: F401
            from PIL import Image  # noqa: F401

            # Importing pytesseract succeeds without the binary, so ask for
            # the version to confirm tesseract itself can be invoked.
            pytesseract.get_tesseract_version()
            return True
        except Exception as e:
            _logger.debug("TesseractAdapter not available: %s", e)
            return False

    def extract(self, image_or_pdf_bytes, *, mimetype='application/pdf'):
        """Run tesseract over a PDF or image and return the recognised text.

        Args:
            image_or_pdf_bytes: Raw document content.
            mimetype: Type hint; the input is treated as a PDF when this is
                ``'application/pdf'`` or the bytes start with ``%PDF``.

        Returns:
            An ``OCRResult``. On any failure, including missing OCR
            dependencies, the result has empty text, zero confidence and
            pages, and ``error`` set to the exception message.
        """
        try:
            # Imported inside the try so that a missing dependency becomes
            # an error result like every other failure, not an ImportError.
            import pytesseract
            from pdf2image import convert_from_bytes
            from PIL import Image

            is_pdf = (
                mimetype == 'application/pdf'
                or (image_or_pdf_bytes[:4] == b'%PDF')
            )
            if is_pdf:
                page_images = convert_from_bytes(image_or_pdf_bytes, dpi=200)
                texts = [pytesseract.image_to_string(p) for p in page_images]
            else:
                # Keep the image open only for the duration of the OCR call
                # so its underlying resources are released afterwards.
                with Image.open(io.BytesIO(image_or_pdf_bytes)) as img:
                    texts = [pytesseract.image_to_string(img)]

            # Pages are joined with a form feed on its own line.
            full_text = '\n\f\n'.join(texts)

            # Heuristic confidence - tesseract has a per-word conf in
            # image_to_data, but a length proxy is fine for routing
            # decisions. Future: use pytesseract.image_to_data for a real
            # average word-level confidence.
            conf = min(1.0, len(full_text) / 1000.0)
            return OCRResult(
                raw_text=full_text,
                confidence=conf,
                pages=len(texts),
                backend='tesseract',
            )
        except Exception as e:
            _logger.warning("Tesseract OCR failed: %s", e)
            return OCRResult(
                raw_text='', confidence=0.0, pages=0,
                backend='tesseract', error=str(e),
            )
|
||||
Reference in New Issue
Block a user