This commit is contained in:
gsinghpal
2026-05-16 13:18:52 -04:00
parent 191a9c82be
commit 9ebf89bde2
1080 changed files with 0 additions and 1197 deletions

View File

@@ -0,0 +1,3 @@
from . import base
from . import tesseract_adapter
from . import manual_adapter

View File

@@ -0,0 +1,40 @@
"""OCRProvider contract - every backend must conform.
Mirrors the LLMProvider pattern in fusion_accounting_ai. Future adapters
(Mindee, Google Document AI, Ollama-vision) drop in alongside the default
tesseract adapter without touching account.move.
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
@dataclass
class OCRResult:
    """Outcome of a single OCR run, as returned by ``OCRProvider.extract``.

    Every field has a default, so ``OCRResult()`` is a valid empty result.
    """

    # Text recognised in the document; empty when nothing was extracted.
    raw_text: str = ''
    # Backend-reported confidence, expected range 0.0-1.0.
    confidence: float = 0.0
    # Number of pages (or images) that were processed.
    pages: int = 0
    # Identifier of the backend that produced this result.
    backend: str = ''
    # Failure description; empty string when no error was recorded.
    error: str = ''
    # Free-form, backend-specific extras. Built per instance so results
    # never share one dict.
    metadata: dict = field(default_factory=dict)
class OCRProvider(ABC):
    """Base class for OCR backends.

    Concrete backends subclass this, set ``name`` and implement
    ``extract()``; ``is_available()`` may be overridden to probe for
    runtime dependencies.
    """

    # Identifier of the backend; subclasses override it.
    name: str = 'base'

    @classmethod
    def is_available(cls) -> bool:
        """Report whether the backend's runtime dependencies are present.

        The base implementation has nothing to check and always answers
        ``True``.
        """
        return True

    @abstractmethod
    def extract(
        self,
        image_or_pdf_bytes: bytes,
        *,
        mimetype: str = 'application/pdf',
    ) -> OCRResult:
        """Extract text from the raw bytes of a document.

        ``mimetype`` is a hint telling the backend whether the bytes
        should be rendered as a PDF (poppler) or decoded as an image
        (PIL). Implementations should still inspect the byte signature
        for safety rather than trusting the hint alone.
        """

View File

@@ -0,0 +1,13 @@
"""Manual fallback adapter - no real OCR, just marks the document as
'awaiting manual entry'. Used when no real OCR backend is available
or when the user explicitly disables OCR.
"""
from .base import OCRProvider, OCRResult
class ManualAdapter(OCRProvider):
    """Fallback backend that performs no OCR at all."""

    name = 'manual'

    def extract(self, image_or_pdf_bytes, *, mimetype='application/pdf'):
        """Return an empty result tagged with the ``'manual'`` backend.

        Both arguments are accepted for interface compatibility and are
        otherwise ignored.
        """
        empty_result = OCRResult(
            raw_text='',
            confidence=0.0,
            pages=0,
            backend='manual',
        )
        return empty_result

View File

@@ -0,0 +1,71 @@
"""Tesseract OCR adapter.
Uses the system tesseract binary via pytesseract, with poppler-backed
PDF rendering via pdf2image. Inside the container these are pre-installed:
- tesseract-ocr 5.3.4
- pytesseract 0.3.13
- pdf2image 1.17.0
- poppler-utils
"""
import io
import logging
from .base import OCRProvider, OCRResult
_logger = logging.getLogger(__name__)
class TesseractAdapter(OCRProvider):
    """OCR backend built on pytesseract, pdf2image and PIL.

    PDF input is rasterised with ``pdf2image.convert_from_bytes``; any
    other input is opened with ``PIL.Image.open``. Each resulting image is
    passed to ``pytesseract.image_to_string``.

    ``extract`` does not raise for ordinary failures (bad input, missing
    dependency, OCR error): they are logged and reported through
    ``OCRResult.error``.
    """

    name = 'tesseract'

    # Resolution (DPI) used when rasterising PDF pages. Subclasses may
    # override it.
    pdf_dpi = 200

    # Text length at which the heuristic confidence saturates at 1.0.
    confidence_full_length = 1000.0

    # Leading byte signatures of common raster formats. Used to overrule a
    # wrong (or defaulted) ``mimetype`` hint.
    _IMAGE_SIGNATURES = (
        b'\x89PNG\r\n\x1a\n',  # PNG
        b'\xff\xd8\xff',       # JPEG
        b'GIF87a',             # GIF
        b'GIF89a',
        b'II*\x00',            # TIFF, little-endian
        b'MM\x00*',            # TIFF, big-endian
    )

    @classmethod
    def is_available(cls) -> bool:
        """Return True if the Python packages and the tesseract binary load.

        Any failure while importing or while querying the tesseract version
        is logged at debug level and reported as "not available".
        """
        try:
            import pytesseract
            from pdf2image import convert_from_bytes  # noqa: F401
            from PIL import Image  # noqa: F401
            pytesseract.get_tesseract_version()
            return True
        except Exception as e:
            _logger.debug("TesseractAdapter not available: %s", e)
            return False

    @classmethod
    def _is_pdf(cls, data, mimetype):
        """Decide whether ``data`` should go through the PDF renderer.

        The byte signature wins over the ``mimetype`` hint: a ``%PDF``
        header always means PDF and a known image signature always means
        image. Only when the signature is inconclusive is the hint used.
        """
        head = bytes(data[:8])
        if head.startswith(b'%PDF'):
            return True
        if head.startswith(cls._IMAGE_SIGNATURES):
            # Covers callers that pass image bytes but leave ``mimetype``
            # at its 'application/pdf' default.
            return False
        return mimetype == 'application/pdf'

    def extract(self, image_or_pdf_bytes, *, mimetype='application/pdf'):
        """OCR a PDF or image and return the recognised text.

        Args:
            image_or_pdf_bytes: Raw bytes of the document.
            mimetype: Hint for the content type; only consulted when the
                byte signature does not identify the content.

        Returns:
            An ``OCRResult`` with ``backend='tesseract'``. On success it
            carries the page texts joined by ``'\\n\\f\\n'``, a heuristic
            confidence and the page count. On failure the text is empty,
            confidence and pages are zero, and ``error`` holds the message.
        """
        try:
            if not image_or_pdf_bytes:
                raise ValueError("no document bytes to OCR")

            # Imported inside the try so that a missing dependency is
            # reported through OCRResult.error like every other failure.
            import pytesseract
            from pdf2image import convert_from_bytes
            from PIL import Image

            if self._is_pdf(image_or_pdf_bytes, mimetype):
                pages = convert_from_bytes(image_or_pdf_bytes, dpi=self.pdf_dpi)
            else:
                pages = [Image.open(io.BytesIO(image_or_pdf_bytes))]

            try:
                texts = [pytesseract.image_to_string(page) for page in pages]
            finally:
                # Release image buffers and file handles once OCR is done.
                for page in pages:
                    page.close()

            full_text = '\n\f\n'.join(texts)
            # Heuristic confidence - tesseract has a per-word conf in
            # image_to_data, but a length proxy is fine for routing
            # decisions. Future: use pytesseract.image_to_data for a real
            # average word-level confidence.
            conf = min(1.0, len(full_text) / self.confidence_full_length)
            return OCRResult(
                raw_text=full_text,
                confidence=conf,
                pages=len(pages),
                backend='tesseract',
            )
        except Exception as e:
            _logger.warning("Tesseract OCR failed: %s", e)
            return OCRResult(
                raw_text='', confidence=0.0, pages=0,
                backend='tesseract', error=str(e),
            )