"""Tesseract OCR adapter. Uses the system tesseract binary via pytesseract, with poppler-backed PDF rendering via pdf2image. Inside the container these are pre-installed: - tesseract-ocr 5.3.4 - pytesseract 0.3.13 - pdf2image 1.17.0 - poppler-utils """ import io import logging from .base import OCRProvider, OCRResult _logger = logging.getLogger(__name__) class TesseractAdapter(OCRProvider): name = 'tesseract' @classmethod def is_available(cls) -> bool: try: import pytesseract from pdf2image import convert_from_bytes # noqa: F401 from PIL import Image # noqa: F401 pytesseract.get_tesseract_version() return True except Exception as e: _logger.debug("TesseractAdapter not available: %s", e) return False def extract(self, image_or_pdf_bytes, *, mimetype='application/pdf'): import pytesseract from pdf2image import convert_from_bytes from PIL import Image try: is_pdf = ( mimetype == 'application/pdf' or (image_or_pdf_bytes[:4] == b'%PDF') ) if is_pdf: pages = convert_from_bytes(image_or_pdf_bytes, dpi=200) else: img = Image.open(io.BytesIO(image_or_pdf_bytes)) pages = [img] texts = [] for p in pages: texts.append(pytesseract.image_to_string(p)) full_text = '\n\f\n'.join(texts) # Heuristic confidence - tesseract has a per-word conf in # image_to_data, but a length proxy is fine for routing # decisions. Future: use pytesseract.image_to_data for a real # average word-level confidence. conf = min(1.0, len(full_text) / 1000.0) return OCRResult( raw_text=full_text, confidence=conf, pages=len(pages), backend='tesseract', ) except Exception as e: _logger.warning("Tesseract OCR failed: %s", e) return OCRResult( raw_text='', confidence=0.0, pages=0, backend='tesseract', error=str(e), )