# Odoo-Modules/fusion_accounting_ocr/services/ocr_providers/tesseract_adapter.py

"""Tesseract OCR adapter.

Uses the system tesseract binary via pytesseract, with poppler-backed
PDF rendering via pdf2image. Inside the container these are pre-installed:
- tesseract-ocr 5.3.4
- pytesseract 0.3.13
- pdf2image 1.17.0
- poppler-utils
"""

import io
import logging

from .base import OCRProvider, OCRResult

_logger = logging.getLogger(__name__)


class TesseractAdapter(OCRProvider):
    """OCR provider that runs Tesseract through pytesseract.

    PDF input is rendered to page images with pdf2image and each page is
    OCR'd in turn; any other input is opened with Pillow and OCR'd as a
    single image.
    """

    name = 'tesseract'

    # Resolution (dots per inch) passed to pdf2image when rendering PDFs.
    PDF_RENDER_DPI = 200
    # Amount of recognised text (in characters) that maps to confidence 1.0.
    FULL_CONFIDENCE_CHARS = 1000

    @classmethod
    def is_available(cls) -> bool:
        """Report whether this adapter's dependencies can be used.

        Returns:
            True when pytesseract, pdf2image and Pillow import cleanly and
            pytesseract can report a tesseract version; False otherwise.
            The reason for a False result is logged at debug level.
        """
        try:
            import pytesseract
            from pdf2image import convert_from_bytes  # noqa: F401
            from PIL import Image  # noqa: F401
            pytesseract.get_tesseract_version()
            return True
        except Exception as e:
            _logger.debug("TesseractAdapter not available: %s", e)
            return False

    def extract(
        self,
        image_or_pdf_bytes: bytes,
        *,
        mimetype: str = 'application/pdf',
    ) -> OCRResult:
        """OCR a PDF or an image and return the recognised text.

        Args:
            image_or_pdf_bytes: Raw content of the document to process.
            mimetype: Declared content type. The input is handled as a PDF
                when this is ``'application/pdf'`` or when the data starts
                with the ``%PDF`` signature; otherwise it is opened as an
                image.

        Returns:
            An ``OCRResult`` whose ``raw_text`` is the per-page text joined
            with ``'\\n\\f\\n'``, ``pages`` the number of page images
            processed, and ``confidence`` a length-based heuristic in
            ``[0.0, 1.0]``. If anything fails (including a missing
            dependency), the error is logged as a warning and an
            ``OCRResult`` with empty text, zero confidence, zero pages and
            ``error`` set to the exception message is returned instead.
        """
        pages = []
        try:
            # Imported inside the try so that a missing or broken dependency
            # is reported through OCRResult.error like every other failure
            # instead of escaping as an exception.
            import pytesseract
            from pdf2image import convert_from_bytes
            from PIL import Image

            is_pdf = (
                mimetype == 'application/pdf'
                or image_or_pdf_bytes[:4] == b'%PDF'
            )
            if is_pdf:
                pages = convert_from_bytes(
                    image_or_pdf_bytes, dpi=self.PDF_RENDER_DPI,
                )
            else:
                pages = [Image.open(io.BytesIO(image_or_pdf_bytes))]

            texts = [pytesseract.image_to_string(page) for page in pages]
            full_text = '\n\f\n'.join(texts)

            # Heuristic confidence - tesseract has a per-word conf in
            # image_to_data, but a length proxy is fine for routing
            # decisions. Only the stripped per-page text is measured, so
            # the page separators and whitespace-only pages cannot raise
            # the score of a document in which nothing was recognised.
            # Future: use pytesseract.image_to_data for a real average
            # word-level confidence.
            recognised_chars = sum(len(text.strip()) for text in texts)
            conf = min(1.0, recognised_chars / float(self.FULL_CONFIDENCE_CHARS))
            return OCRResult(
                raw_text=full_text,
                confidence=conf,
                pages=len(pages),
                backend='tesseract',
            )
        except Exception as e:
            _logger.warning("Tesseract OCR failed: %s", e)
            return OCRResult(
                raw_text='', confidence=0.0, pages=0,
                backend='tesseract', error=str(e),
            )
        finally:
            # Release the image handles once OCR is done. A failure to close
            # must never replace the result being returned above.
            for page in pages:
                try:
                    page.close()
                except Exception as close_error:
                    _logger.debug(
                        "Could not close OCR page image: %s", close_error,
                    )