Odoo-Modules/fusion_plating/fusion_plating_certificates/lib/fischerscope_parser.py

# -*- coding: utf-8 -*-
# Copyright 2026 Nexa Systems Inc.
# License OPL-1 (Odoo Proprietary License v1.0)
#
# Fischerscope XDAL 600 thickness-report parser.
#
# Input: bytes of a .docx or .pdf file exported by the gauge.
# Output: dict with `readings` (list of per-reading dicts), `metadata`
# (single dict with equipment/calibration/operator info), and `image`
# (raw bytes of the embedded microscope image, when extractable).
#
# Pure-Python, no Odoo imports. Suitable for direct unit testing.

import io
import logging
import re
from datetime import datetime

# Module-level logger. It is not referenced in the visible part of this file.
_logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Regexes — derived from the real Fischerscope XDAL 600 export layout.
# Sample line:
#   n=    1 NiP 1=   0.6885 mils Ni 1 =   91.323 %  P  1 =    8.6771 %
# Spaces vary; allow flexible whitespace + optional channel digit after NiP/Ni/P.
#
# Named groups: n (reading number), nip (thickness), ni and p (percentages).
# Only re.VERBOSE is set, so the labels and the "mils" unit are matched
# case-sensitively. The numeric groups accept any run of digits and dots;
# the caller is responsible for rejecting malformed numbers such as "1.2.3".
# ---------------------------------------------------------------------------
_READING_RE = re.compile(
    r"""n\s*=\s*(?P<n>\d+)        # reading number
        \s+NiP\s*\d*\s*=\s*       # NiP label (channel number optional)
        (?P<nip>[\d.]+)\s*mils    # NiP thickness in mils
        \s+Ni\s*\d*\s*=\s*        # Ni label
        (?P<ni>[\d.]+)\s*%        # Ni percentage
        \s+P\s*\d*\s*=\s*         # P label
        (?P<p>[\d.]+)\s*%         # P percentage
    """,
    re.VERBOSE,
)

# Equipment model — first occurrence of "Fischerscope" (any letter case)
# anywhere in the text. Captures from that word up to the end of its line;
# text that precedes the word on the same line is not part of the capture.
_EQUIPMENT_RE = re.compile(
    r'(Fischerscope[^\n\r]*)',
    re.IGNORECASE,
)

# Product ref: "Product: 2805031 / NiP/Al-alloys 2805030"
# The colon is mandatory. The capture is lazy and stops at the end of the
# line, so trailing whitespace is left out of the group.
_PRODUCT_RE = re.compile(
    r'Product\s*:\s*([^\n\r]+?)(?:\s*$|\s*\n)',
    re.IGNORECASE | re.MULTILINE,
)

# Calibration set: "Calibr. Std. Set NiP/Al STD SET SN 100174568"
# The dots after "Calibr" and "Std" are optional; the capture is the rest
# of the line after "Set", without trailing whitespace.
_CALIBR_RE = re.compile(
    r'Calibr\.?\s*Std\.?\s*Set\s*([^\n\r]+?)(?:\s*$|\s*\n)',
    re.IGNORECASE | re.MULTILINE,
)

# Measuring time: "Measuring time   120 sec"
# Optional colon after the label; captures the integer number of seconds.
_MEAS_TIME_RE = re.compile(
    r'Measuring\s*time\s*:?\s*(\d+)\s*sec',
    re.IGNORECASE,
)

# Operator: "Operator:  BK"  (initials or short name)
# Stop the capture at: 2+ whitespace, a newline, end-of-string, 2+ digits,
# or end-of-line in multiline mode. The bare "Operator: BK\nDate: ..."
# case (operator name immediately followed by newline + next field) was
# the bug that fell through every other branch.
# The capture must start with a letter and is limited to 41 characters of
# letters, digits, spaces, dots and hyphens.
# NOTE(review): when the next field follows on the same line after a single
# space ("Operator: BK Date: ..."), none of the stop conditions is reached
# before the ':' of the next label, so the pattern does not match at all.
# NOTE(review): the \s* after the colon also matches newlines, so an empty
# "Operator:" line can pick up a value from the following line.
_OPERATOR_RE = re.compile(
    r'Operator\s*:?\s*([A-Za-z][A-Za-z0-9 .\-]{0,40}?)(?=\s{2,}|\n|$|\s*\d{2,})',
    re.IGNORECASE | re.MULTILINE,
)

# Date + Time: "Date:  5/15/2026 Time: 12:24:46 PM"
# Group 1 is the slash-separated date, group 2 the time with an optional
# seconds part and an optional AM/PM marker (which may be attached to the
# time without a space). Only whitespace may separate the date from the
# "Time" label; that whitespace may include a line break.
_DATETIME_RE = re.compile(
    r'Date\s*:?\s*(\d{1,2}/\d{1,2}/\d{2,4})'
    r'\s*Time\s*:?\s*(\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)?)',
    re.IGNORECASE,
)


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def parse_fischerscope_file(filename, content_bytes):
    """Route a Fischerscope thickness report to the parser for its format.

    The file type is decided from the (case-insensitive) end of
    ``filename`` only; the bytes themselves are not sniffed:

      .docx → _parse_docx (python-docx)
      .pdf  → _parse_pdf  (PyPDF2)
      .doc  → rejected with a "re-export" hint
      other → rejected as an unsupported extension (this includes a
              missing/empty filename)

    Args:
      filename:      name of the uploaded file, used for its extension.
      content_bytes: raw file content, handed unchanged to the parser.

    Returns:
      A dict with these keys (see _build_result / _failed_result):
        'success'    bool — True when at least one reading was parsed
        'readings'   list of {'reading_number': int, 'nip_mils': float,
                     'ni_percent': float, 'p_percent': float}
        'metadata'   dict with 'equipment_model', 'product_ref',
                     'calibration_std_ref', 'measuring_time_seconds',
                     'operator_name', 'reading_datetime' (None if missing)
        'image'      bytes of the embedded image, or None
        'image_mime' MIME type reported for that image, or None
        'raw_text'   the text the regexes were run against
        'errors'     list of human-readable problem descriptions

    A rejected extension is reported through the returned dict
    (success=False, one entry in 'errors'), not by raising.
    """
    lowered = (filename or '').lower()

    # Supported formats go straight to their dedicated parser.
    for suffix, handler in (('.docx', _parse_docx), ('.pdf', _parse_pdf)):
        if lowered.endswith(suffix):
            return handler(content_bytes)

    # Everything below is a rejection; only the message differs.
    if lowered.endswith('.doc'):
        message = (
            'Legacy .doc format not supported — re-export from the '
            'gauge as .docx or .pdf. (python-docx reads .docx only; '
            'old binary .doc needs LibreOffice conversion which '
            "isn't installed.)"
        )
    else:
        message = 'Unsupported file extension: %r. Expected .docx or .pdf.' % filename
    return _failed_result(raw_text='', error=message)


# ---------------------------------------------------------------------------
# Internals
# ---------------------------------------------------------------------------

def _parse_docx(content_bytes):
    """Extract text and the first embedded image from a .docx report.

    Returns a failed result when python-docx is missing or the bytes
    cannot be opened as a document; otherwise the extracted text is
    passed to _build_result together with the image (if one was found).
    """
    errors = []
    try:
        import docx  # python-docx
    except ImportError:
        return _failed_result(
            raw_text='',
            error='python-docx not installed — cannot parse .docx files.',
        )
    try:
        document = docx.Document(io.BytesIO(content_bytes))
    except Exception as exc:
        return _failed_result(raw_text='', error='Could not open .docx: %s' % exc)

    # Text: non-empty body paragraphs first, then every table row with
    # its cells joined by two spaces. Readings may live in either place,
    # so both are merged into one newline-separated string for the regexes.
    paragraph_texts = (para.text for para in document.paragraphs)
    chunks = [text for text in paragraph_texts if text]
    for table in document.tables:
        row_texts = (
            '  '.join(cell.text for cell in row.cells) for row in table.rows
        )
        chunks.extend(joined for joined in row_texts if joined.strip())
    raw_text = '\n'.join(chunks)

    # Image: take the first document relationship whose type mentions
    # "image". A failure here is recorded as a warning, not a hard error.
    image_bytes = None
    image_mime = None
    try:
        for rel in document.part.rels.values():
            if 'image' not in (rel.reltype or '').lower():
                continue
            picture = rel.target_part
            image_bytes = picture.blob
            image_mime = picture.content_type
            break
    except Exception as exc:
        errors.append('image extraction failed: %s' % exc)

    return _build_result(raw_text, errors, image_bytes, image_mime)


def _parse_pdf(content_bytes):
    """Parse a .pdf Fischerscope report. Text-based PDFs only.

    Text is extracted page by page; a page whose extraction fails is
    skipped and noted in ``errors``. Image extraction is best-effort:
    any failure is recorded as a warning and the image is dropped.

    Returns a failed result when PyPDF2 is missing or the bytes cannot
    be opened; otherwise the result of _build_result for the extracted
    text, with ``image``/``image_mime`` either both set or both None.
    """
    errors = []
    try:
        from PyPDF2 import PdfReader
    except ImportError:
        return _failed_result(
            raw_text='',
            error='PyPDF2 not installed — cannot parse .pdf files.',
        )
    try:
        reader = PdfReader(io.BytesIO(content_bytes))
    except Exception as e:
        return _failed_result(raw_text='', error='Could not open PDF: %s' % e)

    raw_text_parts = []
    for i, page in enumerate(reader.pages):
        try:
            raw_text_parts.append(page.extract_text() or '')
        except Exception as e:
            errors.append('page %d extract_text failed: %s' % (i + 1, e))
    raw_text = '\n'.join(raw_text_parts)

    # PDF image extraction is unreliable across PDF producers, so any
    # failure only costs the image — the operator still has the original
    # file attached.
    try:
        image_bytes, image_mime = _first_pdf_image(reader)
    except Exception as e:
        errors.append('PDF image extraction failed: %s' % e)
        # Keep bytes and MIME type in step: never report one without the other.
        image_bytes, image_mime = None, None

    return _build_result(raw_text, errors, image_bytes, image_mime)


def _resolve_pdf_object(obj):
    """Return the object behind an indirect reference, or ``obj`` itself."""
    return obj.get_object() if hasattr(obj, 'get_object') else obj


def _pdf_image_mime(filter_entry):
    """Map an image stream's /Filter entry to the MIME type of its data.

    /Filter is either a single name or an array of names; for an array
    the last entry is used.
    NOTE(review): using the last entry assumes get_data() has already
    undone the earlier filters — confirm against the PyPDF2 version in use.

    Only /DCTDecode yields a self-describing file (JPEG). A /FlateDecode
    stream is compressed raw sample data, not a PNG file, so it is
    reported as generic binary rather than mislabelled as image/png.
    """
    filter_entry = _resolve_pdf_object(filter_entry)
    if isinstance(filter_entry, (list, tuple)):
        filter_entry = (
            _resolve_pdf_object(filter_entry[-1]) if filter_entry else None
        )
    if filter_entry == '/DCTDecode':
        return 'image/jpeg'
    return 'application/octet-stream'


def _first_pdf_image(reader):
    """Return ``(data, mime)`` for the first non-empty image XObject.

    Walks each page's /Resources → /XObject dictionary in page order.
    Indirect references are resolved at every level, including
    /Resources itself. Returns ``(None, None)`` when no page carries a
    usable image. Exceptions are left to the caller.
    """
    for page in reader.pages:
        resources = _resolve_pdf_object(page.get('/Resources'))
        if not resources:
            continue
        xobjects = _resolve_pdf_object(resources.get('/XObject'))
        if not xobjects:
            continue
        for obj_name in xobjects:
            obj = _resolve_pdf_object(xobjects[obj_name])
            if obj.get('/Subtype') != '/Image':
                continue
            data = obj.get_data()
            if not data:
                # An empty stream is not a usable image; keep looking.
                continue
            return data, _pdf_image_mime(obj.get('/Filter'))
    return None, None


def _build_result(raw_text, errors, image_bytes, image_mime):
    """Turn extracted text into the parser's result dict.

    Every _READING_RE match in ``raw_text`` becomes one reading; a match
    whose numbers cannot be converted is skipped and described in
    ``errors`` (the caller's list is extended in place and returned).
    Metadata fields come from the module-level patterns and are None
    when not found. ``success`` is True only if a reading was parsed.
    """
    parsed = []
    for match in _READING_RE.finditer(raw_text):
        number, nip, ni, p = match.group('n', 'nip', 'ni', 'p')
        try:
            entry = {
                'reading_number': int(number),
                'nip_mils': float(nip),
                'ni_percent': float(ni),
                'p_percent': float(p),
            }
        except (ValueError, TypeError) as exc:
            errors.append(
                'reading parse error at offset %d: %s' % (match.start(), exc)
            )
        else:
            parsed.append(entry)

    details = {
        'equipment_model': _capture(_EQUIPMENT_RE, raw_text),
        'product_ref': _capture(_PRODUCT_RE, raw_text),
        'calibration_std_ref': _capture(_CALIBR_RE, raw_text),
        'measuring_time_seconds': _capture_int(_MEAS_TIME_RE, raw_text),
        'operator_name': _capture(_OPERATOR_RE, raw_text),
        'reading_datetime': _capture_datetime(raw_text),
    }

    return dict(
        success=bool(parsed),
        readings=parsed,
        metadata=details,
        image=image_bytes,
        image_mime=image_mime,
        raw_text=raw_text,
        errors=errors,
    )


def _failed_result(raw_text, error):
    """Build the result dict for an input that could not be parsed.

    The dict has the same keys as a successful result, with no readings,
    all metadata fields None and no image. ``error`` becomes the single
    entry of ``errors``; a falsy ``error`` gives an empty list.
    """
    metadata_keys = (
        'equipment_model',
        'product_ref',
        'calibration_std_ref',
        'measuring_time_seconds',
        'operator_name',
        'reading_datetime',
    )
    problems = []
    if error:
        problems.append(error)
    return dict(
        success=False,
        readings=[],
        metadata=dict.fromkeys(metadata_keys),
        image=None,
        image_mime=None,
        raw_text=raw_text,
        errors=problems,
    )


def _capture(rx, text):
    """Return group 1 of the first ``rx`` match in ``text``, stripped.

    A falsy ``text`` is searched as the empty string. Returns None when
    there is no match or when the captured text is empty after stripping.
    """
    match = rx.search(text or '')
    if match is None:
        return None
    stripped = match.group(1).strip()
    return stripped if stripped else None


def _capture_int(rx, text):
    """Return group 1 of the first ``rx`` match in ``text`` as an int.

    A falsy ``text`` is searched as the empty string. Returns None when
    there is no match or the captured value cannot be converted.
    """
    found = rx.search(text or '')
    if found is not None:
        try:
            return int(found.group(1))
        except (ValueError, TypeError):
            pass
    return None


def _capture_datetime(text):
    """Return the report's "Date: ... Time: ..." stamp as a naive datetime.

    The first _DATETIME_RE match in ``text`` is parsed; a falsy ``text``
    is treated as empty. Returns None when there is no match or when no
    supported format fits.

    Dates are tried month-first (MM/DD) before day-first (DD/MM), each
    with a 4-digit and then a 2-digit year, so an ambiguous date such as
    3/4/2026 is read as March 4th. Times may be 12-hour with AM/PM or
    24-hour, with or without seconds.
    """
    m = _DATETIME_RE.search(text or '')
    if not m:
        return None
    date_str, time_str = m.group(1).strip(), m.group(2).strip()
    stamp = '%s %s' % (date_str, time_str)

    date_formats = ('%m/%d/%Y', '%m/%d/%y', '%d/%m/%Y', '%d/%m/%y')
    time_formats = (
        '%I:%M:%S %p', '%I:%M %p',
        # _DATETIME_RE also accepts the marker glued to the time
        # ("12:24:46PM"). A space in a strptime format needs at least
        # one whitespace character in the input, so the attached form
        # needs its own formats or it would be silently dropped.
        '%I:%M:%S%p', '%I:%M%p',
        '%H:%M:%S', '%H:%M',
    )
    for date_fmt in date_formats:
        for time_fmt in time_formats:
            try:
                return datetime.strptime(stamp, '%s %s' % (date_fmt, time_fmt))
            except ValueError:
                continue
    return None