# -*- coding: utf-8 -*-
# Copyright 2026 Nexa Systems Inc.
# License OPL-1 (Odoo Proprietary License v1.0)
#
# Fischerscope XDAL 600 thickness-report parser.
#
# Input: bytes of a .docx or .pdf file exported by the gauge.
# Output: dict with `readings` (list of per-reading dicts), `metadata`
# (single dict with equipment/calibration/operator info), and `image`
# (raw bytes of the embedded microscope image, when extractable).
#
# Pure-Python, no Odoo imports. Suitable for direct unit testing.

import io
import logging
import re
from datetime import datetime

_logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Regexes — derived from the real Fischerscope XDAL 600 export layout.
# Sample line:
#   n= 1 NiP 1= 0.6885 mils Ni 1 = 91.323 % P 1 = 8.6771 %
# Spaces vary; allow flexible whitespace + optional channel digit after
# NiP/Ni/P.
# ---------------------------------------------------------------------------

# The group names (n, nip, ni, p) are read back by name in _build_result,
# so they must stay in sync with it.
_READING_RE = re.compile(
    r"""n\s*=\s*(?P<n>\d+)          # reading number
        \s+NiP\s*\d*\s*=\s*         # NiP label (channel number optional)
        (?P<nip>[\d.]+)\s*mils      # NiP thickness in mils
        \s+Ni\s*\d*\s*=\s*          # Ni label
        (?P<ni>[\d.]+)\s*%          # Ni percentage
        \s+P\s*\d*\s*=\s*           # P label
        (?P<p>[\d.]+)\s*%           # P percentage
    """,
    re.VERBOSE,
)

# Equipment model — first occurrence of "Fischerscope" (any case).
# Captures everything from there up to the end of the line.
_EQUIPMENT_RE = re.compile(
    r'(Fischerscope[^\n\r]*)',
    re.IGNORECASE,
)

# Product ref: "Product: 2805031 / NiP/Al-alloys 2805030"
_PRODUCT_RE = re.compile(
    r'Product\s*:\s*([^\n\r]+?)(?:\s*$|\s*\n)',
    re.IGNORECASE | re.MULTILINE,
)

# Calibration set: "Calibr. Std. Set NiP/Al STD SET SN 100174568"
_CALIBR_RE = re.compile(
    r'Calibr\.?\s*Std\.?\s*Set\s*([^\n\r]+?)(?:\s*$|\s*\n)',
    re.IGNORECASE | re.MULTILINE,
)

# Measuring time: "Measuring time 120 sec"
_MEAS_TIME_RE = re.compile(
    r'Measuring\s*time\s*:?\s*(\d+)\s*sec',
    re.IGNORECASE,
)

# Operator: "Operator: BK" (initials or short name)
# Stop the capture at: 2+ whitespace, a newline, end-of-string, 2+ digits,
# or end-of-line in multiline mode. The bare "Operator: BK\nDate: ..."
# case (operator name immediately followed by newline + next field) was
# the bug that fell through every other branch.
_OPERATOR_RE = re.compile(
    r'Operator\s*:?\s*([A-Za-z][A-Za-z0-9 .\-]{0,40}?)(?=\s{2,}|\n|$|\s*\d{2,})',
    re.IGNORECASE | re.MULTILINE,
)

# Date + Time: "Date: 5/15/2026 Time: 12:24:46 PM"
_DATETIME_RE = re.compile(
    r'Date\s*:?\s*(\d{1,2}/\d{1,2}/\d{2,4})'
    r'\s*Time\s*:?\s*(\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)?)',
    re.IGNORECASE,
)

# Metadata keys, in the order they appear in every result dict.
_METADATA_KEYS = (
    'equipment_model',
    'product_ref',
    'calibration_std_ref',
    'measuring_time_seconds',
    'operator_name',
    'reading_datetime',
)


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def parse_fischerscope_file(filename, content_bytes):
    """Parse a Fischerscope thickness report.

    Branches on file extension (case-insensitive):
        .docx → python-docx (paragraphs + tables, image via relationships)
        .pdf  → PyPDF2 (text per page; image extraction best-effort)

    Returns:
        {
            'success': bool,      # True if at least one reading was parsed
            'readings': [         # list of per-reading dicts
                {'reading_number': int, 'nip_mils': float,
                 'ni_percent': float, 'p_percent': float},
                ...
            ],
            'metadata': {         # may have None values for missing keys
                'equipment_model': str | None,
                'product_ref': str | None,
                'calibration_std_ref': str | None,
                'measuring_time_seconds': int | None,
                'operator_name': str | None,
                'reading_datetime': datetime | None,
            },
            'image': bytes | None,      # microscope image, if extractable
            'image_mime': str | None,   # image/jpeg, image/png, etc.
            'raw_text': str,            # extracted text (for debug / fallback)
            'errors': [str],            # non-fatal warnings encountered
        }

    Unsupported extensions, missing parser libraries and files that cannot
    be opened all return success=False with readings=[] and a message in
    `errors`; they do not raise. Errors raised while walking an
    already-opened .docx (paragraphs/tables) or while enumerating PDF pages
    are not caught here.
    """
    name = (filename or '').lower()
    if name.endswith('.docx'):
        return _parse_docx(content_bytes)
    if name.endswith('.pdf'):
        return _parse_pdf(content_bytes)
    if name.endswith('.doc'):
        return _failed_result(
            raw_text='',
            error=(
                'Legacy .doc format not supported — re-export from the '
                'gauge as .docx or .pdf. (python-docx reads .docx only; '
                'old binary .doc needs LibreOffice conversion which '
                "isn't installed.)"
            ),
        )
    return _failed_result(
        raw_text='',
        error='Unsupported file extension: %r. Expected .docx or .pdf.' % filename,
    )


# ---------------------------------------------------------------------------
# Internals
# ---------------------------------------------------------------------------

def _parse_docx(content_bytes):
    """Parse a .docx Fischerscope report.

    Returns the standard result dict (see parse_fischerscope_file). A
    missing python-docx install or an unreadable document yields a failed
    result instead of an exception.
    """
    errors = []
    try:
        import docx  # python-docx
    except ImportError:
        return _failed_result(
            raw_text='',
            error='python-docx not installed — cannot parse .docx files.',
        )

    try:
        doc = docx.Document(io.BytesIO(content_bytes))
    except Exception as e:
        return _failed_result(raw_text='', error='Could not open .docx: %s' % e)

    # Build the raw text by walking paragraphs AND tables. Fischerscope
    # exports vary — sometimes the readings are in a table, sometimes
    # in justified paragraphs. Joining everything gives the regex a
    # stable target.
    parts = [para.text for para in doc.paragraphs if para.text]
    for tbl in doc.tables:
        for row in tbl.rows:
            row_text = ' '.join(cell.text for cell in row.cells)
            if row_text.strip():
                parts.append(row_text)
    raw_text = '\n'.join(parts)

    # Image: walk the document part's relationships and pick the first
    # image one. The Fischerscope export embeds exactly one microscope
    # image per report.
    image_bytes = None
    image_mime = None
    try:
        for rel in doc.part.rels.values():
            if 'image' not in (rel.reltype or '').lower():
                continue
            # Linked (external) images have no embedded part to read;
            # skip them rather than aborting the whole search.
            if getattr(rel, 'is_external', False):
                continue
            img_part = rel.target_part
            # Assign both together so a failure never leaves bytes
            # without a mime type (or vice versa).
            image_bytes, image_mime = img_part.blob, img_part.content_type
            break
    except Exception as e:
        errors.append('image extraction failed: %s' % e)
        image_bytes = image_mime = None

    return _build_result(raw_text, errors, image_bytes, image_mime)


def _extract_pdf_image(reader):
    """Return ``(image_bytes, image_mime)`` for the first non-empty image.

    Walks each page's /Resources → /XObject dictionary looking for entries
    whose /Subtype is /Image. Returns ``(None, None)`` when nothing usable
    is found. Exceptions propagate to the caller, which treats them as a
    non-fatal warning.
    """
    for page in reader.pages:
        resources = page.get('/Resources')
        if not resources:
            continue
        # Either dictionary may be an indirect reference; resolve it.
        if hasattr(resources, 'get_object'):
            resources = resources.get_object()
        xobjects = resources.get('/XObject')
        if not xobjects:
            continue
        if hasattr(xobjects, 'get_object'):
            xobjects = xobjects.get_object()
        for obj_name in xobjects:
            obj = xobjects[obj_name]
            if hasattr(obj, 'get_object'):
                obj = obj.get_object()
            if obj.get('/Subtype') != '/Image':
                continue
            data = obj.get_data()
            if not data:
                continue
            pdf_filter = obj.get('/Filter')
            if pdf_filter == '/DCTDecode':
                mime = 'image/jpeg'
            elif pdf_filter == '/FlateDecode':
                # NOTE(review): a FlateDecode stream presumably decodes to
                # raw pixel samples rather than a PNG container — confirm
                # that consumers of `image` can handle this.
                mime = 'image/png'
            else:
                mime = 'application/octet-stream'
            return data, mime
    return None, None


def _parse_pdf(content_bytes):
    """Parse a .pdf Fischerscope report. Text-based PDFs only.

    Returns the standard result dict (see parse_fischerscope_file). A
    missing PyPDF2 install or an unreadable PDF yields a failed result
    instead of an exception; per-page text failures are recorded in
    `errors` and the remaining pages are still processed.
    """
    errors = []
    try:
        from PyPDF2 import PdfReader
    except ImportError:
        return _failed_result(
            raw_text='',
            error='PyPDF2 not installed — cannot parse .pdf files.',
        )

    try:
        reader = PdfReader(io.BytesIO(content_bytes))
    except Exception as e:
        return _failed_result(raw_text='', error='Could not open PDF: %s' % e)

    raw_text_parts = []
    for i, page in enumerate(reader.pages):
        try:
            raw_text_parts.append(page.extract_text() or '')
        except Exception as e:
            errors.append('page %d extract_text failed: %s' % (i + 1, e))
    raw_text = '\n'.join(raw_text_parts)

    # PDF image extraction is unreliable across PDF producers. Best-
    # effort: if anything fails, drop the image and record a warning —
    # the operator still has the original file attached.
    try:
        image_bytes, image_mime = _extract_pdf_image(reader)
    except Exception as e:
        errors.append('PDF image extraction failed: %s' % e)
        image_bytes = image_mime = None

    return _build_result(raw_text, errors, image_bytes, image_mime)


def _build_result(raw_text, errors, image_bytes, image_mime):
    """Run the regex extractor over raw_text and assemble the result dict.

    `errors` is extended in place with one message per reading whose
    numeric fields cannot be converted; such readings are skipped.
    `success` is True only when at least one reading was parsed.
    """
    readings = []
    for m in _READING_RE.finditer(raw_text):
        try:
            readings.append({
                'reading_number': int(m.group('n')),
                'nip_mils': float(m.group('nip')),
                'ni_percent': float(m.group('ni')),
                'p_percent': float(m.group('p')),
            })
        except (ValueError, TypeError) as e:
            # [\d.]+ also matches things like "1.2.3"; report and move on.
            errors.append('reading parse error at offset %d: %s' % (m.start(), e))

    metadata = {
        'equipment_model': _capture(_EQUIPMENT_RE, raw_text),
        'product_ref': _capture(_PRODUCT_RE, raw_text),
        'calibration_std_ref': _capture(_CALIBR_RE, raw_text),
        'measuring_time_seconds': _capture_int(_MEAS_TIME_RE, raw_text),
        'operator_name': _capture(_OPERATOR_RE, raw_text),
        'reading_datetime': _capture_datetime(raw_text),
    }

    return {
        'success': bool(readings),
        'readings': readings,
        'metadata': metadata,
        'image': image_bytes,
        'image_mime': image_mime,
        'raw_text': raw_text,
        'errors': errors,
    }


def _failed_result(raw_text, error):
    """Return a success=False result dict carrying a single error message.

    All metadata values are None; `errors` is empty when `error` is falsy.
    """
    return {
        'success': False,
        'readings': [],
        'metadata': dict.fromkeys(_METADATA_KEYS),
        'image': None,
        'image_mime': None,
        'raw_text': raw_text,
        'errors': [error] if error else [],
    }


def _capture(rx, text):
    """Return the stripped first group of rx's first match, or None.

    None is also returned when the captured text is empty after stripping.
    """
    m = rx.search(text or '')
    if not m:
        return None
    val = m.group(1).strip()
    return val or None


def _capture_int(rx, text):
    """Return the first group of rx's first match as an int, or None."""
    m = rx.search(text or '')
    if not m:
        return None
    try:
        return int(m.group(1))
    except (ValueError, TypeError):
        return None


def _capture_datetime(text):
    """Return the report's date/time as a naive datetime, or None.

    Month-first formats are tried before day-first ones, so an ambiguous
    date such as 5/6/2026 is read as May 6th.
    """
    m = _DATETIME_RE.search(text or '')
    if not m:
        return None
    date_str, time_str = m.group(1).strip(), m.group(2).strip()
    # _DATETIME_RE accepts "12:24PM" with no space, but strptime needs
    # whitespace before %p — normalise to exactly one space.
    time_str = re.sub(r'\s*([AP]M)$', r' \1', time_str, flags=re.IGNORECASE)
    stamp = '%s %s' % (date_str, time_str)
    # Try a few likely formats; the gauge can emit either MM/DD/YYYY or
    # M/D/YY plus 12h or 24h.
    for date_fmt in ('%m/%d/%Y', '%m/%d/%y', '%d/%m/%Y', '%d/%m/%y'):
        for time_fmt in ('%I:%M:%S %p', '%I:%M %p', '%H:%M:%S', '%H:%M'):
            try:
                return datetime.strptime(stamp, '%s %s' % (date_fmt, time_fmt))
            except ValueError:
                continue
    return None