"""Extract searchable tokens from Canadian bank statement memos. Handles common memo formats from RBC, TD, Scotia, BMO, plus generic cheque-number and reference-number patterns. Output is normalized (uppercase, alphanumeric) for case-insensitive matching. """ import re REF_PATTERNS = [ (re.compile(r'\b(REF|REFERENCE)\s*#?\s*(\d+)\b', re.I), r'REF\2'), (re.compile(r'\b(CHQ|CHEQUE|CHECK)\s*#?\s*(\d+)\b', re.I), r'CHEQUE\2'), (re.compile(r'\b(INV|INVOICE)\s*#?\s*(\d+)\b', re.I), r'INV\2'), ] MIN_TOKEN_LENGTH = 2 def tokenize_memo(memo: str | None) -> list[str]: """Return list of normalized tokens from a bank memo. Empty/None input returns []. Order preserved (first occurrence wins for de-duplication).""" if not memo: return [] text = memo.upper() for pattern, replacement in REF_PATTERNS: text = pattern.sub(replacement, text) text = re.sub(r'[^A-Z0-9]+', ' ', text) raw_tokens = text.split() seen = set() tokens = [] for tok in raw_tokens: if len(tok) < MIN_TOKEN_LENGTH: continue if tok in seen: continue seen.add(tok) tokens.append(tok) return tokens