feat(fusion_accounting_bank_rec): memo_tokenizer for Canadian bank memo formats
Made-with: Cursor
This commit is contained in:
@@ -0,0 +1 @@
|
||||
from . import memo_tokenizer
|
||||
|
||||
44
fusion_accounting_bank_rec/services/memo_tokenizer.py
Normal file
44
fusion_accounting_bank_rec/services/memo_tokenizer.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""Extract searchable tokens from Canadian bank statement memos.
|
||||
|
||||
Handles common memo formats from RBC, TD, Scotia, BMO, plus generic
|
||||
cheque-number and reference-number patterns. Output is normalized
|
||||
(uppercase, alphanumeric) for case-insensitive matching.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
# Rewrite rules applied in order to a memo. Each entry pairs a
# case-insensitive matcher for "<keyword> [#] <digits>" with a replacement
# template that glues one canonical keyword directly onto the digits
# (capture group 2), so spelling variants of the keyword collapse to the
# same single token.
REF_PATTERNS = [
    (re.compile(regex, re.IGNORECASE), canonical)
    for regex, canonical in (
        (r'\b(REF|REFERENCE)\s*#?\s*(\d+)\b', r'REF\2'),
        (r'\b(CHQ|CHEQUE|CHECK)\s*#?\s*(\d+)\b', r'CHEQUE\2'),
        (r'\b(INV|INVOICE)\s*#?\s*(\d+)\b', r'INV\2'),
    )
]

# Tokens shorter than this many characters are discarded by the tokenizer.
MIN_TOKEN_LENGTH = 2
|
||||
|
||||
|
||||
def tokenize_memo(memo: str | None) -> list[str]:
    """Return list of normalized tokens from a bank memo.

    Processing steps:

    1. Accents are folded to their base letters and the text is uppercased.
    2. Every ``REF_PATTERNS`` rewrite is applied in order.
    3. Each run of characters outside ``A-Z`` / ``0-9`` becomes a separator.
    4. Tokens shorter than ``MIN_TOKEN_LENGTH`` are dropped and duplicates
       are removed.

    Any falsy input (``None``, ``""``) returns ``[]``. Order is preserved
    (first occurrence wins for de-duplication).
    """
    if not memo:
        return []

    # Imported locally so this function does not depend on a module-level
    # import being added elsewhere in the file.
    import unicodedata

    # Fold accents before the ASCII-only filter below. Without this an
    # accented letter is treated as a separator and splits the word in two
    # ("MONTRÉAL" -> "MONTR", "AL"). NFKD separates base letters from their
    # combining marks; dropping the marks keeps the base letter. Plain ASCII
    # text passes through unchanged.
    decomposed = unicodedata.normalize('NFKD', memo)
    text = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    ).upper()

    for pattern, replacement in REF_PATTERNS:
        text = pattern.sub(replacement, text)

    candidates = re.sub(r'[^A-Z0-9]+', ' ', text).split()

    # dict.fromkeys keeps insertion order, so the first occurrence of each
    # token wins and later repeats are discarded.
    return list(dict.fromkeys(
        tok for tok in candidates if len(tok) >= MIN_TOKEN_LENGTH
    ))
|
||||
Reference in New Issue
Block a user