diff --git a/fusion_accounting_bank_rec/services/__init__.py b/fusion_accounting_bank_rec/services/__init__.py index e69de29b..25962b99 100644 --- a/fusion_accounting_bank_rec/services/__init__.py +++ b/fusion_accounting_bank_rec/services/__init__.py @@ -0,0 +1 @@ +from . import memo_tokenizer diff --git a/fusion_accounting_bank_rec/services/memo_tokenizer.py b/fusion_accounting_bank_rec/services/memo_tokenizer.py new file mode 100644 index 00000000..92166995 --- /dev/null +++ b/fusion_accounting_bank_rec/services/memo_tokenizer.py @@ -0,0 +1,44 @@ +"""Extract searchable tokens from Canadian bank statement memos. + +Handles common memo formats from RBC, TD, Scotia, BMO, plus generic +cheque-number and reference-number patterns. Output is normalized +(uppercase, alphanumeric) for case-insensitive matching. +""" + +import re + +REF_PATTERNS = [ + (re.compile(r'\b(REF|REFERENCE)\s*#?\s*(\d+)\b', re.I), r'REF\2'), + (re.compile(r'\b(CHQ|CHEQUE|CHECK)\s*#?\s*(\d+)\b', re.I), r'CHEQUE\2'), + (re.compile(r'\b(INV|INVOICE)\s*#?\s*(\d+)\b', re.I), r'INV\2'), +] + +MIN_TOKEN_LENGTH = 2 + + +def tokenize_memo(memo: str | None) -> list[str]: + """Return list of normalized tokens from a bank memo. + + Empty/None input returns []. Order preserved (first occurrence wins + for de-duplication).""" + if not memo: + return [] + + text = memo.upper() + for pattern, replacement in REF_PATTERNS: + text = pattern.sub(replacement, text) + + text = re.sub(r'[^A-Z0-9]+', ' ', text) + raw_tokens = text.split() + + seen = set() + tokens = [] + for tok in raw_tokens: + if len(tok) < MIN_TOKEN_LENGTH: + continue + if tok in seen: + continue + seen.add(tok) + tokens.append(tok) + + return tokens diff --git a/fusion_accounting_bank_rec/tests/__init__.py b/fusion_accounting_bank_rec/tests/__init__.py index e69de29b..2769b296 100644 --- a/fusion_accounting_bank_rec/tests/__init__.py +++ b/fusion_accounting_bank_rec/tests/__init__.py @@ -0,0 +1 @@ +from . 
from odoo.tests.common import TransactionCase, tagged
from odoo.addons.fusion_accounting_bank_rec.services.memo_tokenizer import tokenize_memo


@tagged('post_install', '-at_install')
class TestMemoTokenizer(TransactionCase):
    """Checks on the tokens ``tokenize_memo`` produces for sample memos."""

    def test_extracts_rbc_etf_reference(self):
        """A "REF <number>" pair is merged into a single REF token."""
        result = tokenize_memo("RBC ETF DEP REF 4831")
        for expected in ('RBC', 'ETF', 'REF4831'):
            self.assertIn(expected, result)

    def test_extracts_cheque_number(self):
        """A "CHEQUE <number>" pair is merged; other words are kept."""
        result = tokenize_memo("CHEQUE 4827 - WESTIN PLATING")
        for expected in ('CHEQUE4827', 'WESTIN', 'PLATING'):
            self.assertIn(expected, result)

    def test_strips_noise_tokens(self):
        """Separators and single-character fragments are not returned."""
        result = tokenize_memo("PAYMENT - INV - DEP - 12345")
        self.assertNotIn('-', result)
        too_short = [tok for tok in result if len(tok) <= 1]
        self.assertEqual(too_short, [])

    def test_handles_empty_memo(self):
        """Both the empty string and None give an empty list."""
        for blank in ("", None):
            self.assertEqual(tokenize_memo(blank), [])

    def test_canadian_french_memo(self):
        """Unaccented French words are returned as tokens."""
        result = tokenize_memo("PAIEMENT VIREMENT BANCAIRE")
        for expected in ('PAIEMENT', 'VIREMENT'):
            self.assertIn(expected, result)

    def test_normalises_case(self):
        """Lowercase input is uppercased."""
        result = tokenize_memo("rbc etf dep ref 4831")
        self.assertIn('RBC', result)

    def test_handles_special_characters(self):
        """Punctuation acts as a separator and "REF#<number>" is merged."""
        result = tokenize_memo("RBC*PAYMENT/REF#4831")
        for expected in ('RBC', 'PAYMENT', 'REF4831'):
            self.assertIn(expected, result)