From 07a877da66eababf7be4c472459bb03ab5016f77 Mon Sep 17 00:00:00 2001 From: Di Wu Date: Thu, 6 Aug 2020 06:46:06 -0700 Subject: [PATCH] Add ABA routing number recognizer (#348) * aba routing number with checksum digit * Update test_aba_routing_recognizer.py Co-authored-by: Omri Mendels --- .../predefined_recognizers/__init__.py | 6 +- .../aba_routing_recognizer.py | 60 +++++++++++++++++++ .../tests/data/context_sentences_tests.txt | 5 +- .../tests/test_aba_routing_recognizer.py | 38 ++++++++++++ .../tests/test_context_support.py | 13 ++-- 5 files changed, 115 insertions(+), 7 deletions(-) create mode 100644 presidio-analyzer/presidio_analyzer/predefined_recognizers/aba_routing_recognizer.py create mode 100644 presidio-analyzer/tests/test_aba_routing_recognizer.py diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py index c70135c8c..f8285c6e5 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py @@ -1,3 +1,4 @@ +from .aba_routing_recognizer import AbaRoutingRecognizer from .credit_card_recognizer import CreditCardRecognizer from .crypto_recognizer import CryptoRecognizer from .domain_recognizer import DomainRecognizer @@ -18,19 +19,20 @@ NLP_RECOGNIZERS = {"spacy": SpacyRecognizer, "stanza": StanzaRecognizer} __all__ = [ + "AbaRoutingRecognizer", "CreditCardRecognizer", "CryptoRecognizer", "DomainRecognizer", "EmailRecognizer", "IbanRecognizer", "IpRecognizer", + "NhsRecognizer", "SgFinRecognizer", "SpacyRecognizer", "StanzaRecognizer", - "NhsRecognizer", "UsBankRecognizer", - "UsLicenseRecognizer", "UsItinRecognizer", + "UsLicenseRecognizer", "UsPassportRecognizer", "UsPhoneRecognizer", "UsSsnRecognizer", diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/aba_routing_recognizer.py 
from presidio_analyzer import Pattern, PatternRecognizer


class AbaRoutingRecognizer(PatternRecognizer):
    """
    Recognizes American Banking Association (ABA) routing number.

    Also known as routing transit number (RTN) and used to identify financial
    institutions and process transactions.

    Candidates are located with the regular expressions in ``PATTERNS`` and
    then checked with the ABA check-digit rule in ``validate_result``.

    :param patterns: Patterns to use instead of ``PATTERNS`` (used when
        ``None`` or empty).
    :param context: Context words to use instead of ``CONTEXT`` (used when
        ``None`` or empty).
    :param supported_language: Language code passed to the base recognizer.
    :param supported_entity: Entity name passed to the base recognizer.
    :param replacement_pairs: ``(search, replacement)`` string pairs applied
        to a match before the checksum is computed. Defaults to removing
        dashes.
    """

    PATTERNS = [
        # Nine consecutive digits: very common in free text, hence the low score.
        Pattern("ABA routing number (weak)", r"\b[0123678]\d{8}\b", 0.05),
        # Dash-separated 4-4-1 form.
        Pattern("ABA routing number", r"\b[0123678]\d{3}-\d{4}-\d\b", 0.3),
    ]

    CONTEXT = [
        "aba",
        "routing",
        "abarouting",
        "association",
        "bankrouting",
    ]

    # Per-position multipliers of the ABA check-digit rule; the weighted sum
    # of the nine digits must be a multiple of 10.
    _CHECKSUM_WEIGHTS = (3, 7, 1, 3, 7, 1, 3, 7, 1)

    def __init__(
        self,
        patterns=None,
        context=None,
        supported_language="en",
        supported_entity="ABA_ROUTING_NUMBER",
        replacement_pairs=None,
    ):
        self.replacement_pairs = replacement_pairs or [("-", "")]
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text):
        """
        Check a matched text against the ABA check-digit rule.

        :param pattern_text: The text matched by one of the patterns.
        :return: ``True`` if, after applying ``replacement_pairs``, the text
            is exactly nine digits with a valid checksum; ``False`` otherwise.
        """
        sanitized_value = self.__sanitize_value(
            pattern_text, self.replacement_pairs
        )
        return self.__checksum(sanitized_value)

    @staticmethod
    def __checksum(sanitized_value):
        """
        Return whether ``sanitized_value`` satisfies the ABA checksum.

        Anything that is not exactly nine decimal digits is rejected instead
        of raising ``IndexError``/``ValueError``; this can happen when custom
        patterns or replacement pairs leave separators in the value.
        """
        weights = AbaRoutingRecognizer._CHECKSUM_WEIGHTS
        if len(sanitized_value) != len(weights):
            return False
        # isdecimal() only accepts characters that int() can convert.
        if not sanitized_value.isdecimal():
            return False
        total = sum(
            int(digit) * weight
            for digit, weight in zip(sanitized_value, weights)
        )
        return total % 10 == 0

    @staticmethod
    def __sanitize_value(text, replacement_pairs):
        """Apply each ``(search, replacement)`` pair to ``text`` in order."""
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text
import pytest

from tests import assert_result
from presidio_analyzer.predefined_recognizers import AbaRoutingRecognizer


@pytest.fixture(scope="module")
def recognizer():
    """Provide one AbaRoutingRecognizer instance for the whole module."""
    aba_recognizer = AbaRoutingRecognizer()
    return aba_recognizer


@pytest.fixture(scope="module")
def entities():
    """Provide the list of entity names requested from the recognizer."""
    return ["ABA_ROUTING_NUMBER"]


@pytest.mark.parametrize(
    "text, expected_len, expected_positions, expected_score",
    [
        # Bank of America
        ("121000358", 1, ((0, 9),), 1.0),
        # Chase
        ("3222-7162-7", 1, ((0, 11),), 1.0),
        # Wells Fargo
        ("121042882", 1, ((0, 9),), 1.0),
        ("0711-0130-7", 1, ((0, 11),), 1.0),
        # invalid ABA numbers
        ("421042111", 0, (), -1.0),
        ("1234-0000-0", 0, (), -1.0),
    ],
)
def test_aba_routing_numbers(
    text, expected_len, expected_positions, expected_score, recognizer, entities
):
    """Check the number, span and score of results for each sample text."""
    found = recognizer.analyze(text, entities)
    assert len(found) == expected_len

    entity_name = entities[0]
    for result, span in zip(found, expected_positions):
        start, end = span
        assert_result(result, entity_name, start, end, expected_score)
a/presidio-analyzer/tests/test_context_support.py +++ b/presidio-analyzer/tests/test_context_support.py @@ -3,6 +3,7 @@ from presidio_analyzer import PatternRecognizer, Pattern from presidio_analyzer.predefined_recognizers import ( + AbaRoutingRecognizer, # CreditCardRecognizer, UsPhoneRecognizer, # DomainRecognizer, @@ -23,6 +24,7 @@ def recognizers(): "IP_ADDRESS": IpRecognizer(), "US_SSN": UsSsnRecognizer(), "PHONE_NUMBER": UsPhoneRecognizer(), + "ABA_ROUTING_NUMBER": AbaRoutingRecognizer(), "US_ITIN": UsItinRecognizer(), "US_DRIVER_LICENSE": UsLicenseRecognizer(), "US_BANK_NUMBER": UsBankRecognizer(), @@ -58,9 +60,9 @@ def dataset(recognizers): raise ValueError(f"bad entity type {entity_type}") test_items.append((item, recognizer, [entity_type])) - # Currently we have 27 sentences, this is a sanity check - if not len(test_items) == 27: - raise ValueError(f"expected 27 context sentences but found {len(test_items)}") + # Currently we have 28 sentences, this is a sanity check + if not len(test_items) == 28: + raise ValueError(f"expected 28 context sentences but found {len(test_items)}") yield test_items @@ -79,7 +81,10 @@ def test_text_with_context_improves_score(dataset, nlp_engine, mock_nlp_artifact assert len(results_without_context) == len(results_with_context) for res_wo, res_w in zip(results_without_context, results_with_context): - assert res_wo.score < res_w.score + if res_wo.score != 1.0: + assert res_wo.score < res_w.score + else: + assert res_wo.score <= res_w.score def test_context_custom_recognizer(nlp_engine, mock_nlp_artifacts):