From 471b7e5baa44fc8f96c50be8ce05772dfcd653c9 Mon Sep 17 00:00:00 2001 From: Marsel Mavletkulov Date: Mon, 12 Feb 2024 16:36:48 -0500 Subject: [PATCH] Add additional email normalization --- HISTORY.rst | 25 +++++ minfraud/request.py | 248 +++++++++++++++++++++++++++++++++++++++--- tests/test_request.py | 41 ++++++- 3 files changed, 297 insertions(+), 17 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 1ca5631..466469d 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -9,6 +9,31 @@ History * Added the following new values to the ``/payment/processor`` validation: * ``pxp_financial`` * ``trustpay`` +* Equivalent domain names are now normalized when ``hash_address`` is used. + For example, ``googlemail.com`` will become ``gmail.com``. +* Periods are now removed from ``gmail.com`` email address local parts when + ``hash_address`` is used. For example, ``f.o.o@gmail.com`` will become + ``foo@gmail.com``. +* Fastmail alias subdomain email addresses are now normalized when + ``hash_address`` is used. For example, ``alias@user.fastmail.com`` will + become ``user@fastmail.com``. +* Additional ``yahoo.com`` email addresses now have aliases removed from + their local part when ``hash_address`` is used. For example, + ``foo-bar@yahoo.com`` will become ``foo@yahoo.com`` for additional + ``yahoo.com`` domains. +* Duplicate ``.com`` suffixes are now removed from email domain names when + ``hash_address`` is used. For example, ``example.com.com`` will become + ``example.com``. +* Extraneous characters after ``.com`` are now removed from email domain + names when ``hash_address`` is used. For example, ``example.comfoo`` will + become ``example.com``. +* Certain ``.com`` typos are now normalized to ``.com`` when ``hash_address`` is + used. For example, ``example.cam`` will become ``example.com``. +* Additional ``gmail.com`` domain names with leading digits are now + normalized when ``hash_address`` is used. For example, ``100gmail.com`` will + become ``gmail.com``. +* Additional ``gmail.com`` typos are now normalized when ``hash_address`` is + used. 
For example, ``gmali.com`` will become ``gmail.com``. 2.9.0 (2023-12-05) ++++++++++++++++++ diff --git a/minfraud/request.py b/minfraud/request.py index e0d12df..15cf2fd 100644 --- a/minfraud/request.py +++ b/minfraud/request.py @@ -5,6 +5,7 @@ """ +import re import warnings import hashlib from typing import Any, Dict @@ -15,17 +16,207 @@ _TYPO_DOMAINS = { # gmail.com - "35gmai.com": "gmail.com", - "636gmail.com": "gmail.com", + "gmai.com": "gmail.com", "gamil.com": "gmail.com", - "gmail.comu": "gmail.com", + "gmali.com": "gmail.com", "gmial.com": "gmail.com", "gmil.com": "gmail.com", + "gmaill.com": "gmail.com", + "gmailm.com": "gmail.com", + "gmailo.com": "gmail.com", + "gmailyhoo.com": "gmail.com", "yahoogmail.com": "gmail.com", # outlook.com "putlook.com": "outlook.com", } +_EQUIVALENT_DOMAINS = { + "googlemail.com" : "gmail.com", + "pm.me": "protonmail.com", + "proton.me": "protonmail.com", + "yandex.by": "yandex.ru", + "yandex.com": "yandex.ru", + "yandex.kz": "yandex.ru", + "yandex.ua": "yandex.ru", + "ya.ru": "yandex.ru", +} + +_FASTMAIL_DOMAINS = { + "123mail.org", + "150mail.com", + "150ml.com", + "16mail.com", + "2-mail.com", + "4email.net", + "50mail.com", + "airpost.net", + "allmail.net", + "bestmail.us", + "cluemail.com", + "elitemail.org", + "emailcorner.net", + "emailengine.net", + "emailengine.org", + "emailgroups.net", + "emailplus.org", + "emailuser.net", + "eml.cc", + "f-m.fm", + "fast-email.com", + "fast-mail.org", + "fastem.com", + "fastemail.us", + "fastemailer.com", + "fastest.cc", + "fastimap.com", + "fastmail.cn", + "fastmail.co.uk", + "fastmail.com", + "fastmail.com.au", + "fastmail.de", + "fastmail.es", + "fastmail.fm", + "fastmail.fr", + "fastmail.im", + "fastmail.in", + "fastmail.jp", + "fastmail.mx", + "fastmail.net", + "fastmail.nl", + "fastmail.org", + "fastmail.se", + "fastmail.to", + "fastmail.tw", + "fastmail.uk", + "fastmail.us", + "fastmailbox.net", + "fastmessaging.com", + "fea.st", + "fmail.co.uk", + "fmailbox.com", + 
"fmgirl.com", + "fmguy.com", + "ftml.net", + "h-mail.us", + "hailmail.net", + "imap-mail.com", + "imap.cc", + "imapmail.org", + "inoutbox.com", + "internet-e-mail.com", + "internet-mail.org", + "internetemails.net", + "internetmailing.net", + "jetemail.net", + "justemail.net", + "letterboxes.org", + "mail-central.com", + "mail-page.com", + "mailandftp.com", + "mailas.com", + "mailbolt.com", + "mailc.net", + "mailcan.com", + "mailforce.net", + "mailftp.com", + "mailhaven.com", + "mailingaddress.org", + "mailite.com", + "mailmight.com", + "mailnew.com", + "mailsent.net", + "mailservice.ms", + "mailup.net", + "mailworks.org", + "ml1.net", + "mm.st", + "myfastmail.com", + "mymacmail.com", + "nospammail.net", + "ownmail.net", + "petml.com", + "postinbox.com", + "postpro.net", + "proinbox.com", + "promessage.com", + "realemail.net", + "reallyfast.biz", + "reallyfast.info", + "rushpost.com", + "sent.as", + "sent.at", + "sent.com", + "speedpost.net", + "speedymail.org", + "ssl-mail.com", + "swift-mail.com", + "the-fastest.net", + "the-quickest.com", + "theinternetemail.com", + "veryfast.biz", + "veryspeedy.net", + "warpmail.net", + "xsmail.com", + "yepmail.net", + "your-mail.com", +} + +_YAHOO_DOMAINS = { + "y7mail.com", + "yahoo.at", + "yahoo.be", + "yahoo.bg", + "yahoo.ca", + "yahoo.cl", + "yahoo.co.id", + "yahoo.co.il", + "yahoo.co.in", + "yahoo.co.kr", + "yahoo.co.nz", + "yahoo.co.th", + "yahoo.co.uk", + "yahoo.co.za", + "yahoo.com", + "yahoo.com.ar", + "yahoo.com.au", + "yahoo.com.br", + "yahoo.com.co", + "yahoo.com.hk", + "yahoo.com.hr", + "yahoo.com.mx", + "yahoo.com.my", + "yahoo.com.pe", + "yahoo.com.ph", + "yahoo.com.sg", + "yahoo.com.tr", + "yahoo.com.tw", + "yahoo.com.ua", + "yahoo.com.ve", + "yahoo.com.vn", + "yahoo.cz", + "yahoo.de", + "yahoo.dk", + "yahoo.ee", + "yahoo.es", + "yahoo.fi", + "yahoo.fr", + "yahoo.gr", + "yahoo.hu", + "yahoo.ie", + "yahoo.in", + "yahoo.it", + "yahoo.lt", + "yahoo.lv", + "yahoo.nl", + "yahoo.no", + "yahoo.pl", + "yahoo.pt", + 
"yahoo.ro", + "yahoo.se", + "yahoo.sk", + "ymail.com", +} + def prepare_report(request: Dict[str, Any], validate: bool): """Validate and prepare minFraud report""" @@ -91,29 +282,43 @@ def maybe_hash_email(transaction): if address is None: return - address = address.lower().strip() - - at_idx = address.rfind("@") - if at_idx == -1: + address = _clean_email(address) + if not address: return - domain = _clean_domain(address[at_idx + 1 :]) # noqa - local_part = address[:at_idx] - + domain = address.rsplit("@", 1)[1] if domain != "" and "domain" not in email: email["domain"] = domain - email["address"] = _hash_email(local_part, domain) + email["address"] = hashlib.md5(address.encode("UTF-8")).hexdigest() def _clean_domain(domain): domain = domain.strip().rstrip(".").encode("idna").decode("ASCII") - return _TYPO_DOMAINS.get(domain, domain) + domain = re.sub(r"(?:\.com){2,}$", ".com", domain) + domain = re.sub(r"\.com[^.]+$", ".com", domain) + domain = re.sub(r"(?:\.(?:com|c[a-z]{1,2}m|co[ln]|[dsvx]o[mn]|))$", ".com", domain) + domain = re.sub(r"^\d+(?:gmail?\.com)$", "gmail.com", domain) + + domain = _TYPO_DOMAINS.get(domain, domain) + domain = _EQUIVALENT_DOMAINS.get(domain, domain) -def _hash_email(local_part, domain): - # Strip off aliased part of email address - if domain == "yahoo.com": + return domain + + +def _clean_email(address): + address = address.lower().strip() + + at_idx = address.rfind("@") + if at_idx == -1: + return + + domain = _clean_domain(address[at_idx + 1 :]) # noqa + local_part = address[:at_idx] + + # Strip off aliased part of email address. 
+ if domain in _YAHOO_DOMAINS: divider = "-" else: divider = "+" @@ -122,4 +327,15 @@ def _hash_email(local_part, domain): if alias_idx > 0: local_part = local_part[:alias_idx] - return hashlib.md5(f"{local_part}@{domain}".encode("UTF-8")).hexdigest() + if domain == "gmail.com": + local_part = local_part.replace(".", "") + + domain_parts = domain.split(".") + if len(domain_parts) > 2: + possible_domain = ".".join(domain_parts[1:]) + if possible_domain in _FASTMAIL_DOMAINS: + domain = possible_domain + if local_part != "": + local_part = domain_parts[0] + + return f"{local_part}@{domain}" diff --git a/tests/test_request.py b/tests/test_request.py index b55ee69..c02639c 100644 --- a/tests/test_request.py +++ b/tests/test_request.py @@ -1,6 +1,10 @@ import unittest -from minfraud.request import maybe_hash_email, clean_credit_card +from minfraud.request import ( + maybe_hash_email, + clean_credit_card, + _clean_email, +) class TestRequest(unittest.TestCase): @@ -191,3 +195,38 @@ def test_clean_credit_card(self): clean_credit_card(transaction) self.assertEqual(test["expected"], transaction) + + +def test_clean_email(): + tests = [ + {"input": "", "output": None}, + {"input": "fasfs", "output": None}, + {"input": "test@gmail", "output": "test@gmail"}, + {"input": "e4d909c290d0fb1ca068ffaddf22cbd0", "output": None}, + {"input": "Test@maxmind", "output": "test@maxmind"}, + {"input": "Test@maxmind.com", "output": "test@maxmind.com"}, + {"input": "Test+007@maxmind.com", "output": "test@maxmind.com"}, + {"input": "Test+007+008@maxmind.com", "output": "test@maxmind.com"}, + {"input": "Test+@maxmind.com", "output": "test@maxmind.com"}, + {"input": "+@maxmind.com", "output": "+@maxmind.com"}, + {"input": " Test@maxmind.com", "output": "test@maxmind.com"}, + {"input": "Test@maxmind.com|abc124472372", "output": "test@maxmind.com"}, + {"input": "Test+foo@yahoo.com", "output": "test+foo@yahoo.com"}, + {"input": "Test-foo@yahoo.com", "output": "test@yahoo.com"}, + {"input": 
"Test-foo-foo2@yahoo.com", "output": "test@yahoo.com"}, + {"input": "Test-foo@gmail.com", "output": "test-foo@gmail.com"}, + {"input": "gamil.com@gamil.com", "output": "gamilcom@gmail.com"}, + {"input": "Test+alias@bücher.com", "output": "test@xn--bcher-kva.com"}, + {"input": "foo@googlemail.com", "output": "foo@gmail.com"}, + {"input": "foo.bar@gmail.com", "output": "foobar@gmail.com"}, + {"input": "alias@user.fastmail.com", "output": "user@fastmail.com"}, + {"input": "foo-bar@ymail.com", "output": "foo@ymail.com"}, + {"input": "foo@example.com.com", "output": "foo@example.com"}, + {"input": "foo@example.comfoo", "output": "foo@example.com"}, + {"input": "foo@example.cam", "output": "foo@example.com"}, + {"input": "foo@10000gmail.com", "output": "foo@gmail.com"}, + ] + + for test in tests: + got = _clean_email(test["input"]) + assert test["output"] == got