Skip to content

Commit

Permalink
Add additional email normalization
Browse files Browse the repository at this point in the history
  • Loading branch information
marselester committed Feb 12, 2024
1 parent 59d33c6 commit 471b7e5
Show file tree
Hide file tree
Showing 3 changed files with 297 additions and 17 deletions.
25 changes: 25 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,31 @@ History
* Added the following new values to the ``/payment/processor`` validation:
* ``pxp_financial``
* ``trustpay``
* Equivalent domain names are now normalized when `hash_address` is used.
For example, `googlemail.com` will become `gmail.com`.
* Periods are now removed from `gmail.com` email address local parts when
`hash_address` is used. For example, `f.o.o@gmail.com` will become
`foo@gmail.com`.
* Fastmail alias subdomain email addresses are now normalized when
`hash_address` is used. For example, `alias@user.fastmail.com` will
become `user@fastmail.com`.
* Email addresses at additional Yahoo domains (for example, `ymail.com` and
  `yahoo.co.uk`) now have aliases removed from their local part when
  `hash_address` is used, as was already done for `yahoo.com`. For example,
  `foo-bar@ymail.com` will become `foo@ymail.com`.
* Duplicate `.com`s are now removed from email domain names when
`hash_address` is used. For example, `example.com.com` will become
`example.com`.
* Extraneous characters after `.com` are now removed from email domain
names when `hash_address` is used. For example, `example.comfoo` will
become `example.com`.
* Certain `.com` typos are now normalized to `.com` when `hash_address` is
used. For example, `example.cam` will become `example.com`.
* Additional `gmail.com` domain names with leading digits are now
normalized when `hash_address` is used. For example, `100gmail.com` will
become `gmail.com`.
* Additional `gmail.com` typos are now normalized when `hash_address` is
used. For example, `gmali.com` will become `gmail.com`.

2.9.0 (2023-12-05)
++++++++++++++++++
Expand Down
248 changes: 232 additions & 16 deletions minfraud/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""

import re
import warnings
import hashlib
from typing import Any, Dict
Expand All @@ -15,17 +16,207 @@

_TYPO_DOMAINS = {
# gmail.com
"35gmai.com": "gmail.com",
"636gmail.com": "gmail.com",
"gmai.com": "gmail.com",
"gamil.com": "gmail.com",
"gmail.comu": "gmail.com",
"gmali.com": "gmail.com",
"gmial.com": "gmail.com",
"gmil.com": "gmail.com",
"gmaill.com": "gmail.com",
"gmailm.com": "gmail.com",
"gmailo.com": "gmail.com",
"gmailyhoo.com": "gmail.com",
"yahoogmail.com": "gmail.com",
# outlook.com
"putlook.com": "outlook.com",
}

_EQUIVALENT_DOMAINS = {
"googlemail.com" : "gmail.com",
"pm.me": "protonmail.com",
"proton.me": "protonmail.com",
"yandex.by": "yandex.ru",
"yandex.com": "yandex.ru",
"yandex.kz": "yandex.ru",
"yandex.ua": "yandex.ru",
"ya.ru": "yandex.ru",
}

# Domains for which a leading subdomain label is treated as the account name.
# When an address's domain, minus its first label, is in this set,
# ``_clean_email`` replaces the domain with the shorter name and (for a
# non-empty local part) replaces the local part with that first label, e.g.
# ``alias@user.fastmail.com`` -> ``user@fastmail.com``.
_FASTMAIL_DOMAINS = {
    "123mail.org",
    "150mail.com",
    "150ml.com",
    "16mail.com",
    "2-mail.com",
    "4email.net",
    "50mail.com",
    "airpost.net",
    "allmail.net",
    "bestmail.us",
    "cluemail.com",
    "elitemail.org",
    "emailcorner.net",
    "emailengine.net",
    "emailengine.org",
    "emailgroups.net",
    "emailplus.org",
    "emailuser.net",
    "eml.cc",
    "f-m.fm",
    "fast-email.com",
    "fast-mail.org",
    "fastem.com",
    "fastemail.us",
    "fastemailer.com",
    "fastest.cc",
    "fastimap.com",
    "fastmail.cn",
    "fastmail.co.uk",
    "fastmail.com",
    "fastmail.com.au",
    "fastmail.de",
    "fastmail.es",
    "fastmail.fm",
    "fastmail.fr",
    "fastmail.im",
    "fastmail.in",
    "fastmail.jp",
    "fastmail.mx",
    "fastmail.net",
    "fastmail.nl",
    "fastmail.org",
    "fastmail.se",
    "fastmail.to",
    "fastmail.tw",
    "fastmail.uk",
    "fastmail.us",
    "fastmailbox.net",
    "fastmessaging.com",
    "fea.st",
    "fmail.co.uk",
    "fmailbox.com",
    "fmgirl.com",
    "fmguy.com",
    "ftml.net",
    "h-mail.us",
    "hailmail.net",
    "imap-mail.com",
    "imap.cc",
    "imapmail.org",
    "inoutbox.com",
    "internet-e-mail.com",
    "internet-mail.org",
    "internetemails.net",
    "internetmailing.net",
    "jetemail.net",
    "justemail.net",
    "letterboxes.org",
    "mail-central.com",
    "mail-page.com",
    "mailandftp.com",
    "mailas.com",
    "mailbolt.com",
    "mailc.net",
    "mailcan.com",
    "mailforce.net",
    "mailftp.com",
    "mailhaven.com",
    "mailingaddress.org",
    "mailite.com",
    "mailmight.com",
    "mailnew.com",
    "mailsent.net",
    "mailservice.ms",
    "mailup.net",
    "mailworks.org",
    "ml1.net",
    "mm.st",
    "myfastmail.com",
    "mymacmail.com",
    "nospammail.net",
    "ownmail.net",
    "petml.com",
    "postinbox.com",
    "postpro.net",
    "proinbox.com",
    "promessage.com",
    "realemail.net",
    "reallyfast.biz",
    "reallyfast.info",
    "rushpost.com",
    "sent.as",
    "sent.at",
    "sent.com",
    "speedpost.net",
    "speedymail.org",
    "ssl-mail.com",
    "swift-mail.com",
    "the-fastest.net",
    "the-quickest.com",
    "theinternetemail.com",
    "veryfast.biz",
    "veryspeedy.net",
    "warpmail.net",
    "xsmail.com",
    "yepmail.net",
    "your-mail.com",
}

# Domains whose addresses use "-" rather than "+" as the alias divider.
# ``_clean_email`` drops everything from the first "-" in the local part for
# these domains, e.g. ``foo-bar@ymail.com`` -> ``foo@ymail.com``.
_YAHOO_DOMAINS = {
    "y7mail.com",
    "yahoo.at",
    "yahoo.be",
    "yahoo.bg",
    "yahoo.ca",
    "yahoo.cl",
    "yahoo.co.id",
    "yahoo.co.il",
    "yahoo.co.in",
    "yahoo.co.kr",
    "yahoo.co.nz",
    "yahoo.co.th",
    "yahoo.co.uk",
    "yahoo.co.za",
    "yahoo.com",
    "yahoo.com.ar",
    "yahoo.com.au",
    "yahoo.com.br",
    "yahoo.com.co",
    "yahoo.com.hk",
    "yahoo.com.hr",
    "yahoo.com.mx",
    "yahoo.com.my",
    "yahoo.com.pe",
    "yahoo.com.ph",
    "yahoo.com.sg",
    "yahoo.com.tr",
    "yahoo.com.tw",
    "yahoo.com.ua",
    "yahoo.com.ve",
    "yahoo.com.vn",
    "yahoo.cz",
    "yahoo.de",
    "yahoo.dk",
    "yahoo.ee",
    "yahoo.es",
    "yahoo.fi",
    "yahoo.fr",
    "yahoo.gr",
    "yahoo.hu",
    "yahoo.ie",
    "yahoo.in",
    "yahoo.it",
    "yahoo.lt",
    "yahoo.lv",
    "yahoo.nl",
    "yahoo.no",
    "yahoo.pl",
    "yahoo.pt",
    "yahoo.ro",
    "yahoo.se",
    "yahoo.sk",
    "ymail.com",
}


def prepare_report(request: Dict[str, Any], validate: bool):
"""Validate and prepare minFraud report"""
Expand Down Expand Up @@ -91,29 +282,43 @@ def maybe_hash_email(transaction):
if address is None:
return

address = address.lower().strip()

at_idx = address.rfind("@")
if at_idx == -1:
address = _clean_email(address)
if not address:
return

domain = _clean_domain(address[at_idx + 1 :]) # noqa
local_part = address[:at_idx]

domain = address.split("@")[1]
if domain != "" and "domain" not in email:
email["domain"] = domain

email["address"] = _hash_email(local_part, domain)
email["address"] = hashlib.md5(address.encode("UTF-8")).hexdigest()


def _clean_domain(domain):
    """Normalize the domain part of an email address.

    Surrounding whitespace and trailing periods are removed and the domain is
    converted to its ASCII (IDNA) form. Common ``.com`` mistakes are then
    repaired, and known typo and equivalent domains are replaced through the
    ``_TYPO_DOMAINS`` and ``_EQUIVALENT_DOMAINS`` tables.

    :param domain: the text following the final ``@`` of an email address.
    :returns: the normalized domain.
    :raises UnicodeError: if the domain cannot be encoded with the ``idna``
        codec (for example, when it contains an empty label).
    """
    domain = domain.strip().rstrip(".").encode("idna").decode("ASCII")

    # Collapse a repeated ".com" suffix: example.com.com -> example.com
    domain = re.sub(r"(?:\.com){2,}$", ".com", domain)
    # Drop characters glued onto a final ".com": example.comfoo -> example.com
    domain = re.sub(r"\.com[^.]+$", ".com", domain)
    # Rewrite final labels that look like a mistyped "com":
    # example.cam -> example.com
    domain = re.sub(r"(?:\.(?:com|c[a-z]{1,2}m|co[ln]|[dsvx]o[mn]|))$", ".com", domain)
    # Strip leading digits from gmai.com / gmail.com: 100gmail.com -> gmail.com
    domain = re.sub(r"^\d+(?:gmail?\.com)$", "gmail.com", domain)

    # The typo table is consulted before the equivalence table.
    domain = _TYPO_DOMAINS.get(domain, domain)
    domain = _EQUIVALENT_DOMAINS.get(domain, domain)

    return domain


def _clean_email(address):
address = address.lower().strip()

at_idx = address.rfind("@")
if at_idx == -1:
return

domain = _clean_domain(address[at_idx + 1 :]) # noqa
local_part = address[:at_idx]

# Strip off aliased part of email address.
if domain in _YAHOO_DOMAINS:
divider = "-"
else:
divider = "+"
Expand All @@ -122,4 +327,15 @@ def _hash_email(local_part, domain):
if alias_idx > 0:
local_part = local_part[:alias_idx]

return hashlib.md5(f"{local_part}@{domain}".encode("UTF-8")).hexdigest()
if domain == "gmail.com":
local_part = local_part.replace(".", "")

domain_parts = domain.split(".")
if len(domain_parts) > 2:
possible_domain = ".".join(domain_parts[1:])
if possible_domain in _FASTMAIL_DOMAINS:
domain = possible_domain
if local_part != "":
local_part = domain_parts[0]

return f"{local_part}@{domain}"
41 changes: 40 additions & 1 deletion tests/test_request.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import unittest

from minfraud.request import maybe_hash_email, clean_credit_card
from minfraud.request import (
maybe_hash_email,
clean_credit_card,
_clean_email,
)


class TestRequest(unittest.TestCase):
Expand Down Expand Up @@ -191,3 +195,38 @@ def test_clean_credit_card(self):
clean_credit_card(transaction)

self.assertEqual(test["expected"], transaction)


def test_clean_email():
    """Check ``_clean_email`` against a table of raw inputs and expected results."""
    tests = [
        {"input": "", "output": None},
        {"input": "fasfs", "output": None},
        {"input": "test@gmail", "output": "test@gmail"},
        {"input": "e4d909c290d0fb1ca068ffaddf22cbd0", "output": None},
        {"input": "Test@maxmind", "output": "test@maxmind"},
        {"input": "Test@maxmind.com", "output": "test@maxmind.com"},
        {"input": "Test+007@maxmind.com", "output": "test@maxmind.com"},
        {"input": "Test+007+008@maxmind.com", "output": "test@maxmind.com"},
        {"input": "Test+@maxmind.com", "output": "test@maxmind.com"},
        {"input": "+@maxmind.com", "output": "+@maxmind.com"},
        {"input": " Test@maxmind.com", "output": "test@maxmind.com"},
        {"input": "Test@maxmind.com|abc124472372", "output": "test@maxmind.com"},
        {"input": "Test+foo@yahoo.com", "output": "test+foo@yahoo.com"},
        {"input": "Test-foo@yahoo.com", "output": "test@yahoo.com"},
        {"input": "Test-foo-foo2@yahoo.com", "output": "test@yahoo.com"},
        {"input": "Test-foo@gmail.com", "output": "test-foo@gmail.com"},
        {"input": "gamil.com@gamil.com", "output": "gamilcom@gmail.com"},
        {"input": "Test+alias@bücher.com", "output": "test@xn--bcher-kva.com"},
        {"input": "foo@googlemail.com", "output": "foo@gmail.com"},
        {"input": "foo.bar@gmail.com", "output": "foobar@gmail.com"},
        {"input": "alias@user.fastmail.com", "output": "user@fastmail.com"},
        {"input": "foo-bar@ymail.com", "output": "foo@ymail.com"},
        {"input": "foo@example.com.com", "output": "foo@example.com"},
        {"input": "foo@example.comfoo", "output": "foo@example.com"},
        {"input": "foo@example.cam", "output": "foo@example.com"},
        {"input": "foo@10000gmail.com", "output": "foo@gmail.com"},
    ]

    for test in tests:
        got = _clean_email(test["input"])
        # Name the failing input; a bare assert in a table-driven loop does
        # not say which row broke.
        assert test["output"] == got, (
            f"_clean_email({test['input']!r}) returned {got!r}, "
            f"expected {test['output']!r}"
        )

0 comments on commit 471b7e5

Please sign in to comment.