Skip to content

Commit

Permalink
fix typo in amazon domains
Browse files Browse the repository at this point in the history
  • Loading branch information
BurnzZ committed Jul 30, 2024
1 parent 51b1104 commit f18c190
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 4 deletions.
8 changes: 4 additions & 4 deletions duplicate_url_discarder_rules/queryRemovalExcept/main.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
"amazon.com/dp",
"amazon.de/*/dp",
"amazon.de/dp",
"amazon.e/*/dp",
"amazon.e/dp",
"amazon.es/*/dp",
"amazon.es/dp",
"amazon.fr/*/dp",
"amazon.fr/dp",
"amazon.in/*/dp",
Expand All @@ -39,8 +39,8 @@
"amazon.it/dp",
"amazon.nl/*/dp",
"amazon.nl/dp",
"amazon.p/*/dp",
"amazon.p/dp",
"amazon.pt/*/dp",
"amazon.pt/dp",
"amazon.sa/*/dp",
"amazon.sa/dp",
"amazon.se/*/dp",
Expand Down
2 changes: 2 additions & 0 deletions duplicate_url_discarder_rules/subpathRemoval/main.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@
"amazon.com.tr/*/dp",
"amazon.com/*/dp",
"amazon.de/*/dp",
"amazon.es/*/dp",
"amazon.fr/*/dp",
"amazon.in/*/dp",
"amazon.it/*/dp",
"amazon.nl/*/dp",
"amazon.pt/*/dp",
"amazon.sa/*/dp",
"amazon.se/*/dp"
]
Expand Down
55 changes: 55 additions & 0 deletions tests/queryRemovalExcept/test_query_removal_except_main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from duplicate_url_discarder.url_canonicalizer import UrlCanonicalizer
from duplicate_url_discarder.processors import QueryRemovalExceptProcessor

from duplicate_url_discarder_rules import RULE_PATHS


def test_query_removal_except_main_rules():
    """Exercise the ``queryRemovalExcept/main.json`` rules on Amazon product URLs.

    Loads only that rule file into a ``UrlCanonicalizer``, checks that it
    yields a single ``QueryRemovalExceptProcessor``, and then verifies, for
    every listed Amazon domain suffix, that ``/dp/`` URLs (with and without a
    leading text segment) lose their query string while the path and the
    fragment are kept.
    """
    # Select the one rule file under test from all the packaged rule paths.
    matching_paths = []
    for path in RULE_PATHS:
        if path.endswith("queryRemovalExcept/main.json"):
            matching_paths.append(path)
    assert len(matching_paths) == 1

    canonicalizer = UrlCanonicalizer(matching_paths)

    # The rule file is expected to produce exactly one processor of this type.
    assert len(canonicalizer.processors) == 1
    only_processor = list(canonicalizer.processors.values())[0]
    assert isinstance(only_processor, QueryRemovalExceptProcessor)

    amazon_suffixes = [
        "ae",
        "ca",
        "cn",
        "co.jp",
        "co.uk",
        "com.au",
        "com.br",
        "com.mx",
        "com.sg",
        "com.tr",
        "com",
        "de",
        "es",
        "fr",
        "in",
        "it",
        "nl",
        "pt",
        "sa",
        "se",
    ]

    for domain in amazon_suffixes:
        # Product URL with a text segment in front of /dp/.
        with_text_segment = canonicalizer.process_url(
            f"https://www.amazon.{domain}/some-text/dp/ASIN?p=1&q=2#frag"
        )
        assert (
            with_text_segment
            == f"https://www.amazon.{domain}/some-text/dp/ASIN#frag"
        )

        # Product URL where /dp/ comes right after the host.
        without_text_segment = canonicalizer.process_url(
            f"https://www.amazon.{domain}/dp/ASIN?p=1&q=2#frag"
        )
        assert without_text_segment == f"https://www.amazon.{domain}/dp/ASIN#frag"
2 changes: 2 additions & 0 deletions tests/subpathRemoval/test_subpath_removal_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@ def test_subpath_removal_main_rules():
"com.tr",
"com",
"de",
"es",
"fr",
"in",
"it",
"nl",
"pt",
"sa",
"se",
]
Expand Down

0 comments on commit f18c190

Please sign in to comment.