From 78484a0a6559ee5343efdd2cc48334186a257234 Mon Sep 17 00:00:00 2001 From: Barthelemy Dagenais Date: Sun, 22 Jan 2017 20:03:50 -0500 Subject: [PATCH 1/4] refs #16 - Added --ignore-bad-tel-urls. --- README.rst | 3 +++ pylinkvalidator/crawler.py | 7 +++++-- pylinkvalidator/models.py | 14 +++++++++++--- pylinkvalidator/testfiles/badtel.html | 8 ++++++++ pylinkvalidator/tests.py | 13 +++++++++++++ pylinkvalidator/urlutil.py | 19 +++++++++++++++++++ 6 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 pylinkvalidator/testfiles/badtel.html diff --git a/README.rst b/README.rst index 309fdbd..a4635c9 100644 --- a/README.rst +++ b/README.rst @@ -105,6 +105,9 @@ usage examples. -i IGNORED_PREFIXES, --ignore=IGNORED_PREFIXES Comma-separated list of host/path prefixes to ignore (e.g., www.example.com/ignore_this_and_after/) + -b, --ignore-bad-tel-urls + ignore badly formed tel URLs missing the leading + + sign, e.g., tel:1234567890 -u USERNAME, --username=USERNAME Username to use with basic HTTP authentication -p PASSWORD, --password=PASSWORD diff --git a/pylinkvalidator/crawler.py b/pylinkvalidator/crawler.py index ddd776f..4818814 100644 --- a/pylinkvalidator/crawler.py +++ b/pylinkvalidator/crawler.py @@ -25,7 +25,7 @@ from pylinkvalidator.reporter import report from pylinkvalidator.urlutil import ( get_clean_url_split, get_absolute_url_split, - is_link, SUPPORTED_SCHEMES, is_similar_url_split) + is_link, is_similar_url_split, is_supported_scheme) WORK_DONE = '__WORK_DONE__' @@ -533,7 +533,10 @@ def _get_links(self, elements, attribute, base_url_split, continue abs_url_split = get_absolute_url_split(url, base_url_split) - if abs_url_split.scheme not in SUPPORTED_SCHEMES: + if not is_supported_scheme( + abs_url_split, self.worker_config.ignore_bad_tel_urls): + print("CONTINUE") + print(abs_url_split) continue link = Link( diff --git a/pylinkvalidator/models.py b/pylinkvalidator/models.py index bb54b50..84bfd5e 100644 --- a/pylinkvalidator/models.py +++ b/pylinkvalidator/models.py @@ -14,7 +14,9 @@ from pylinkvalidator.included.bs4 import BeautifulSoup from pylinkvalidator.compat import get_safe_str -from pylinkvalidator.urlutil import get_clean_url_split, get_absolute_url_split +from pylinkvalidator.urlutil import ( + get_clean_url_split, get_absolute_url_split) + PREFIX_ALL = "*" @@ -112,7 +114,7 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]): WorkerConfig = namedtuple_with_defaults( "WorkerConfig", ["username", "password", "types", "timeout", "parser", "strict_mode", - "prefer_server_encoding", "extra_headers"]) + "prefer_server_encoding", "extra_headers", "ignore_bad_tel_urls"]) WorkerInput = namedtuple_with_defaults( @@ -307,7 +309,8 @@ def _build_worker_config(self, options): return WorkerConfig( options.username, options.password, types, options.timeout, options.parser, options.strict_mode, - options.prefer_server_encoding, headers) + options.prefer_server_encoding, headers, + options.ignore_bad_tel_urls) def _build_accepted_hosts(self, options, start_urls): if options.multi: @@ -462,6 +465,11 @@ def _build_parser(self): action="store", default=None, help="comma-separated list of host/path prefixes to ignore " "(e.g., www.example.com/ignore_this_and_after/)") + crawler_group.add_option( + "-b", "--ignore-bad-tel-urls", dest="ignore_bad_tel_urls", + action="store_true", default=False, + help="ignore badly formed tel URLs missing the leading + sign, " + "e.g., tel:1234567890") crawler_group.add_option( "-u", "--username", dest="username", action="store", default=None, diff --git a/pylinkvalidator/testfiles/badtel.html b/pylinkvalidator/testfiles/badtel.html new file mode 100644 index 0000000..a7ab22e --- /dev/null +++ b/pylinkvalidator/testfiles/badtel.html @@ -0,0 +1,8 @@ + + +

Go to next page +

Go to good tel link +

Go to bad tel link +

+ + diff --git a/pylinkvalidator/tests.py b/pylinkvalidator/tests.py index 8630a11..12341c6 100644 --- a/pylinkvalidator/tests.py +++ b/pylinkvalidator/tests.py @@ -478,3 +478,16 @@ def test_unicode(self): # 3 pages linked on the root (root, 0, 0b) self.assertEqual(2, len(site.pages)) self.assertEqual(0, len(site.error_pages)) + + def test_bad_tel_link(self): + site = self._run_crawler_plain( + ThreadSiteCrawler, ["--ignore-bad-tel-urls"], "/badtel.html") + # root + one page linked. bad tel link and tel link are ignored. + self.assertEqual(2, len(site.pages)) + self.assertEqual(0, len(site.error_pages)) + + site = self._run_crawler_plain( + ThreadSiteCrawler, [], "/badtel.html") + # root + one page + one bad tel link. One correct tel link ignored + self.assertEqual(3, len(site.pages)) + self.assertEqual(1, len(site.error_pages)) diff --git a/pylinkvalidator/urlutil.py b/pylinkvalidator/urlutil.py index b85b9c5..1ed1e0a 100644 --- a/pylinkvalidator/urlutil.py +++ b/pylinkvalidator/urlutil.py @@ -107,3 +107,22 @@ def is_similar_url_split(url_split_1, url_split_2): else: return url_split_1.path == url_split_2.path and\ url_split_1.netloc == url_split_2.netloc + + +def is_bad_tel_url_split(url_split): + """Returns True if the URL is using a badly formed tel scheme + that is not detected by Python urlparse. + """ + return url_split.netloc.startswith("tel:") or\ + url_split.path.startswith("/tel:") + + +def is_supported_scheme(url_split, ignore_bad_tel_urls=False): + """Returns True if the URL has a supported scheme and can be crawled. + """ + if url_split.scheme not in SUPPORTED_SCHEMES: + return False + elif ignore_bad_tel_urls and is_bad_tel_url_split(url_split): + # issue #16 + return False + return True From 7836af0daaccf41f681f1824b33d35b4b931b248 Mon Sep 17 00:00:00 2001 From: Barthelemy Dagenais Date: Sun, 22 Jan 2017 20:12:02 -0500 Subject: [PATCH 2/4] refs #16 - fix test to be compatible with Python 2.6 --- README.rst | 3 ++- pylinkvalidator/crawler.py | 2 -- pylinkvalidator/models.py | 2 +- pylinkvalidator/tests.py | 11 ++++++----- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/README.rst b/README.rst index a4635c9..c7a9730 100644 --- a/README.rst +++ b/README.rst @@ -107,7 +107,8 @@ usage examples. (e.g., www.example.com/ignore_this_and_after/) -b, --ignore-bad-tel-urls ignore badly formed tel URLs missing the leading + - sign, e.g., tel:1234567890 + sign, e.g., tel:1234567890 - only necessary for Python + > 2.6 -u USERNAME, --username=USERNAME Username to use with basic HTTP authentication -p PASSWORD, --password=PASSWORD diff --git a/pylinkvalidator/crawler.py b/pylinkvalidator/crawler.py index 4818814..73a0666 100644 --- a/pylinkvalidator/crawler.py +++ b/pylinkvalidator/crawler.py @@ -535,8 +535,6 @@ def _get_links(self, elements, attribute, base_url_split, if not is_supported_scheme( abs_url_split, self.worker_config.ignore_bad_tel_urls): - print("CONTINUE") - print(abs_url_split) continue link = Link( diff --git a/pylinkvalidator/models.py b/pylinkvalidator/models.py index 84bfd5e..228ef11 100644 --- a/pylinkvalidator/models.py +++ b/pylinkvalidator/models.py @@ -469,7 +469,7 @@ def _build_parser(self): "-b", "--ignore-bad-tel-urls", dest="ignore_bad_tel_urls", action="store_true", default=False, help="ignore badly formed tel URLs missing the leading + sign, " - "e.g., tel:1234567890") + "e.g., tel:1234567890 - only necessary for Python > 2.6") crawler_group.add_option( "-u", "--username", dest="username", action="store", default=None, diff --git a/pylinkvalidator/tests.py b/pylinkvalidator/tests.py index 12341c6..bab176b 100644 --- a/pylinkvalidator/tests.py +++ b/pylinkvalidator/tests.py @@ -486,8 +486,9 @@ def test_bad_tel_link(self): self.assertEqual(2, len(site.pages)) self.assertEqual(0, len(site.error_pages)) - site = self._run_crawler_plain( - ThreadSiteCrawler, [], "/badtel.html") - # root + one page + one bad tel link. One correct tel link ignored - self.assertEqual(3, len(site.pages)) - self.assertEqual(1, len(site.error_pages)) + if sys.version_info[:2] > (2, 6): + site = self._run_crawler_plain( + ThreadSiteCrawler, [], "/badtel.html") + # root + one page + one bad tel link. One correct tel link ignored + self.assertEqual(3, len(site.pages)) + self.assertEqual(1, len(site.error_pages)) From 8fc33d503c470c7e79c6e260f86f6fa2ec0ffbb1 Mon Sep 17 00:00:00 2001 From: Muhammad Alif Akbar Date: Wed, 21 Mar 2018 18:30:45 +0700 Subject: [PATCH 3/4] handle space in path or url --- pylinkvalidator/tests.py | 3 +++ pylinkvalidator/urlutil.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pylinkvalidator/tests.py b/pylinkvalidator/tests.py index bab176b..9004865 100644 --- a/pylinkvalidator/tests.py +++ b/pylinkvalidator/tests.py @@ -128,6 +128,9 @@ def test_clean_url_split(self): self.assertEqual( "http://www.example.com/", get_clean_url_split("http://www.example.com/").geturl()) + self.assertEqual( + "http://www.example.com/media%20gallery", + get_clean_url_split("http://www.example.com/media gallery").geturl()) def test_get_absolute_url(self): base_url_split = get_clean_url_split( diff --git a/pylinkvalidator/urlutil.py b/pylinkvalidator/urlutil.py index 1ed1e0a..8590c0e 100644 --- a/pylinkvalidator/urlutil.py +++ b/pylinkvalidator/urlutil.py @@ -80,7 +80,7 @@ def url_encode_non_ascii(url_part): return re.sub( b'[\x80-\xFF]', lambda match: quote(match.group(0)).encode("utf-8"), - url_part.encode("utf-8")).decode("ascii") + url_part.encode("utf-8")).decode("ascii").replace(' ', '%20') # handle space char in query def get_absolute_url_split(url, base_url_split): From 3a6943d563390f65c02ad8271eac863a8d82bf4e Mon Sep 17 00:00:00 2001 From: Muhammad Alif Akbar Date: Thu, 22 Mar 2018 11:26:43 +0700 Subject: [PATCH 4/4] remove --use-mirrors --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 56a1922..0a87815 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,6 @@ python: - "2.7" - "3.4" install: - - "pip install . --use-mirrors" + - "pip install ." script: nosetests sudo: false