From 78484a0a6559ee5343efdd2cc48334186a257234 Mon Sep 17 00:00:00 2001
From: Barthelemy Dagenais <barthelemy@infobart.com>
Date: Sun, 22 Jan 2017 20:03:50 -0500
Subject: [PATCH 1/4] refs #16 - Added --ignore-bad-tel-urls.

---
 README.rst                            |  3 +++
 pylinkvalidator/crawler.py            |  7 +++++--
 pylinkvalidator/models.py             | 14 +++++++++++---
 pylinkvalidator/testfiles/badtel.html |  8 ++++++++
 pylinkvalidator/tests.py              | 13 +++++++++++++
 pylinkvalidator/urlutil.py            | 19 +++++++++++++++++++
 6 files changed, 59 insertions(+), 5 deletions(-)
 create mode 100644 pylinkvalidator/testfiles/badtel.html

diff --git a/README.rst b/README.rst
index 309fdbd..a4635c9 100644
--- a/README.rst
+++ b/README.rst
@@ -105,6 +105,9 @@ usage examples.
       -i IGNORED_PREFIXES, --ignore=IGNORED_PREFIXES
                           Comma-separated list of host/path prefixes to ignore
                           (e.g., www.example.com/ignore_this_and_after/)
+      -b, --ignore-bad-tel-urls
+                          ignore badly formed tel URLs missing the leading +
+                          sign, e.g., tel:1234567890
       -u USERNAME, --username=USERNAME
                           Username to use with basic HTTP authentication
       -p PASSWORD, --password=PASSWORD
diff --git a/pylinkvalidator/crawler.py b/pylinkvalidator/crawler.py
index ddd776f..4818814 100644
--- a/pylinkvalidator/crawler.py
+++ b/pylinkvalidator/crawler.py
@@ -25,7 +25,7 @@
 from pylinkvalidator.reporter import report
 from pylinkvalidator.urlutil import (
     get_clean_url_split, get_absolute_url_split,
-    is_link, SUPPORTED_SCHEMES, is_similar_url_split)
+    is_link, is_similar_url_split, is_supported_scheme)
 
 
 WORK_DONE = '__WORK_DONE__'
@@ -533,7 +533,10 @@ def _get_links(self, elements, attribute, base_url_split,
                     continue
                 abs_url_split = get_absolute_url_split(url, base_url_split)
 
-                if abs_url_split.scheme not in SUPPORTED_SCHEMES:
+                if not is_supported_scheme(
+                        abs_url_split, self.worker_config.ignore_bad_tel_urls):
+                    print("CONTINUE")
+                    print(abs_url_split)
                     continue
 
                 link = Link(
diff --git a/pylinkvalidator/models.py b/pylinkvalidator/models.py
index bb54b50..84bfd5e 100644
--- a/pylinkvalidator/models.py
+++ b/pylinkvalidator/models.py
@@ -14,7 +14,9 @@
 
 from pylinkvalidator.included.bs4 import BeautifulSoup
 from pylinkvalidator.compat import get_safe_str
-from pylinkvalidator.urlutil import get_clean_url_split, get_absolute_url_split
+from pylinkvalidator.urlutil import (
+    get_clean_url_split, get_absolute_url_split)
+
 
 PREFIX_ALL = "*"
 
@@ -112,7 +114,7 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):
 WorkerConfig = namedtuple_with_defaults(
     "WorkerConfig",
     ["username", "password", "types", "timeout", "parser", "strict_mode",
-     "prefer_server_encoding", "extra_headers"])
+     "prefer_server_encoding", "extra_headers", "ignore_bad_tel_urls"])
 
 
 WorkerInput = namedtuple_with_defaults(
@@ -307,7 +309,8 @@ def _build_worker_config(self, options):
         return WorkerConfig(
             options.username, options.password, types, options.timeout,
             options.parser, options.strict_mode,
-            options.prefer_server_encoding, headers)
+            options.prefer_server_encoding, headers,
+            options.ignore_bad_tel_urls)
 
     def _build_accepted_hosts(self, options, start_urls):
         if options.multi:
@@ -462,6 +465,11 @@ def _build_parser(self):
             action="store", default=None,
             help="comma-separated list of host/path prefixes to ignore "
             "(e.g., www.example.com/ignore_this_and_after/)")
+        crawler_group.add_option(
+            "-b", "--ignore-bad-tel-urls", dest="ignore_bad_tel_urls",
+            action="store_true", default=False,
+            help="ignore badly formed tel URLs missing the leading + sign, "
+            "e.g., tel:1234567890")
         crawler_group.add_option(
             "-u", "--username", dest="username",
             action="store", default=None,
diff --git a/pylinkvalidator/testfiles/badtel.html b/pylinkvalidator/testfiles/badtel.html
new file mode 100644
index 0000000..a7ab22e
--- /dev/null
+++ b/pylinkvalidator/testfiles/badtel.html
@@ -0,0 +1,8 @@
+<html>
+    <body>
+        <p>Go to <a href="/a.html">next page</a>
+        <p>Go to <a href="tel:+1234567890">good tel link</a>
+        <p>Go to <a href="tel:1234567890">bad tel link</a>
+        </p>
+    </body>
+</html>
diff --git a/pylinkvalidator/tests.py b/pylinkvalidator/tests.py
index 8630a11..12341c6 100644
--- a/pylinkvalidator/tests.py
+++ b/pylinkvalidator/tests.py
@@ -478,3 +478,16 @@ def test_unicode(self):
         # 3 pages linked on the root (root, 0, 0b)
         self.assertEqual(2, len(site.pages))
         self.assertEqual(0, len(site.error_pages))
+
+    def test_bad_tel_link(self):
+        site = self._run_crawler_plain(
+            ThreadSiteCrawler, ["--ignore-bad-tel-urls"], "/badtel.html")
+        # root + one linked page. Both bad and good tel links are ignored.
+        self.assertEqual(2, len(site.pages))
+        self.assertEqual(0, len(site.error_pages))
+
+        site = self._run_crawler_plain(
+            ThreadSiteCrawler, [], "/badtel.html")
+        # root + one page + one bad tel link. One correct tel link ignored
+        self.assertEqual(3, len(site.pages))
+        self.assertEqual(1, len(site.error_pages))
diff --git a/pylinkvalidator/urlutil.py b/pylinkvalidator/urlutil.py
index b85b9c5..1ed1e0a 100644
--- a/pylinkvalidator/urlutil.py
+++ b/pylinkvalidator/urlutil.py
@@ -107,3 +107,22 @@ def is_similar_url_split(url_split_1, url_split_2):
     else:
         return url_split_1.path == url_split_2.path and\
             url_split_1.netloc == url_split_2.netloc
+
+
+def is_bad_tel_url_split(url_split):
+    """Returns True if the URL looks like a badly formed tel URL whose
+    scheme was not detected by urlparse ("tel:" ended up in netloc or path).
+    """
+    return url_split.netloc.startswith("tel:") or\
+        url_split.path.startswith("/tel:")
+
+
+def is_supported_scheme(url_split, ignore_bad_tel_urls=False):
+    """Returns True if the URL has a supported scheme and, when
+    ignore_bad_tel_urls is set, is not a badly formed tel URL (issue #16)."""
+    if url_split.scheme not in SUPPORTED_SCHEMES:
+        return False
+    elif ignore_bad_tel_urls and is_bad_tel_url_split(url_split):
+        # issue #16
+        return False
+    return True

From 7836af0daaccf41f681f1824b33d35b4b931b248 Mon Sep 17 00:00:00 2001
From: Barthelemy Dagenais <barthelemy@infobart.com>
Date: Sun, 22 Jan 2017 20:12:02 -0500
Subject: [PATCH 2/4] refs #16 - fix test to be compatible with Python 2.6

---
 README.rst                 |  3 ++-
 pylinkvalidator/crawler.py |  2 --
 pylinkvalidator/models.py  |  2 +-
 pylinkvalidator/tests.py   | 11 ++++++-----
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/README.rst b/README.rst
index a4635c9..c7a9730 100644
--- a/README.rst
+++ b/README.rst
@@ -107,7 +107,8 @@ usage examples.
                           (e.g., www.example.com/ignore_this_and_after/)
       -b, --ignore-bad-tel-urls
                           ignore badly formed tel URLs missing the leading +
-                          sign, e.g., tel:1234567890
+                          sign, e.g., tel:1234567890 - only necessary for Python
+                          > 2.6
       -u USERNAME, --username=USERNAME
                           Username to use with basic HTTP authentication
       -p PASSWORD, --password=PASSWORD
diff --git a/pylinkvalidator/crawler.py b/pylinkvalidator/crawler.py
index 4818814..73a0666 100644
--- a/pylinkvalidator/crawler.py
+++ b/pylinkvalidator/crawler.py
@@ -535,8 +535,6 @@ def _get_links(self, elements, attribute, base_url_split,
 
                 if not is_supported_scheme(
                         abs_url_split, self.worker_config.ignore_bad_tel_urls):
-                    print("CONTINUE")
-                    print(abs_url_split)
                     continue
 
                 link = Link(
diff --git a/pylinkvalidator/models.py b/pylinkvalidator/models.py
index 84bfd5e..228ef11 100644
--- a/pylinkvalidator/models.py
+++ b/pylinkvalidator/models.py
@@ -469,7 +469,7 @@ def _build_parser(self):
             "-b", "--ignore-bad-tel-urls", dest="ignore_bad_tel_urls",
             action="store_true", default=False,
             help="ignore badly formed tel URLs missing the leading + sign, "
-            "e.g., tel:1234567890")
+            "e.g., tel:1234567890 - only necessary for Python > 2.6")
         crawler_group.add_option(
             "-u", "--username", dest="username",
             action="store", default=None,
diff --git a/pylinkvalidator/tests.py b/pylinkvalidator/tests.py
index 12341c6..bab176b 100644
--- a/pylinkvalidator/tests.py
+++ b/pylinkvalidator/tests.py
@@ -486,8 +486,9 @@ def test_bad_tel_link(self):
         self.assertEqual(2, len(site.pages))
         self.assertEqual(0, len(site.error_pages))
 
-        site = self._run_crawler_plain(
-            ThreadSiteCrawler, [], "/badtel.html")
-        # root + one page + one bad tel link. One correct tel link ignored
-        self.assertEqual(3, len(site.pages))
-        self.assertEqual(1, len(site.error_pages))
+        if sys.version_info[:2] > (2, 6):
+            site = self._run_crawler_plain(
+                ThreadSiteCrawler, [], "/badtel.html")
+            # root + one page + one bad tel link. One correct tel link ignored
+            self.assertEqual(3, len(site.pages))
+            self.assertEqual(1, len(site.error_pages))

From 8fc33d503c470c7e79c6e260f86f6fa2ec0ffbb1 Mon Sep 17 00:00:00 2001
From: Muhammad Alif Akbar <alif.thetricky@gmail.com>
Date: Wed, 21 Mar 2018 18:30:45 +0700
Subject: [PATCH 3/4] handle space in path or url

---
 pylinkvalidator/tests.py   | 3 +++
 pylinkvalidator/urlutil.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/pylinkvalidator/tests.py b/pylinkvalidator/tests.py
index bab176b..9004865 100644
--- a/pylinkvalidator/tests.py
+++ b/pylinkvalidator/tests.py
@@ -128,6 +128,9 @@ def test_clean_url_split(self):
         self.assertEqual(
             "http://www.example.com/",
             get_clean_url_split("http://www.example.com/").geturl())
+        self.assertEqual(
+            "http://www.example.com/media%20gallery",
+            get_clean_url_split("http://www.example.com/media gallery").geturl())
 
     def test_get_absolute_url(self):
         base_url_split = get_clean_url_split(
diff --git a/pylinkvalidator/urlutil.py b/pylinkvalidator/urlutil.py
index 1ed1e0a..8590c0e 100644
--- a/pylinkvalidator/urlutil.py
+++ b/pylinkvalidator/urlutil.py
@@ -80,7 +80,7 @@ def url_encode_non_ascii(url_part):
     return re.sub(
         b'[\x80-\xFF]',
         lambda match: quote(match.group(0)).encode("utf-8"),
-        url_part.encode("utf-8")).decode("ascii")
+        url_part.encode("utf-8")).decode("ascii").replace(' ', '%20')  # encode spaces in url_part
 
 
 def get_absolute_url_split(url, base_url_split):

From 3a6943d563390f65c02ad8271eac863a8d82bf4e Mon Sep 17 00:00:00 2001
From: Muhammad Alif Akbar <alif.thetricky@gmail.com>
Date: Thu, 22 Mar 2018 11:26:43 +0700
Subject: [PATCH 4/4] remove --use-mirrors

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 56a1922..0a87815 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,6 @@ python:
   - "2.7"
   - "3.4"
 install:
-  - "pip install . --use-mirrors"
+  - "pip install ."
 script: nosetests
 sudo: false