Add excluded_urls #9

Open · wants to merge 4 commits into master
3 changes: 3 additions & 0 deletions README.rst
@@ -102,6 +102,9 @@ usage examples.
-H ACCEPTED_HOSTS, --accepted-hosts=ACCEPTED_HOSTS
Comma-separated list of additional hosts to crawl
(e.g., example.com,subdomain.another.com)
-x EXCLUDED_URLS, --exclude=EXCLUDED_URLS
Comma-separated list of regular expressions; URLs
matching any of them will be ignored (e.g., /private/)
-i IGNORED_PREFIXES, --ignore=IGNORED_PREFIXES
Comma-separated list of host/path prefixes to ignore
(e.g., www.example.com/ignore_this_and_after/)
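For example, assuming the project's pylinkvalidate.py entry point (the URL and pattern below are illustrative):

    pylinkvalidate.py -x /private/ http://www.example.com/

Several patterns can be combined as a comma-separated list, e.g. --exclude="/private/,\.pdf$".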
22 changes: 19 additions & 3 deletions pylinkvalidator/models.py
@@ -12,7 +12,7 @@
from optparse import OptionParser, OptionGroup

from pylinkvalidator.compat import get_safe_str
from pylinkvalidator.urlutil import get_clean_url_split
import re


DEFAULT_TYPES = ['a', 'img', 'script', 'link']
@@ -148,6 +148,7 @@ def __init__(self):
self.worker_config = None
self.accepted_hosts = []
self.ignored_prefixes = []
self.excluded_urls = []
self.worker_size = 0

def should_crawl(self, url_split, depth):
@@ -160,15 +161,21 @@ def is_local(self, url_split):
return url_split.netloc in self.accepted_hosts

def should_download(self, url_split):
"""Returns True if the url does not start with an ignored prefix and if
it is local or outside links are allowed."""
"""Returns True if the url does not start with
* an ignored prefix
* it does not match excluded url regex
* if it is local or outside links are allowed."""
local = self.is_local(url_split)

if not self.options.test_outside and not local:
return False

url = url_split.geturl()

for exclude_url in self.excluded_urls:
if exclude_url.search(url):
return False

for ignored_prefix in self.ignored_prefixes:
if url.startswith(ignored_prefix):
return False
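As a quick illustration of the new check, here is a minimal standalone sketch of the exclusion logic above (the pattern list and URLs are made up; the real code takes them from --exclude and the crawl queue):

    import re

    # Compiled from the comma-separated --exclude value, as in _parse_config.
    excluded_urls = [re.compile(p) for p in [r"/private/", r"\.pdf$"]]

    def is_excluded(url):
        # search() matches anywhere in the full URL (scheme and host
        # included), since should_download tests url_split.geturl().
        return any(pattern.search(url) for pattern in excluded_urls)

    assert not is_excluded("http://example.com/public/index.html")
    assert is_excluded("http://example.com/private/index.html")
    assert is_excluded("http://example.com/report.pdf")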
@@ -207,6 +214,10 @@ def _parse_config(self):
if self.options.ignored_prefixes:
self.ignored_prefixes = self.options.ignored_prefixes.split(',')

if self.options.excluded_urls:
self.excluded_urls = [
    re.compile(pattern)
    for pattern in self.options.excluded_urls.split(',')]

if self.options.workers:
self.worker_size = self.options.workers
else:
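One behavior of the split-then-compile approach worth flagging in review: because the value is split on every comma before compilation, a single regex that itself contains a comma (e.g. a repetition count like a{2,3}) cannot be passed through --exclude. A small illustration, with made-up patterns:

    import re

    # Two independent patterns round-trip fine.
    patterns = [re.compile(p) for p in r"/private/,\.tmp$".split(",")]
    assert len(patterns) == 2

    # A repetition count is split into two halves; each half still compiles
    # (Python treats the unmatched braces literally), so the intended
    # pattern silently never matches.
    halves = r"a{2,3}".split(",")
    assert halves == ["a{2", "3}"]
    assert re.compile(halves[0]).search("xa{2y") is not None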
@@ -274,6 +285,11 @@ def _build_parser(self):
dest="accepted_hosts", action="store", default=None,
help="comma-separated list of additional hosts to crawl (e.g., "
"example.com,subdomain.another.com)")
crawler_group.add_option(
"-x", "--exclude", dest="excluded_urls",
action="store", default=None,
help="URLs matching the regular expression will be ignored"
"(e.g., /private/)")
crawler_group.add_option(
"-i", "--ignore", dest="ignored_prefixes",
action="store", default=None,
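The new -x/--exclude option is a plain optparse string option, so the raw comma-separated value arrives on options.excluded_urls untouched; a self-contained sketch of that round trip (the test value is illustrative):

    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("-x", "--exclude", dest="excluded_urls",
                      action="store", default=None)

    options, args = parser.parse_args(["-x", r"/private/,\.pdf$"])
    # optparse stores the raw string; splitting and compiling happen
    # later, in _parse_config.
    assert options.excluded_urls == r"/private/,\.pdf$"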
7 changes: 7 additions & 0 deletions pylinkvalidator/tests.py
@@ -331,6 +331,13 @@ def test_run_once(self):
self.assertEqual(8, len(site.pages))
self.assertEqual(0, len(site.error_pages))

def test_exclude(self):
site = self._run_crawler_plain(ThreadSiteCrawler, ["--exclude=/sub/"])

# excluding the /sub/ directory leaves the 4 pages linked from the index
self.assertEqual(4, len(site.pages))
self.assertEqual(0, len(site.error_pages))

def test_depth_0(self):
site = self._run_crawler_plain(
ThreadSiteCrawler, ["--depth", "0"], "/depth/root.html")