Add excluded_urls option (bartdag#7)
Jim Priest committed Sep 1, 2015
1 parent 6007083 · commit 2e7f03b
Showing 2 changed files with 22 additions and 6 deletions.
README.rst (3 additions, 0 deletions)
@@ -102,6 +102,9 @@ usage examples.
-H ACCEPTED_HOSTS, --accepted-hosts=ACCEPTED_HOSTS
Comma-separated list of additional hosts to crawl
(e.g., example.com,subdomain.another.com)
-x EXCLUDED_URLS, --exclude=EXCLUDED_URLS
URLs matching the regular expression will be ignored
(e.g., /private/)
-i IGNORED_PREFIXES, --ignore=IGNORED_PREFIXES
Comma-separated list of host/path prefixes to ignore
(e.g., www.example.com/ignore_this_and_after/)
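A rough usage sketch, assuming the console script is invoked as pylinkvalidate.py and that several patterns may be joined with commas (the parsing code in models.py below splits the option value on ','):

    pylinkvalidate.py -x /private/,logout http://www.example.com/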
pylinkvalidator/models.py (19 additions, 6 deletions)
@@ -12,7 +12,7 @@
from optparse import OptionParser, OptionGroup

from pylinkvalidator.compat import get_safe_str
from pylinkvalidator.urlutil import get_clean_url_split
from pylinkvalidator.urlutil import get_clean_url_split, re


DEFAULT_TYPES = ['a', 'img', 'script', 'link']
@@ -150,6 +150,7 @@ def __init__(self):
self.worker_config = None
self.accepted_hosts = []
self.ignored_prefixes = []
self.excluded_urls = []
self.worker_size = 0

def should_crawl(self, url_split, depth):
@@ -162,21 +163,28 @@ def is_local(self, url_split):
return url_split.netloc in self.accepted_hosts

def should_download(self, url_split):
"""Returns True if the url does not start with an ignored prefix and if
it is local or outside links are allowed."""
"""Returns True if the url does not start with
* an ignored prefix
* it does not match excluded url regex
* if it is local or outside links are allowed."""
local = self.is_local(url_split)

if not self.options.test_outside and not local:
return False

url = url_split.geturl()

for exclude_url in self.excluded_urls:
if re.search(exclude_url, url):
return False

for ignored_prefix in self.ignored_prefixes:
if url.startswith(ignored_prefix):
return False

return True


def parse_cli_config(self):
"""Builds the options and args based on the command line options."""
(self.options, self.start_urls) = self.parser.parse_args()
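The exclusion check added to should_download above can be exercised on its own. A minimal sketch with made-up URLs and patterns (re.search matches anywhere in the URL, so patterns are unanchored unless written with ^ or $):

    import re

    excluded_urls = ["/private/", r"\.pdf$"]  # hypothetical patterns, as given to -x

    def is_excluded(url, patterns):
        """Mirror the loop in should_download: any match means the URL is skipped."""
        return any(re.search(pattern, url) for pattern in patterns)

    print(is_excluded("http://example.com/private/page.html", excluded_urls))  # True
    print(is_excluded("http://example.com/files/report.pdf", excluded_urls))   # True
    print(is_excluded("http://example.com/index.html", excluded_urls))         # False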
@@ -209,6 +217,9 @@ def _parse_config(self):
if self.options.ignored_prefixes:
self.ignored_prefixes = self.options.ignored_prefixes.split(',')

if self.options.excluded_urls:
self.excluded_urls = self.options.excluded_urls.split(',')

if self.options.workers:
self.worker_size = self.options.workers
else:
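Because the option value is split on commas, one -x value can carry several patterns, though a regex that itself contains a comma cannot be passed this way. A small illustration with a hypothetical option value:

    excluded_urls_option = r"/private/,\.pdf$,logout"  # hypothetical -x/--exclude value
    excluded_urls = excluded_urls_option.split(',')
    print(excluded_urls)  # ['/private/', '\\.pdf$', 'logout']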
@@ -276,6 +287,11 @@ def _build_parser(self):
dest="accepted_hosts", action="store", default=None,
help="comma-separated list of additional hosts to crawl (e.g., "
"example.com,subdomain.another.com)")
crawler_group.add_option(
"-x", "--exclude", dest="excluded_urls",
action="store", default=None,
help="URLs matching the regular expression will be ignored"
"(e.g., /private/)")
crawler_group.add_option(
"-i", "--ignore", dest="ignored_prefixes",
action="store", default=None,
@@ -294,8 +310,6 @@ def _build_parser(self):
dest="headers", action="append", metavar="HEADER",
help="custom header of the form Header: Value "
"(repeat for multiple headers)")
# crawler_group.add_option("-U", "--unique", dest="unique",
# action="store_true", default=False)
crawler_group.add_option(
"-t", "--types", dest="types", action="store",
default=",".join(DEFAULT_TYPES),
@@ -325,7 +339,6 @@ def _build_parser(self):
"-e", "--prefer-server-encoding", dest="prefer_server_encoding",
action="store_true", default=False,
help="Prefer server encoding if specified. Else detect encoding")
# TODO Add follow redirect option.

parser.add_option_group(crawler_group)

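The -x/--exclude registration above is a plain optparse string option: the comma-joined pattern list arrives as a single string on options.excluded_urls and is only split later in _parse_config. A stripped-down sketch of the same registration and parse (the parser here is simplified relative to the real _build_parser):

    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option(
        "-x", "--exclude", dest="excluded_urls",
        action="store", default=None,
        help="URLs matching the regular expression will be ignored "
             "(e.g., /private/)")

    (options, args) = parser.parse_args(["-x", "/private/,logout", "http://www.example.com/"])
    print(options.excluded_urls)  # /private/,logout
    print(args)                   # ['http://www.example.com/']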
