diff --git a/README.rst b/README.rst
index 12965f8..5d1b776 100644
--- a/README.rst
+++ b/README.rst
@@ -102,6 +102,9 @@ usage examples.
   -H ACCEPTED_HOSTS, --accepted-hosts=ACCEPTED_HOSTS
                         Comma-separated list of additional hosts to crawl
                         (e.g., example.com,subdomain.another.com)
+  -x EXCLUDED_URLS, --exclude=EXCLUDED_URLS
+                        Comma-separated list of regular expressions; URLs
+                        matching any of them will be ignored (e.g., /private/)
   -i IGNORED_PREFIXES, --ignore=IGNORED_PREFIXES
                         Comma-separated list of host/path prefixes to ignore
                         (e.g., www.example.com/ignore_this_and_after/)
diff --git a/pylinkvalidator/models.py b/pylinkvalidator/models.py
index 54e33a9..db32dd5 100644
--- a/pylinkvalidator/models.py
+++ b/pylinkvalidator/models.py
@@ -12,7 +12,8 @@
+import re
 from optparse import OptionParser, OptionGroup
 
 from pylinkvalidator.compat import get_safe_str
 from pylinkvalidator.urlutil import get_clean_url_split
 
 
 DEFAULT_TYPES = ['a', 'img', 'script', 'link']
@@ -148,6 +149,7 @@ def __init__(self):
         self.worker_config = None
         self.accepted_hosts = []
         self.ignored_prefixes = []
+        self.excluded_urls = []
         self.worker_size = 0
 
     def should_crawl(self, url_split, depth):
@@ -160,8 +162,9 @@ def is_local(self, url_split):
         return url_split.netloc in self.accepted_hosts
 
     def should_download(self, url_split):
-        """Returns True if the url does not start with an ignored prefix and if
-        it is local or outside links are allowed."""
+        """Returns True if the url does not start with an ignored prefix,
+        does not match an excluded url regex, and is local or outside
+        links are allowed."""
         local = self.is_local(url_split)
 
         if not self.options.test_outside and not local:
@@ -169,6 +172,10 @@ def should_download(self, url_split):
 
         url = url_split.geturl()
 
+        for exclude_url in self.excluded_urls:
+            if exclude_url.search(url):
+                return False
+
         for ignored_prefix in self.ignored_prefixes:
             if url.startswith(ignored_prefix):
                 return False
@@ -207,6 +214,10 @@ def _parse_config(self):
         if self.options.ignored_prefixes:
             self.ignored_prefixes = self.options.ignored_prefixes.split(',')
 
+        if self.options.excluded_urls:
+            self.excluded_urls = [re.compile(pattern) for pattern
+                                  in self.options.excluded_urls.split(',')]
+
         if self.options.workers:
             self.worker_size = self.options.workers
         else:
@@ -274,6 +285,11 @@ def _build_parser(self):
             dest="accepted_hosts", action="store", default=None,
             help="comma-separated list of additional hosts to crawl (e.g., "
             "example.com,subdomain.another.com)")
+        crawler_group.add_option(
+            "-x", "--exclude", dest="excluded_urls",
+            action="store", default=None,
+            help="comma-separated list of regular expressions; URLs matching "
+            "any of them will be ignored (e.g., /private/)")
         crawler_group.add_option(
             "-i", "--ignore", dest="ignored_prefixes",
             action="store", default=None,
diff --git a/pylinkvalidator/tests.py b/pylinkvalidator/tests.py
index 945d3f0..6435596 100644
--- a/pylinkvalidator/tests.py
+++ b/pylinkvalidator/tests.py
@@ -331,6 +331,13 @@ def test_run_once(self):
         self.assertEqual(8, len(site.pages))
         self.assertEqual(0, len(site.error_pages))
 
+    def test_exclude(self):
+        site = self._run_crawler_plain(ThreadSiteCrawler, ["--exclude=/sub/"])
+
+        # with /sub/ excluded, only the 4 pages linked from the index remain
+        self.assertEqual(4, len(site.pages))
+        self.assertEqual(0, len(site.error_pages))
+
     def test_depth_0(self):
         site = self._run_crawler_plain(
             ThreadSiteCrawler, ["--depth", "0"], "/depth/root.html")
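
Below is a minimal standalone sketch of the behaviour this patch adds, assuming the --exclude value is split on commas and each part compiled with re.compile as in _parse_config above. The should_download function here is an illustrative stand-in for Config.should_download, not part of pylinkvalidator's API::

    import re

    # Hypothetical --exclude value; "/private/" is the example used in the patch.
    excluded_option = "/private/"
    excluded_urls = [re.compile(pattern) for pattern in excluded_option.split(',')]

    def should_download(url):
        """Return False for any URL matching one of the excluded patterns."""
        return not any(pattern.search(url) for pattern in excluded_urls)

    print(should_download("http://example.com/about.html"))          # True
    print(should_download("http://example.com/private/notes.html"))  # False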