Add excluded_urls #9

Open · wants to merge 4 commits into master
3 changes: 3 additions & 0 deletions README.rst
@@ -102,6 +102,9 @@ usage examples.
-H ACCEPTED_HOSTS, --accepted-hosts=ACCEPTED_HOSTS
Comma-separated list of additional hosts to crawl
(e.g., example.com,subdomain.another.com)
-x EXCLUDED_URLS, --exclude=EXCLUDED_URLS
Comma-separated list of regular expressions; URLs
matching any of them will be ignored (e.g., /private/)
-i IGNORED_PREFIXES, --ignore=IGNORED_PREFIXES
Comma-separated list of host/path prefixes to ignore
(e.g., www.example.com/ignore_this_and_after/)
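For example, assuming the project's pylinkvalidate.py entry point (the URL and pattern below are illustrative):

    pylinkvalidate.py -x /private/ http://www.example.com/

Several patterns can be combined as a comma-separated list, e.g. --exclude="/private/,\.pdf$".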
22 changes: 19 additions & 3 deletions pylinkvalidator/models.py
@@ -12,7 +12,7 @@
from optparse import OptionParser, OptionGroup

from pylinkvalidator.compat import get_safe_str
from pylinkvalidator.urlutil import get_clean_url_split
import re


DEFAULT_TYPES = ['a', 'img', 'script', 'link']
@@ -148,6 +148,7 @@ def __init__(self):
self.worker_config = None
self.accepted_hosts = []
self.ignored_prefixes = []
self.excluded_urls = []
self.worker_size = 0

def should_crawl(self, url_split, depth):
@@ -160,15 +161,21 @@ def is_local(self, url_split):
return url_split.netloc in self.accepted_hosts

def should_download(self, url_split):
"""Returns True if the url does not start with an ignored prefix and if
it is local or outside links are allowed."""
"""Returns True if the url does not start with
* an ignored prefix
* it does not match excluded url regex
* if it is local or outside links are allowed."""
local = self.is_local(url_split)

if not self.options.test_outside and not local:
return False

url = url_split.geturl()

for exclude_url in self.excluded_urls:
if exclude_url.search(url):
return False

for ignored_prefix in self.ignored_prefixes:
if url.startswith(ignored_prefix):
return False
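As a quick illustration of the new check, here is a minimal standalone sketch of the exclusion logic above (the pattern list and URLs are made up; the real code takes them from --exclude and the crawl queue):

    import re

    # Compiled from the comma-separated --exclude value, as in _parse_config.
    excluded_urls = [re.compile(p) for p in [r"/private/", r"\.pdf$"]]

    def is_excluded(url):
        # search() matches anywhere in the full URL (scheme and host
        # included), since should_download tests url_split.geturl().
        return any(pattern.search(url) for pattern in excluded_urls)

    assert not is_excluded("http://example.com/public/index.html")
    assert is_excluded("http://example.com/private/index.html")
    assert is_excluded("http://example.com/report.pdf")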
@@ -207,6 +214,10 @@ def _parse_config(self):
if self.options.ignored_prefixes:
self.ignored_prefixes = self.options.ignored_prefixes.split(',')

if self.options.excluded_urls:
self.excluded_urls = [
    re.compile(pattern)
    for pattern in self.options.excluded_urls.split(',')]

if self.options.workers:
self.worker_size = self.options.workers
else:
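One behavior of the split-then-compile approach worth flagging in review: because the value is split on every comma before compilation, a single regex that itself contains a comma (e.g. a repetition count like a{2,3}) cannot be passed through --exclude. A small illustration, with made-up patterns:

    import re

    # Two independent patterns round-trip fine.
    patterns = [re.compile(p) for p in r"/private/,\.tmp$".split(",")]
    assert len(patterns) == 2

    # A repetition count is split into two halves; each half still compiles
    # (Python treats the unmatched braces literally), so the intended
    # pattern silently never matches.
    halves = r"a{2,3}".split(",")
    assert halves == ["a{2", "3}"]
    assert re.compile(halves[0]).search("xa{2y") is not None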
@@ -274,6 +285,11 @@ def _build_parser(self):
dest="accepted_hosts", action="store", default=None,
help="comma-separated list of additional hosts to crawl (e.g., "
"example.com,subdomain.another.com)")
crawler_group.add_option(
"-x", "--exclude", dest="excluded_urls",
action="store", default=None,
help="URLs matching the regular expression will be ignored"
"(e.g., /private/)")
crawler_group.add_option(
"-i", "--ignore", dest="ignored_prefixes",
action="store", default=None,
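The new -x/--exclude option is a plain optparse string option, so the raw comma-separated value arrives on options.excluded_urls untouched; a self-contained sketch of that round trip (the test value is illustrative):

    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("-x", "--exclude", dest="excluded_urls",
                      action="store", default=None)

    options, args = parser.parse_args(["-x", r"/private/,\.pdf$"])
    # optparse stores the raw string; splitting and compiling happen
    # later, in _parse_config.
    assert options.excluded_urls == r"/private/,\.pdf$"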
7 changes: 7 additions & 0 deletions pylinkvalidator/tests.py
@@ -331,6 +331,13 @@ def test_run_once(self):
self.assertEqual(8, len(site.pages))
self.assertEqual(0, len(site.error_pages))

def test_exclude(self):
site = self._run_crawler_plain(ThreadSiteCrawler, ["--exclude=/sub/"])

# excluding the /sub/ directory leaves the 4 pages linked from the index
self.assertEqual(4, len(site.pages))
self.assertEqual(0, len(site.error_pages))

def test_depth_0(self):
site = self._run_crawler_plain(
ThreadSiteCrawler, ["--depth", "0"], "/depth/root.html")