Add excluded_urls #9

Open · wants to merge 4 commits into master · changes shown from 3 commits
3 changes: 3 additions & 0 deletions README.rst
@@ -102,6 +102,9 @@ usage examples.
-H ACCEPTED_HOSTS, --accepted-hosts=ACCEPTED_HOSTS
Comma-separated list of additional hosts to crawl
(e.g., example.com,subdomain.another.com)
-x EXCLUDED_URLS, --exclude=EXCLUDED_URLS
URLs matching the regular expression will be ignored
(e.g., /private/)
-i IGNORED_PREFIXES, --ignore=IGNORED_PREFIXES
Comma-separated list of host/path prefixes to ignore
(e.g., www.example.com/ignore_this_and_after/)
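
For context, a hypothetical invocation combining the new option with a start URL (the pylinkvalidate.py entry-point name is an assumption here, not part of this diff):

    pylinkvalidate.py -x "/private/,/logout" http://www.example.com/

This would skip every URL whose text matches either regular expression.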
21 changes: 18 additions & 3 deletions pylinkvalidator/models.py
@@ -12,7 +12,7 @@
from optparse import OptionParser, OptionGroup

from pylinkvalidator.compat import get_safe_str
from pylinkvalidator.urlutil import get_clean_url_split
from pylinkvalidator.urlutil import get_clean_url_split, re


DEFAULT_TYPES = ['a', 'img', 'script', 'link']
@@ -148,6 +148,7 @@ def __init__(self):
self.worker_config = None
self.accepted_hosts = []
self.ignored_prefixes = []
self.excluded_urls = []
self.worker_size = 0

def should_crawl(self, url_split, depth):
@@ -160,15 +161,21 @@ def is_local(self, url_split):
return url_split.netloc in self.accepted_hosts

def should_download(self, url_split):
"""Returns True if the url does not start with an ignored prefix and if
it is local or outside links are allowed."""
"""Returns True if the url does not start with
* an ignored prefix
* it does not match excluded url regex
* if it is local or outside links are allowed."""
local = self.is_local(url_split)

if not self.options.test_outside and not local:
return False

url = url_split.geturl()

for exclude_url in self.excluded_urls:
if re.search(exclude_url, url):
Owner: if you go with the optimization noted below:

    if exclude_url.search(url):
return False

for ignored_prefix in self.ignored_prefixes:
if url.startswith(ignored_prefix):
return False
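
To illustrate the matching semantics, here is a minimal standalone sketch of the exclusion check (not pylinkvalidator's actual module; the pattern list is made up):

    import re

    excluded_urls = ["/private/", r"\.pdf$"]

    def is_excluded(url):
        # re.search scans the whole URL string, so "/private/" matches
        # anywhere in it, not only at the start of the path.
        return any(re.search(pattern, url) for pattern in excluded_urls)

    assert is_excluded("http://example.com/private/page.html")
    assert is_excluded("http://example.com/docs/manual.pdf")
    assert not is_excluded("http://example.com/public/page.html")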
@@ -207,6 +214,9 @@ def _parse_config(self):
if self.options.ignored_prefixes:
self.ignored_prefixes = self.options.ignored_prefixes.split(',')

if self.options.excluded_urls:
self.excluded_urls = self.options.excluded_urls.split(',')
Owner: slight optimization to prevent compiling the regex for every url:

    self.excluded_urls = [re.compile(pattern)
                          for pattern in self.options.excluded_urls.split(',')]
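
A standalone sketch of that precompile suggestion, with a made-up option string (note that splitting on ',' means an individual pattern cannot itself contain a comma, e.g. a quantifier like {2,3}):

    import re

    raw_option = "/private/,/logout"  # as passed via -x/--exclude
    excluded_urls = [re.compile(pattern) for pattern in raw_option.split(',')]

    def is_excluded(url):
        # compiled patterns avoid re-parsing the regex for every URL
        return any(pattern.search(url) for pattern in excluded_urls)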


if self.options.workers:
self.worker_size = self.options.workers
else:
@@ -274,6 +284,11 @@ def _build_parser(self):
dest="accepted_hosts", action="store", default=None,
help="comma-separated list of additional hosts to crawl (e.g., "
"example.com,subdomain.another.com)")
crawler_group.add_option(
"-x", "--exclude", dest="excluded_urls",
action="store", default=None,
help="URLs matching the regular expression will be ignored"
"(e.g., /private/)")
crawler_group.add_option(
"-i", "--ignore", dest="ignored_prefixes",
action="store", default=None,