Add excluded_urls option (bartdag#7)
Jim Priest committed Sep 1, 2015
1 parent 6007083 · commit 2e7f03b
Showing 2 changed files with 22 additions and 6 deletions.
README.rst (3 additions, 0 deletions)
@@ -102,6 +102,9 @@ usage examples.
-H ACCEPTED_HOSTS, --accepted-hosts=ACCEPTED_HOSTS
Comma-separated list of additional hosts to crawl
(e.g., example.com,subdomain.another.com)
-x EXCLUDED_URLS, --exclude=EXCLUDED_URLS
URLs matching the regular expression will be ignored
(e.g., /private/)
-i IGNORED_PREFIXES, --ignore=IGNORED_PREFIXES
Comma-separated list of host/path prefixes to ignore
(e.g., www.example.com/ignore_this_and_after/)
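A rough usage sketch, assuming the console script is invoked as pylinkvalidate.py and that several patterns may be joined with commas (the parsing code in models.py below splits the option value on ','):

    pylinkvalidate.py -x /private/,logout http://www.example.com/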
pylinkvalidator/models.py (19 additions, 6 deletions)
@@ -12,7 +12,7 @@
from optparse import OptionParser, OptionGroup

from pylinkvalidator.compat import get_safe_str
from pylinkvalidator.urlutil import get_clean_url_split
from pylinkvalidator.urlutil import get_clean_url_split, re


DEFAULT_TYPES = ['a', 'img', 'script', 'link']
@@ -150,6 +150,7 @@ def __init__(self):
self.worker_config = None
self.accepted_hosts = []
self.ignored_prefixes = []
self.excluded_urls = []
self.worker_size = 0

def should_crawl(self, url_split, depth):
@@ -162,21 +163,28 @@ def is_local(self, url_split):
return url_split.netloc in self.accepted_hosts

def should_download(self, url_split):
"""Returns True if the url does not start with an ignored prefix and if
it is local or outside links are allowed."""
"""Returns True if the url does not start with
* an ignored prefix
* it does not match excluded url regex
* if it is local or outside links are allowed."""
local = self.is_local(url_split)

if not self.options.test_outside and not local:
return False

url = url_split.geturl()

for exclude_url in self.excluded_urls:
if re.search(exclude_url, url):
return False

for ignored_prefix in self.ignored_prefixes:
if url.startswith(ignored_prefix):
return False

return True


def parse_cli_config(self):
"""Builds the options and args based on the command line options."""
(self.options, self.start_urls) = self.parser.parse_args()
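The exclusion check added to should_download above can be exercised on its own. A minimal sketch with made-up URLs and patterns (re.search matches anywhere in the URL, so patterns are unanchored unless written with ^ or $):

    import re

    excluded_urls = ["/private/", r"\.pdf$"]  # hypothetical patterns, as given to -x

    def is_excluded(url, patterns):
        """Mirror the loop in should_download: any match means the URL is skipped."""
        return any(re.search(pattern, url) for pattern in patterns)

    print(is_excluded("http://example.com/private/page.html", excluded_urls))  # True
    print(is_excluded("http://example.com/files/report.pdf", excluded_urls))   # True
    print(is_excluded("http://example.com/index.html", excluded_urls))         # False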
@@ -209,6 +217,9 @@ def _parse_config(self):
if self.options.ignored_prefixes:
self.ignored_prefixes = self.options.ignored_prefixes.split(',')

if self.options.excluded_urls:
self.excluded_urls = self.options.excluded_urls.split(',')

if self.options.workers:
self.worker_size = self.options.workers
else:
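Because the option value is split on commas, one -x value can carry several patterns, though a regex that itself contains a comma cannot be passed this way. A small illustration with a hypothetical option value:

    excluded_urls_option = r"/private/,\.pdf$,logout"  # hypothetical -x/--exclude value
    excluded_urls = excluded_urls_option.split(',')
    print(excluded_urls)  # ['/private/', '\\.pdf$', 'logout']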
@@ -276,6 +287,11 @@ def _build_parser(self):
dest="accepted_hosts", action="store", default=None,
help="comma-separated list of additional hosts to crawl (e.g., "
"example.com,subdomain.another.com)")
crawler_group.add_option(
"-x", "--exclude", dest="excluded_urls",
action="store", default=None,
help="URLs matching the regular expression will be ignored"
"(e.g., /private/)")
crawler_group.add_option(
"-i", "--ignore", dest="ignored_prefixes",
action="store", default=None,
@@ -294,8 +310,6 @@ def _build_parser(self):
dest="headers", action="append", metavar="HEADER",
help="custom header of the form Header: Value "
"(repeat for multiple headers)")
# crawler_group.add_option("-U", "--unique", dest="unique",
# action="store_true", default=False)
crawler_group.add_option(
"-t", "--types", dest="types", action="store",
default=",".join(DEFAULT_TYPES),
@@ -325,7 +339,6 @@ def _build_parser(self):
"-e", "--prefer-server-encoding", dest="prefer_server_encoding",
action="store_true", default=False,
help="Prefer server encoding if specified. Else detect encoding")
# TODO Add follow redirect option.

parser.add_option_group(crawler_group)

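The -x/--exclude registration above is a plain optparse string option: the comma-joined pattern list arrives as a single string on options.excluded_urls and is only split later in _parse_config. A stripped-down sketch of the same registration and parse (the parser here is simplified relative to the real _build_parser):

    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option(
        "-x", "--exclude", dest="excluded_urls",
        action="store", default=None,
        help="URLs matching the regular expression will be ignored "
             "(e.g., /private/)")

    (options, args) = parser.parse_args(["-x", "/private/,logout", "http://www.example.com/"])
    print(options.excluded_urls)  # /private/,logout
    print(args)                   # ['http://www.example.com/']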
