# example.py
# Forked from petterw/crawler.
"""
An example invocation of the crawler
"""
import sys, getopt, time
from datetime import timedelta
from crawler import Crawler, resume
import cstats
def crawl(c=None, seed=None):
    """ Run a crawl, then suspend it and print some statistics about it.

    c    -- an already constructed crawler to continue with (main() passes
            the result of resume() here). When None, a new Crawler is
            built from seed.
    seed -- seed urls for a new crawl. Only used when c is None; defaults
            to no seed urls.

    The suspend step and the statistics run in a finally clause, so they
    also happen when the crawl is interrupted or raises; the exception
    then continues to propagate to the caller.
    """
    if c is None:
        # Create the empty list per call instead of using a mutable default
        # argument, which would be one list object shared by every call.
        if seed is None:
            seed = []
        c = Crawler(
            seed=seed,  # your seed urls here
            default_crawl_delay=20,
            obey_robots_txt=True,
            document_fetchers=15,
            robots_txt_fetchers=5)  # start at least this many celery workers
    try:
        # start crawling, with this tasks specific termination criteria and
        # a save period of 20 seconds
        c.crawl(
            termination_checker=example_task_termination_checker,
            save_frequency=timedelta(seconds=20))
    finally:
        # if we were killed or finished, suspend crawl state to file.
        # revive the crawl with resume from crawler.py to explore results
        #
        # Single-argument print(...) emits the same text under the Python 2
        # print statement and the Python 3 print function.
        print("\nSuspended crawl to " + c.suspend())
        # print some statistics
        print("Downloaded bytes: " + str(cstats.downloaded_bytes(c)))
        print("Discovered links: " + str(cstats.discovered_links(c)))
        print("Discovered domains: " + str(cstats.discovered_domains(c)))
        print("Runtime: " + str(cstats.runtime(c)) + " seconds")
        maxref = cstats.most_prolific_referer(c)
        # utf-8 printing problem in domain?
        print(
            "Most prolific referrer was " + maxref["name"]
            + " with an average of " + str(maxref["avg_links_per_page"])
            + " outgoing links per page." + "\n")
def example_task_termination_checker(crawler):
    """ Tell the crawl whether this task's stop criteria have been met.

    The crawl is considered done once cstats reports at least 10000
    discovered links or at least 100 discovered domains for the crawler.

    The check only runs once per pass of the crawl management loop, so
    overshooting either limit by a small number of items is expected.
    """
    # Links are checked first; the domain count is only consulted when the
    # link limit has not been reached yet.
    if cstats.discovered_links(crawler) >= 10000:
        return True
    return cstats.discovered_domains(crawler) >= 100
def main(argv=None):
    """ Command line entry point: resume a suspended crawl or start a new one.

    argv -- argument list including the program name at index 0; defaults
            to sys.argv.

    Recognised invocations:
      --resume FILE       exactly one file argument, whose name must contain
                          "suspended_crawl"; the crawl is restored with
                          resume() and continued.
      --seed URL [URL...] start a new crawl seeded with the given urls.

    Any other invocation prints the usage message. Previously, unrecognised
    invocations with three or more arguments did nothing at all.
    Returns None in every case.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) >= 3:
        option = argv[1]
        # resume crawl: only valid with a single file argument
        if (option == "--resume" and len(argv) == 3
                and "suspended_crawl" in argv[2]):
            print("Resuming crawl from " + argv[2])
            crawl(resume(argv[2]))
            return
        # start new crawl: every remaining argument is a seed url
        if option == "--seed":
            print("Starting new crawl with " + str(argv[2:]))
            crawl(c=None, seed=argv[2:])
            return

    # help: too few arguments, unknown option, or a malformed --resume
    print("Invoke this script with either --seed url1 url2 ... or --resume a_suspended_crawl_file")
if __name__ == "__main__":
    # Only run when executed as a script; whatever main() returns is handed
    # to sys.exit() as the exit status.
    exit_status = main()
    sys.exit(exit_status)