#!/usr/bin/env ruby
# frozen_string_literal: true
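
# crawl.rb: a small multi-threaded link crawler / smoke test. It fetches every
# internal link reachable from a root URL using a pool of Mechanize workers,
# records per-page fetch times, and reports slow pages and overall throughput.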
require 'bundler/inline'
require 'etc' # Etc.nprocessors is used below to size the worker pool

gemfile do
  source 'https://rubygems.org'
  gem 'concurrent-ruby'
  gem 'mechanize'
end
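
# A Spider is one worker: a thread with its own Mechanize agent and a private
# queue of URLs handed to it by the SpiderNest. After each page it returns
# itself to the nest's idle pool via spider_ready.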
class Spider
  def initialize(nest, id)
    @id = id
    @nest = nest
    @queue = Queue.new

    Thread.start do
      url = nil
      referer = nil
      agent = Mechanize.new
      puts format('%s %s', Time.now.strftime('%T'), "[#{id}] READY")
      @nest.spider_ready(self, working: false)

      loop do
        level, url, referer = @queue.pop
        start_time = Time.now
        page = agent.get(url)
        duration = Time.now - start_time
        puts "Slow: #{duration.round(3)}s #{url}" if duration > 2
        @nest.visited[url] = duration
        @nest.visited[page.uri.path] = duration

        if page.is_a?(Mechanize::Page)
          page.links_with(href: /../).each do |link|
            # Skip links that trigger non-GET requests (e.g. Rails data-method="delete").
            unless link.attributes['data-method']
              href = link.href
              # Resolve relative links against the current page's path.
              unless href =~ %r{^(http|/)}
                href = "#{page.uri.path}#{'/' unless page.uri.path =~ %r{/$} || href =~ /^[#?]/}#{href}"
              end
              if href !~ /^http/ || URI(href).host == page.uri.host
                # Re-attach a leading numeric path segment (captured as $1) when the link lost it.
                href.prepend("/#{$1}") if href !~ %r{^/\d+/} && page.uri.path =~ %r{^(?:https?://[^/]+)?/(\d+)/}
                @nest.enqueue(href, level + 1, page.uri.to_s)
              elsif @nest.ignored.add?(href)
                puts "Ignore external link: #{href}"
              end
            end
          end
        else
          puts 'Found file.'
        end

        @nest.spider_ready(self)
      end
    rescue StandardError => e
      puts
      abort format('%-10s: %s', 'Error', "#{url.inspect}, #{e}, referer: #{referer}")
    end
  end

  def scan_url(level, url, referer)
    @queue.push([level, url, referer])
  end
end
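
# The SpiderNest owns all shared crawl state, normalizes and deduplicates URLs,
# and hands work to idle spiders until no URLs remain and no spider is busy.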
class SpiderNest
  attr_reader :ignored, :visited

  def self.scan(...)
    new(...).scan
  end

  def initialize(root:, max_urls:)
    @spiders = Queue.new           # idle spiders waiting for work
    @workers = Concurrent::Set.new # spiders currently fetching a page
    @spider_count = [Etc.nprocessors * 2, max_urls].min
    @spider_count.times { |i| Spider.new(self, i + 1) }
    @max_urls = max_urls
    @start_time = Time.now
    @urls = Queue.new              # URLs waiting to be assigned to a spider
    @found = Concurrent::Set.new   # normalized paths seen so far (deduplication)
    @visited = Concurrent::Map.new # URL/path => fetch duration
    @ignored = Concurrent::Set.new # external links already reported
    enqueue(root, 0, nil)
  end

  def spider_ready(spider, working: true)
    if working
      @workers.delete?(spider) or abort('Spider not working')
    end
    @spiders.push(spider)
  end

  def enqueue(url, level, referer)
    # Strip the lang parameter so language variants collapse to a single URL.
    url = url.sub(/[?&]lang=(iw|sv)/, '')
    return if @found.size >= @max_urls
    return if url =~ %r{/logout$|^/test}

    # Deduplicate on path plus query string so the same page is not crawled
    # twice under its absolute and relative forms.
    if url =~ /^http/
      uri = URI(url)
      path = uri.path
      path += uri.query if uri.query
    else
      path = url
    end
    return unless @found.add?(path)

    @urls << [url, level, referer]
  end

  def scan
    i = 0
    while !@urls.empty? || i < @found.size || @workers.any?
      begin
        url, level, referer = @urls.pop(true) # non-blocking; raises ThreadError when empty
        i += 1
        puts format('%s %-10s: %s', Time.now.strftime('%T'), "Visit [#{i}/#{@found.size}] (#{level})", url.inspect)
        spider = @spiders.pop # blocks until a spider is idle
        @workers.add?(spider) or abort('Spider already working.')
        spider.scan_url(level, url, referer)
      rescue ThreadError
        puts 'Wait for next URL...' if @workers.empty?
        sleep 0.1
      end
    end
    @spider_count.times { @spiders.pop } # Wait for every spider to finish its final page.
    @visited.size
  end
end
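
# When run directly, the script crawls a local development server with the
# defaults below (root URL and page cap); adjust them for your own setup, e.g.:
#
#   ./crawl.rb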
if __FILE__ == $PROGRAM_NAME
  start = Time.now
  $stdout.sync = true
  count = SpiderNest.scan(root: 'http://localhost:8080/', max_urls: 2000)
  duration = Time.now - start
  puts "Total time: #{duration.round}s. #{count} pages. #{(duration / count).round(3)} s/page, #{(count / duration).round(3)} pages/s"
  abort('Found no links.') if count < 2
end