forked from vigneshv59/Apertium-bible.is-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.rb
70 lines (53 loc) · 1.91 KB
/
scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
require 'nokogiri'
require 'open-uri'
require 'optparse'
require 'net/http'
# Parse the command line. URL and OUTPUT are positional; -t/--time and
# -n/--num are optional integer limits read later by the scraping loop.
options = {}
optparse = OptionParser.new do |opts|
  # Show the name the script was actually invoked under.
  opts.banner = "Usage: #{File.basename($PROGRAM_NAME)} URL OUTPUT [options]"
  # DecimalInteger makes OptionParser reject non-numeric limits up front
  # instead of letting a later to_i silently turn them into 0.
  opts.on("-tTIME", "--time=TIME", OptionParser::DecimalInteger,
          "The time limit in seconds. (Optional)") do |t|
    options[:time] = t
  end
  # NOTE(review): the scraping loop advances its counter once per fetched
  # page, so this limit is effectively a page count -- confirm the intent.
  opts.on("-nVERSES", "--num=VERSES", OptionParser::DecimalInteger,
          "The verse limit. (Optional)") do |v|
    options[:verses] = v
  end
end

begin
  # parse! removes the recognised switches from ARGV, so the positionals have
  # to be read afterwards; reading them first mistook a leading "-t 5" for
  # the URL and the output path.
  optparse.parse!
  options[:url] = ARGV[0]
  options[:out] = ARGV[1]
  raise OptionParser::MissingArgument if options[:url].nil? || options[:out].nil?
rescue OptionParser::ParseError
  # ParseError is the common ancestor of MissingArgument, InvalidOption and
  # InvalidArgument (the latter is raised by the integer coercion above).
  # NOTE(review): this still exits with status 0 on a usage error.
  puts optparse
  exit
end
# Scrape loop: starting from the URL given on the command line, fetch one page
# per iteration, append its chapter title and verses to the output file, then
# follow the page's ".chapter-nav-right" link. Stops when there is no such
# link, a request does not succeed, or an optional limit is reached.
bible_url = options[:url]
# Last path segment of the starting URL; re-appended to followed links below.
distinction_type = options[:url].split("/")[-1]
# The limits may arrive as strings or integers; to_i accepts both.
time_limit = options[:time] && options[:time].to_i
page_limit = options[:verses] && options[:verses].to_i
# NOTE(review): the --num help text says "verse limit", but this counter is
# advanced once per fetched page, not once per verse -- confirm the intent.
pages_finished = 0
start_time = Time.now
# URI.encode was removed in Ruby 3.0; the RFC 2396 parser offers the same
# escaping and is built once here rather than on every iteration.
escaper = URI::RFC2396_Parser.new

# File.open instead of Kernel#open: the latter treats a path that starts with
# "|" as a command to run.
File.open(options[:out] || "out.txt", "w:UTF-8") do |f|
  loop do
    break if time_limit && Time.now - start_time >= time_limit
    break if page_limit && pages_finished >= page_limit

    url = URI.parse(bible_url)
    # Request paths are expected to end in an "N" or "D" segment; when a
    # followed link lacks one, reuse the last segment of the starting URL.
    last_segment = url.path.split("/")[-1]
    url.path = File.join(url.path, distinction_type) unless %w[N D].include?(last_segment)

    req = Net::HTTP::Get.new(url.path)
    # A Cookie header value has to be a "name=value" string; a Hash does not
    # serialise to that form.
    req["Cookie"] = "current-bible-location=#{escaper.escape(url.path)}"
    # use_ssl lets https:// URLs work; plain http:// behaves as before.
    res = Net::HTTP.start(url.host, url.port, use_ssl: url.scheme == "https") do |http|
      http.request(req)
    end
    unless res.is_a?(Net::HTTPSuccess)
      # Do not parse error or redirect bodies as if they were chapter pages.
      warn "Stopping: #{url} returned HTTP #{res.code}"
      break
    end

    page = Nokogiri::HTML(res.body, nil, 'UTF-8')
    title = page.xpath('.//*[@class="chapter-title"]').text
    # The explicit "\n" keeps the existing output format: puts only appends a
    # newline when the string does not already end with one.
    heading = "#{title}\n"
    f.puts heading
    puts heading

    page.css(".verse-container").each do |verse|
      marker = verse.xpath('.//*[@class="verse-marker"]').text
      text = verse.xpath('.//*[@class="verse-text"]').text
      f.puts "#{marker} #{text}\n"
    end
    f.puts("\n")
    pages_finished += 1

    # at_css returns nil when the page has no next-chapter link; treat that
    # as the end of the run instead of calling [] on nil.
    next_link = page.at_css(".chapter-nav-right")
    bible_url = next_link && next_link["href"]
    break if bible_url.nil? || bible_url.empty?
  end
end