-
Notifications
You must be signed in to change notification settings - Fork 10
/
associations.rb
executable file
·98 lines (84 loc) · 2.5 KB
/
associations.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env ruby
require 'rubygems'
require 'nokogiri'
require 'yaml'
require 'net/https'
require 'uri'
require 'open-uri'
# Number of extra attempts is_200? makes after a non-200 response before
# giving up; 0 means a URL is probed exactly once.
MAX_RETRIES = 0
# YAML file that previously collected associations are loaded from and that
# the merged result is written back to at the end of the run.
ASSOCIATIONS_FILE = 'associations.yml'
# Sends a HEAD request to +uri+ and reports where the server redirects to.
#
# @param uri [String] absolute http or https URL to probe
# @return [String, nil] value of the response's Location header, or nil when
#   the response carries no such header
def get_redirect(uri)
  url = URI.parse(uri)
  http = Net::HTTP.new(url.host, url.port)
  # Negotiate TLS only when the URL asks for it, so plain http URLs work too.
  # Certificate verification is deliberately left at Net::HTTP's default
  # (peer verification) instead of being switched off with VERIFY_NONE.
  http.use_ssl = (url.scheme == 'https')
  # request_uri keeps the query string and yields "/" for a bare host, where
  # url.path would be empty and make Net::HTTP reject the request.
  response = http.start { |conn| conn.head(url.request_uri) }
  response['location']
end
# Error type signalling that an HTTP response had a status other than 200.
Not200Error = Class.new(StandardError)
# Checks whether a HEAD request for +uri+ is answered with HTTP status 200.
#
# Every response code is logged to $stderr. A non-200 answer is retried up to
# MAX_RETRIES times, sleeping one second longer before each new attempt.
#
# @param uri [String] absolute http or https URL to probe
# @return [Boolean] true on a 200 response, false once the attempts are used up
def is_200?(uri)
  url = URI.parse(uri)
  retries = 0
  loop do
    # Honour the URL scheme so https URLs are not sent in plain text, and use
    # request_uri so a bare host is requested as "/" (url.path would be empty
    # and rejected by Net::HTTP) and any query string is kept.
    res = Net::HTTP.start(url.host, url.port, :use_ssl => (url.scheme == 'https')) do |http|
      http.head(url.request_uri)
    end
    $stderr.puts "#{res.code} for #{uri}"
    return true if res.code == '200'
    return false unless retries < MAX_RETRIES

    # Linear back-off: wait 1s, 2s, 3s, ... between attempts.
    retries += 1
    sleep(retries)
  end
end
# Main script: walks the Loeb volume links on the index page and, for every
# volume whose PDF is present in the loebolus-data mirror, records its title
# and its Archive / Open Library / Google links in ASSOCIATIONS_FILE.

# URI.open (from open-uri) is required here: Kernel#open stopped accepting
# URLs in Ruby 3.0.
doc = Nokogiri::HTML(URI.open('http://www.edonnelly.com/loebs.html'))

# Start from the results of earlier runs so existing entries are kept.
# An empty YAML file loads as false, hence the fallback to a fresh hash.
associations = {}
if File.exist?(ASSOCIATIONS_FILE)
  associations = YAML.load(File.read(ASSOCIATIONS_FILE)) || {}
end

# Catalog links whose text is the volume number (the ones labelled "HUP" are skipped).
loeb_links = doc.xpath('//td/a[contains(@href,"www.hup.harvard.edu/catalog/") and (text() != "HUP")]').to_a.uniq

loeb_links.each do |loeb|
  $stderr.puts "Checking:\n#{loeb.to_s}"

  # The cell after the link reads "<author> -- ..."; the first <i> after it
  # holds the work's title.
  title = loeb.xpath('following::td[1]').first.content
  original_title = loeb.xpath('following::td[1]/following::i[1]').first.content
  author = title.split(' -- ').first
  # When the author part contains a comma only the italic title is kept;
  # otherwise the author is prefixed to it.
  title = author =~ /,/ ? original_title : "#{author} -- #{original_title}"

  loeb_number = loeb.content
  next unless is_200?("http://ryanfb.github.io/loebolus-data/#{loeb_number}.pdf")

  # Create the entry on first sight and never overwrite a title already stored.
  entry = (associations[loeb_number] ||= {})
  entry['title'] ||= title

  archive = loeb.xpath('following::a[text()="Archive"]').first
  google = loeb.xpath('following::a[text()="Google"]').first
  $stderr.puts "Got archive: #{archive.to_s}"
  $stderr.puts "Got google: #{google.to_s}"

  if archive && archive['href'] =~ /www\.archive\.org\//
    entry['archive'] = archive['href']
    # The last path segment of the archive.org URL is the item identifier,
    # which openlibrary.org/ia/<id> redirects to the matching record.
    id = archive['href'].split('/').last
    openlibrary = get_redirect("https://openlibrary.org/ia/#{id}")
    if openlibrary
      $stderr.puts "Got openlibrary: #{openlibrary}"
      entry['openlibrary'] = openlibrary
    end
  end

  if google && google['href'] =~ /books\.google\.com\//
    entry['google'] = google['href']
  end
end

File.write(ASSOCIATIONS_FILE, associations.to_yaml)