-
Notifications
You must be signed in to change notification settings - Fork 2
/
hathifile-dl.rb
executable file
·41 lines (32 loc) · 1.13 KB
/
hathifile-dl.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env ruby
# Downloads and assembles a fully updated Hathifile from: https://www.hathitrust.org/hathifiles
# Usage: ./hathifile-dl.rb > hathifile.tsv
require 'json'
require 'net/http'
require 'time'
require 'uri'
require 'zlib'
# Fetches one Hathifile with wget unless a file with its name already exists.
#
# @param hathifile [Hash] file-list entry; the 'filename' and 'url' keys are read
# @return [nil]
# @raise [RuntimeError] if wget cannot be started or exits unsuccessfully
def download(hathifile)
  return if File.exist?(hathifile['filename'])

  $stderr.puts "Downloading: #{hathifile['filename']}"
  # Argument-list form: the URL comes from remote JSON and must never be
  # interpreted by a shell. wget's stdout is sent to stderr so nothing can
  # leak into this script's own stdout, which carries the assembled TSV.
  return if system('wget', '-c', hathifile['url'], out: :err)

  # NOTE(review): a partially downloaded file left behind by wget will be
  # skipped by the existence check on the next run; remove it before retrying.
  raise "wget failed for #{hathifile['url']}"
end
# Fetch the index of available Hathifiles. Check the HTTP status first so a
# moved or failing endpoint is reported as such instead of as a JSON parse error.
list_uri = URI('https://www.hathitrust.org/sites/www.hathitrust.org/files/hathifiles/hathi_file_list.json')
response = Net::HTTP.get_response(list_uri)
abort "Could not fetch #{list_uri}: HTTP #{response.code}" unless response.is_a?(Net::HTTPSuccess)
hathifiles = JSON.parse(response.body)

# Split the list into full dumps and incremental updates, then pick the
# most recently created full dump as the base.
full_hathifiles, update_hathifiles = hathifiles.partition { |h| h['full'] }
latest_full_hathifile = full_hathifiles.max_by { |h| Time.parse(h['created']) }
abort 'No full Hathifile found in the file list' if latest_full_hathifile.nil?
$stderr.puts latest_full_hathifile.inspect

# Keep only the updates created after the base; parse the base timestamp once.
full_created = Time.parse(latest_full_hathifile['created'])
updates = update_hathifiles.select { |h| Time.parse(h['created']) > full_created }
updates.each { |h| $stderr.puts h['created'] }

# Oldest first: the full dump, followed by each later update in creation order.
downloads = [latest_full_hathifile, *updates].sort_by { |h| Time.parse(h['created']) }
downloads.each { |h| download(h) }

# Decompress every file in order and stream its lines to stdout.
downloads.each do |hathifile|
  Zlib::GzipReader.open(hathifile['filename']) do |gz|
    gz.each_line { |line| puts line }
  end
end