Rakefile
# frozen_string_literal: true

task default: %i[update write_layers_json]

require 'json'
require 'faraday'
require 'faraday/net_http_persistent'
require 'faraday/retry'
require 'uri'
require 'debug'
require 'progress_bar'
require 'progress_bar/core_ext/enumerable_with_progress'
require 'time'

CATALOG_URL = ENV.fetch('CATALOG_URL', 'https://earthworks.stanford.edu/catalog')
INSTITUTION = 'Stanford'
BASE_DIR = 'metadata-1.0'
IGNORED_FIELDS = %w[timestamp layer_availability_score_f _version_ hashed_id_ssi].freeze
DOC_ID_REGEX = /\Astanford-([b-df-hjkmnp-tv-z]{2})([0-9]{3})([b-df-hjkmnp-tv-z]{2})([0-9]{4})\z/i
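# e.g. 'stanford-bb058zh0946' captures ['bb', '058', 'zh', '0946'], which maps to the directory metadata-1.0/bb/058/zh/0946
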
# Wrap a function with a timestamp file to avoid re-processing documents
# If the block returns a timestamp, update the timestamp file
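# The timestamp file holds a single UTC ISO8601 timestamp, e.g. 2024-01-01T00:00:00Z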
# rubocop:disable Metrics/AbcSize
def with_timestamp(timestamp_file = './last_run')
  return unless block_given?

  start_timestamp = Time.now.utc

  # Check the timestamp file for the previous timestamp
  puts "Unable to find timestamp file #{timestamp_file}" unless File.exist? timestamp_file
  previous_timestamp = File.exist?(timestamp_file) ? File.read(timestamp_file).strip : ''
  puts previous_timestamp.empty? ? "No timestamp found in #{timestamp_file}" : "Last run: #{previous_timestamp}"

  # Call the provided block with the previous timestamp, receiving the most
  # recent document's timestamp as the result of the block
  last_timestamp = yield(previous_timestamp)
  return unless last_timestamp

  # If the most recent document is in the past (beyond any reasonable clock skew),
  # bump the timestamp by one second so the next run doesn't re-fetch the same documents
  last_timestamp = (Time.parse(last_timestamp) + 1).utc.iso8601 if (Time.parse(last_timestamp) + 3600) < start_timestamp

  puts "Updating last run: #{last_timestamp}"
  File.open(timestamp_file, 'w') { |f| f.puts last_timestamp }
end
# rubocop:enable Metrics/AbcSize

# Solr parameters used to crawl the catalog for records
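# e.g. with modified_since '2024-01-01T00:00:00Z', q becomes:
#   layer_modified_dt:[2024-01-01T00:00:00Z TO *] AND -dc_type_s:"Interactive Resource"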
def catalog_search_params(modified_since: nil)
  # Treat a missing or blank timestamp the same way: fetch everything
  q_modified = modified_since.to_s.strip.empty? ? '' : "layer_modified_dt:[#{modified_since} TO *] AND "
  {
    'f[dct_provenance_s][]': INSTITUTION,
    format: 'json',
    q: "#{q_modified}-dc_type_s:\"Interactive Resource\"",
    per_page: 100,
    sort: 'layer_modified_dt asc'
  }
end

# Make a catalog request with given params and crawl through paginated results
# Returns an array of the results of calling the block on each data item
def crawl_catalog(params:, &block)
  # Start with an empty list of results and an HTTP client
  client = make_client
  results = []

  # While there are more pages, add the results of calling the block on each data item
  response = get_json(CATALOG_URL, params:, client:)
  while response&.key?('data') && response['data'].any?
    results.concat(response['data'].map(&block))
    next_page = response.dig('links', 'next')
    response = next_page.nil? ? nil : get_json(next_page, params:, client:)
  end

  # Return the collected results
  results
end

# Settings for retrying requests if the server rejects them
# See: https://github.com/lostisland/faraday-retry
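# With interval 1 and backoff_factor 3, waits between attempts grow roughly 1s, 3s, 9s, ...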
def retry_options
  {
    max: 10,
    interval: 1,
    backoff_factor: 3,
    exceptions: [Faraday::TimeoutError, Faraday::ConnectionFailed, Faraday::TooManyRequestsError]
  }
end

# Persistent HTTP client with backoff used to make catalog requests
# pool_size controls parallelism
def make_client(pool_size: 1)
  Faraday.new(CATALOG_URL) do |conn|
    conn.request :retry, retry_options
    conn.adapter(:net_http_persistent, pool_size:)
    conn.response :raise_error
  end
end

# Make an HTTP request and return parsed JSON
def get_json(url, params: {}, client: make_client)
  JSON.parse(client.get(url, params).body.to_s)
end

# Check if a layer can be found in the catalog
def layer_exists?(doc_id, client: make_client)
  client.head("#{CATALOG_URL}/#{doc_id}", format: :json).success?
rescue Faraday::ResourceNotFound
  false
end

# Yield all documents from the catalog updated since the given timestamp
# Returns the result of calling the block on each document
def updated_docs_since(timestamp, &block)
  # Traverse solr paginated responses, adding each document's URL to the list
  urls = crawl_catalog(params: catalog_search_params(modified_since: timestamp)) { |doc| doc['links']['self'] }
  puts urls.empty? ? '== No updated layers found ==' : "== Found #{urls.length} updated layers =="

  # For each document URL, yield the parsed JSON
  client = make_client(pool_size: 4)
  urls.with_progress.map do |url|
    block.call(get_json("#{url}/raw", params: { format: :json }, client:))
  end
end

# Call block on all documents listed in layers.json that are no longer in the catalog
# Returns the result of calling the block on each document
def deleted_docs(&block)
  # Get the list of document ids from layers.json
  doc_ids = JSON.parse(File.read('layers.json')).keys.map { |x| x.sub('druid:', 'stanford-') }

  # For each document id, check if it's still in the catalog. If not, yield its id
  client = make_client(pool_size: 4)
  doc_ids.with_progress.map do |doc_id|
    block.call(doc_id) unless layer_exists?(doc_id, client:)
  end
end

# Write the document's metadata to a file in the appropriate directory
# Returns the document's timestamp
def write_doc_metadata(doc)
  # Find the nested directory structure for the document; skip docs whose slug doesn't match
  tree_dirs = doc['layer_slug_s']&.match(DOC_ID_REGEX)&.captures&.join('/')
  return if tree_dirs.nil? || tree_dirs.empty?

  # Create the directory structure if it doesn't exist
  tree = File.expand_path("#{BASE_DIR}/#{tree_dirs}")
  FileUtils.mkdir_p(tree)

  # Strip out ignored fields and write the document to the directory
  IGNORED_FIELDS.each { |field| doc.delete(field) }
  File.open("#{tree}/geoblacklight.json", 'w') { |f| f.puts JSON.pretty_generate(doc) }

  # Return the document's timestamp
  doc['layer_modified_dt']
end

# Delete a document and its containing directory
# Takes a document id like 'stanford-bb058zh0946'
def delete_doc(doc_id)
  # Find the nested directory structure for the document; skip ids that don't match
  tree_dirs = doc_id&.match(DOC_ID_REGEX)&.captures&.join('/')
  return if tree_dirs.nil? || tree_dirs.empty?

  # Delete the directory if it exists
  tree = File.expand_path("#{BASE_DIR}/#{tree_dirs}")
  FileUtils.rm_rf(tree) if File.exist?(tree)
end

# Update everything
task :update do
  puts '== Updating metadata for layers =='
  with_timestamp do |previous_timestamp|
    updated_docs_since(previous_timestamp) { |doc| write_doc_metadata(doc) }.last
  end

  # TODO: See https://github.com/OpenGeoMetadata/edu.stanford.purl/issues/143
  # puts '== Checking for deleted layers =='
  # total_deleted = deleted_docs { |doc_id| delete_doc(doc_id) }.compact.length
  # puts total_deleted.zero? ? '== No deleted layers found ==' : "== Deleted #{total_deleted} layers =="
end

# Write a JSON file that maps each layer to its directory
# See: https://opengeometadata.org/share-on-ogm/#naming-by-metadata-standard
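# e.g. { "druid:bb058zh0946": "metadata-1.0/bb/058/zh/0946" }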
task :write_layers_json do
  data = {}

  # Crawl the directory structure and get each druid with its directory
  Dir.glob('**/geoblacklight.json').sort.each do |file|
    druid = File.dirname(file).split(%r{/}).drop(1).join
    data["druid:#{druid}"] = File.dirname(file)
  end

  # Write the JSON to a file
  File.open('./layers.json', 'w') { |f| f.puts JSON.pretty_generate(data) }
  puts "== Wrote #{data.length} layers to layers.json =="
end
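
# Example usage (assuming the gems required above are installed, e.g. via Bundler):
#   bundle exec rake                     # default: runs :update then :write_layers_json
#   bundle exec rake update              # only fetch metadata for updated layers
#   bundle exec rake write_layers_json   # only regenerate layers.json
#   CATALOG_URL=https://example.org/catalog bundle exec rake   # crawl a different catalog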