Skip to content

Commit 7d2aeea

Browse files
committed
Add a job to harvest resources from a resourcesync resource list into a stream.
1 parent 542d5ce commit 7d2aeea

File tree

2 files changed

+81
-0
lines changed

2 files changed

+81
-0
lines changed
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# frozen_string_literal: true
2+
3+
##
4+
# Background job to harvest a resource sync resourcelist into a stream
5+
class HarvestResourceListToStreamJob < ApplicationJob
6+
RS_MD_HASH_KEY = 'rs_md_hash'
7+
XMLNS = { sitemap: 'http://www.sitemaps.org/schemas/sitemap/0.9', rs: 'http://www.openarchives.org/rs/terms/' }.freeze
8+
9+
# rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
10+
# Harvests every resource named in a ResourceSync resource list into +stream+.
#
# Fetches the resource list XML at +url+, then for each sitemap <url> entry downloads
# the resource at its <loc>, verifies it against the advertised rs:md hash (when one is
# given) and saves it as a new upload on the stream. Entries whose hash is already
# known for the stream are skipped.
#
# @param stream [Object] record exposing +uploads+; this method calls
#   +uploads.with_attached_files+ and +uploads.build+ on it
# @param url [String] location of the resource list
# @param access_token [String, nil] sent as a Bearer token with every request when present
# @raise [RuntimeError] when the resource list or a resource cannot be fetched, or when a
#   downloaded resource fails its checksum
def perform(stream, url:, access_token: Settings.resource_sync_harvest_token)
  auth_headers = access_token.present? ? { 'Authorization' => "Bearer #{access_token}" } : {}
  response = HTTP.get(url, headers: auth_headers)
  Rails.logger.info("HarvestResourceListToStreamJob: Fetched resource list from #{url} with status #{response.status}")

  raise "Failed to fetch resource list from #{url}: #{response.status}" unless response.status.success?

  resource_list = Nokogiri::XML(response.body.to_s)

  # Digests already known for this stream: the hex MD5 of each stored blob plus whatever
  # rs:md hash was recorded in the blob metadata.
  existing_files_hash = stream.uploads.with_attached_files.flat_map do |u|
    ["md5:#{Base64.decode64(u.blob.checksum).unpack1('H*')}", u.blob.dig('metadata', RS_MD_HASH_KEY)]
  end.to_set

  # The block parameter is deliberately not called `url`, which would shadow the keyword argument.
  resource_list.xpath('//sitemap:url', **XMLNS).each do |url_node|
    loc = url_node.at_xpath('sitemap:loc', **XMLNS)&.text
    next if loc.blank?

    # File.basename needs a String, not a URI object; using the path component also keeps
    # any query string or fragment out of the stored filename.
    filename = File.basename(URI.parse(loc).path.to_s)
    hash = url_node.at_xpath('rs:md/@hash', **XMLNS)&.text

    # NOTE(review): when hash is nil this still matches if any existing blob has no recorded
    # rs:md hash (the set then contains nil) -- confirm that skipping is intended in that case.
    next if existing_files_hash.include?(hash)

    resource = HTTP.get(loc, headers: auth_headers)
    Rails.logger.info("HarvestResourceListToStreamJob: Fetched resource from #{loc} with status #{resource.status}")

    # Never store an error response as if it were the resource itself.
    raise "Failed to fetch resource from #{loc}: #{resource.status}" unless resource.status.success?

    tmpfile = create_tmpfile_for_http_response(resource.body)
    begin
      test_checksum(tmpfile, hash)

      upload = stream.uploads.build
      upload.files.attach(io: tmpfile, filename: filename, content_type: resource.headers['Content-Type'],
                          metadata: { RS_MD_HASH_KEY => hash })

      upload.save!
    ensure
      # Release the tempfile whether or not the upload succeeded.
      # NOTE(review): assumes the attachment has been fully stored by the time save! returns --
      # confirm for the configured storage service.
      tmpfile.close!
    end

    # Pause between downloads (presumably to go easy on the remote server).
    sleep 1
  end
  # rubocop:enable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
end
48+
49+
# Spools a streamed response body into a binary-mode tempfile.
#
# @param body [#each] object yielding the body's String chunks
# @return [Tempfile] the tempfile, rewound to the start; the caller owns it and is
#   responsible for closing/unlinking it
# @raise re-raises any error from reading the body or writing, after discarding the
#   partially written tempfile
def create_tmpfile_for_http_response(body)
  tmpfile = Tempfile.new(binmode: true)
  body.each { |chunk| tmpfile.write(chunk) }
  tmpfile.rewind
  tmpfile
rescue StandardError
  # Do not leave a half-written file behind when the download fails midway.
  tmpfile&.close!
  raise
end
58+
59+
# Verifies a downloaded file against the digest(s) advertised for it.
#
# +expected_checksum+ holds one or more whitespace-separated "algorithm:hexdigest"
# entries. Supported algorithms are md5, sha1 / sha-1 and sha256 / sha-256 (matched
# case-insensitively). Entries with any other algorithm are logged and ignored;
# every supported entry must match.
#
# @param file [#path] file on disk to verify (the caller passes a Tempfile)
# @param expected_checksum [String, nil] advertised digest(s); blank means nothing to verify
# @return [nil]
# @raise [RuntimeError] when a supported digest does not match the file's content
def test_checksum(file, expected_checksum)
  # A blank value (nil, '' or only whitespace) yields no entries, so nothing is verified.
  expected_checksum.to_s.split.each do |digest|
    algorithm, expected_hash = digest.split(':', 2)
    digest_class = case algorithm.downcase
                   when 'md5' then Digest::MD5
                   when 'sha1', 'sha-1' then Digest::SHA1
                   when 'sha256', 'sha-256' then Digest::SHA256
                   end

    if digest_class.nil?
      Rails.logger.warn "Unsupported checksum algorithm: #{algorithm}"
      next
    end

    actual_hash = digest_class.file(file.path).hexdigest
    # Hex digests may be published in either case.
    next if actual_hash.casecmp?(expected_hash.to_s)

    raise "Checksum mismatch: expected #{digest}, got #{algorithm}:#{actual_hash}"
  end

  nil
end
79+
end

config/settings.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,5 @@ oai_max_page_size: 5000
6060
oai_repository_id: 'pod.stanford.edu'
6161

6262
marc_record_writer_tmpdir: ~
63+
64+
resource_sync_harvest_token: ~

0 commit comments

Comments
 (0)