Skip to content

Commit 384c765

Browse files
committed
Add a job to harvest resources from a resourcesync resource list into a stream.
1 parent 542d5ce commit 384c765

File tree

2 files changed

+84
-0
lines changed

2 files changed

+84
-0
lines changed
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# frozen_string_literal: true
2+
3+
##
4+
# Background job to harvest a resource sync resourcelist into a stream
5+
class HarvestResourceListToStreamJob < ApplicationJob
6+
RS_MD_HASH_KEY = 'rs_md_hash'
7+
XMLNS = { sitemap: 'http://www.sitemaps.org/schemas/sitemap/0.9', rs: 'http://www.openarchives.org/rs/terms/' }.freeze
8+
9+
# rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity
10+
def perform(stream, url:, access_token: Settings.resource_sync_harvest_token)
11+
http_client = access_token.present? ? HTTP.auth("Bearer #{access_token}") : HTTP
12+
13+
Rails.logger.info("HarvestResourceListToStreamJob: Fetching resource list from #{url}")
14+
response = http_client.get(url)
15+
raise "Failed to fetch resource list from #{url}: #{response.status}" unless response.status.success?
16+
17+
existing_files_hash = checksums_for_stream_files(stream)
18+
19+
resource_list = Nokogiri::XML(response.body.to_s)
20+
resource_list.xpath('//sitemap:url', **XMLNS).each do |url|
21+
loc = url.at_xpath('sitemap:loc', **XMLNS)&.text
22+
hash = url.at_xpath('rs:md/@hash', **XMLNS)&.text
23+
next if loc.blank? || existing_files_hash.include?(hash)
24+
25+
Rails.logger.info("HarvestResourceListToStreamJob: Fetching resource from #{loc}")
26+
resource = http_client.get(loc)
27+
tmpfile = create_tmpfile_for_http_response(resource.body)
28+
29+
test_checksum(tmpfile, hash)
30+
31+
upload = stream.uploads.build
32+
upload.files.attach(io: tmpfile,
33+
filename: File.basename(URI.parse(loc)),
34+
content_type: resource.headers['Content-Type'],
35+
metadata: { RS_MD_HASH_KEY => hash })
36+
37+
UploadCreatorService.call(upload)
38+
39+
sleep 1
40+
end
41+
# rubocop:enable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity
42+
end
43+
44+
def checksums_for_stream_files(stream)
45+
stream.uploads.with_attached_files.flat_map do |u|
46+
u.files.map do |file|
47+
["md5:#{Base64.decode64(file.blob.checksum).unpack1('H*')}", file.blob.metadata[RS_MD_HASH_KEY]]
48+
end
49+
end.to_set
50+
end
51+
52+
def create_tmpfile_for_http_response(body)
53+
tmpfile = Tempfile.new binmode: true
54+
55+
body.each do |chunk|
56+
tmpfile.write(chunk)
57+
end
58+
tmpfile.rewind
59+
tmpfile
60+
end
61+
62+
def test_checksum(file, expected_checksum)
63+
return if expected_checksum.blank?
64+
65+
algorithm, expected_hash = expected_checksum.split(':', 2)
66+
actual_hash = case algorithm
67+
when 'md5'
68+
Digest::MD5.file(file.path).hexdigest
69+
when 'sha1'
70+
Digest::SHA1.file(file.path).hexdigest
71+
when 'sha256'
72+
Digest::SHA256.file(file.path).hexdigest
73+
else
74+
Rails.logger.warn "Unsupported checksum algorithm: #{algorithm}"
75+
return
76+
end
77+
78+
return if actual_hash == expected_hash
79+
80+
raise "Checksum mismatch: expected #{expected_checksum}, got #{algorithm}:#{actual_hash}"
81+
end
82+
end

config/settings.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,5 @@ oai_max_page_size: 5000
6060
oai_repository_id: 'pod.stanford.edu'
6161

6262
marc_record_writer_tmpdir: ~
63+
64+
resource_sync_harvest_token: ~

0 commit comments

Comments
 (0)