|
| 1 | +# frozen_string_literal: true |
| 2 | + |
| 3 | +## |
| 4 | +# Background job to harvest a resource sync resourcelist into a stream |
| 5 | +class HarvestResourceListToStreamJob < ApplicationJob |
| 6 | + RS_MD_HASH_KEY = 'rs_md_hash' |
| 7 | + XMLNS = { sitemap: 'http://www.sitemaps.org/schemas/sitemap/0.9', rs: 'http://www.openarchives.org/rs/terms/' }.freeze |
| 8 | + |
| 9 | + # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity |
| 10 | + def perform(stream, url:, access_token: Settings.resource_sync_harvest_token) |
| 11 | + http_client = access_token.present? ? HTTP.auth("Bearer #{access_token}") : HTTP |
| 12 | + |
| 13 | + Rails.logger.info("HarvestResourceListToStreamJob: Fetching resource list from #{url}") |
| 14 | + response = http_client.get(url) |
| 15 | + raise "Failed to fetch resource list from #{url}: #{response.status}" unless response.status.success? |
| 16 | + |
| 17 | + existing_files_hash = checksums_for_stream_files(stream) |
| 18 | + |
| 19 | + resource_list = Nokogiri::XML(response.body.to_s) |
| 20 | + resource_list.xpath('//sitemap:url', **XMLNS).each do |url| |
| 21 | + loc = url.at_xpath('sitemap:loc', **XMLNS)&.text |
| 22 | + hash = url.at_xpath('rs:md/@hash', **XMLNS)&.text |
| 23 | + next if loc.blank? || existing_files_hash.include?(hash) |
| 24 | + |
| 25 | + Rails.logger.info("HarvestResourceListToStreamJob: Fetching resource from #{loc}") |
| 26 | + resource = http_client.get(loc) |
| 27 | + tmpfile = create_tmpfile_for_http_response(resource.body) |
| 28 | + |
| 29 | + test_checksum(tmpfile, hash) |
| 30 | + |
| 31 | + upload = stream.uploads.build |
| 32 | + upload.files.attach(io: tmpfile, |
| 33 | + filename: File.basename(URI.parse(loc)), |
| 34 | + content_type: resource.headers['Content-Type'], |
| 35 | + metadata: { RS_MD_HASH_KEY => hash }) |
| 36 | + |
| 37 | + UploadCreatorService.call(upload) |
| 38 | + |
| 39 | + sleep 1 |
| 40 | + end |
| 41 | + # rubocop:enable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity |
| 42 | + end |
| 43 | + |
| 44 | + def checksums_for_stream_files(stream) |
| 45 | + stream.uploads.with_attached_files.flat_map do |u| |
| 46 | + u.files.map do |file| |
| 47 | + ["md5:#{Base64.decode64(file.blob.checksum).unpack1('H*')}", file.blob.metadata[RS_MD_HASH_KEY]] |
| 48 | + end |
| 49 | + end.to_set |
| 50 | + end |
| 51 | + |
| 52 | + def create_tmpfile_for_http_response(body) |
| 53 | + tmpfile = Tempfile.new binmode: true |
| 54 | + |
| 55 | + body.each do |chunk| |
| 56 | + tmpfile.write(chunk) |
| 57 | + end |
| 58 | + tmpfile.rewind |
| 59 | + tmpfile |
| 60 | + end |
| 61 | + |
| 62 | + def test_checksum(file, expected_checksum) |
| 63 | + return if expected_checksum.blank? |
| 64 | + |
| 65 | + algorithm, expected_hash = expected_checksum.split(':', 2) |
| 66 | + actual_hash = case algorithm |
| 67 | + when 'md5' |
| 68 | + Digest::MD5.file(file.path).hexdigest |
| 69 | + when 'sha1' |
| 70 | + Digest::SHA1.file(file.path).hexdigest |
| 71 | + when 'sha256' |
| 72 | + Digest::SHA256.file(file.path).hexdigest |
| 73 | + else |
| 74 | + Rails.logger.warn "Unsupported checksum algorithm: #{algorithm}" |
| 75 | + return |
| 76 | + end |
| 77 | + |
| 78 | + return if actual_hash == expected_hash |
| 79 | + |
| 80 | + raise "Checksum mismatch: expected #{expected_checksum}, got #{algorithm}:#{actual_hash}" |
| 81 | + end |
| 82 | +end |
0 commit comments