diff --git a/app/services/iiif_print/derivative_rodeo_service.rb b/app/services/iiif_print/derivative_rodeo_service.rb index fed02b26..093982ff 100644 --- a/app/services/iiif_print/derivative_rodeo_service.rb +++ b/app/services/iiif_print/derivative_rodeo_service.rb @@ -110,6 +110,20 @@ def self.derivative_rodeo_uri(file_set:, filename: nil, extension: nil, adapter_ end # rubocop:enable Metrics/MethodLength + ## + # @api public + # + # Figure out the ancestor type and ancestor + def self.get_ancestor(filename: nil, file_set:) + # In the case of a page split from a PDF, we need to know the grandparent's identifier to + # find the file(s) in the DerivativeRodeo. + if DerivativeRodeo::Generators::PdfSplitGenerator.filename_for_a_derived_page_from_a_pdf?(filename: filename) + [IiifPrint.grandparent_for(file_set), :grandparent] + else + [IiifPrint.parent_for(file_set), :parent] + end + end + ## # @api public # @@ -128,25 +142,28 @@ def self.derivative_rodeo_uri(file_set:, filename: nil, extension: nil, adapter_ # @param filename [String] # @return [String] the dirname (without any "/" we hope) def self.derivative_rodeo_preprocessed_directory_for(file_set:, filename:) - # In the case of a page split from a PDF, we need to know the grandparent's identifier to - # find the file(s) in the DerivativeRodeo. - ancestor = if DerivativeRodeo::Generators::PdfSplitGenerator.filename_for_a_derived_page_from_a_pdf?(filename: filename) - IiifPrint.grandparent_for(file_set) - else - IiifPrint.parent_for(file_set) - end + ancestor, ancestor_type = get_ancestor(filename: filename, file_set: file_set) + # Why might we not have an ancestor? In the case of grandparent_for, we may not yet have run # the create relationships job. We could sneak a peak in the table to maybe glean some insight. # However, read further the `else` clause to see the novel approach. + # rubocop:disable Style/GuardClause if ancestor - ancestor.public_send(parent_work_identifier_property_name) + message = "#{self.class}.#{__method__} #{file_set.class} ID=#{file_set.id} and filename: #{filename.inspect}" \ + "has #{ancestor_type} of #{ancestor.class} ID=#{ancestor.id}" + Rails.logger.info(message) + ancestor.public_send(parent_work_identifier_property_name) || + raise("Expected #{ancestor.class} ID=#{ancestor.id} (#{ancestor_type} of #{file_set.class} ID=#{file_set.id}) " \ + "to have a present #{parent_work_identifier_property_name.inspect}") else # HACK: This makes critical assumptions about how we're creating the title for the file_set; # but we don't have much to fall-back on. Consider making this a configurable function. Or # perhaps this entire method should be more configurable. # TODO: Revisit this implementation. - file_set.title.first.split(".").first + file_set.title.first.split(".").first || + raise("#{file_set.class} ID=#{file_set.id} has title #{file_set.title.first} from which we cannot infer information.") end + # rubocop:enable Style/GuardClause end def initialize(file_set) diff --git a/lib/iiif_print/split_pdfs/base_splitter.rb b/lib/iiif_print/split_pdfs/base_splitter.rb index 42524149..4b859d8d 100644 --- a/lib/iiif_print/split_pdfs/base_splitter.rb +++ b/lib/iiif_print/split_pdfs/base_splitter.rb @@ -21,6 +21,8 @@ class BaseSplitter # # @note We're including the ** args to provide method conformity; other services require # additional information (such as the FileSet) + # + # @see IiifPrint::SplitPdfs::DerivativeRodeoSplitter def self.call(path, **) new(path).to_a end diff --git a/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb b/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb index e019873d..286ed315 100644 --- a/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb +++ b/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb @@ -14,11 +14,20 @@ class DerivativeRodeoSplitter # @param filename [String] the local path to the PDFDerivativeServicele # @param file_set [FileSet] file set containing the PDF file to split # - # @return [Array] paths to images split from each page of PDF file + # @return [Array] paths to images split from each page of PDF file + # + # @see IiifPrint::SplitPdfs::BaseSplitter def self.call(filename, file_set:) new(filename, file_set: file_set).split_files end + ## + # @param filename [String] path to the original file. Note that we use {#filename} to + # derivate {#input_uri} + # @param file_set [FileSet] the container for the original file and its derivatives. + # + # @param output_tmp_dir [String] where we will be writing things. In using `Dir.mktmpdir` + # we're creating a sudirectory on `Dir.tmpdir` def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir) @filename = filename @file_set = file_set @@ -68,7 +77,6 @@ def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir) # @return [String] # # @see https://github.com/scientist-softserv/space_stone-serverless/blob/7f46dd5b218381739cd1c771183f95408a4e0752/awslambda/handler.rb#L58-L63 - # rubocop:disable Metrics/AbcSize # rubocop:disable Metrics/MethodLength def preprocessed_location_template return @preprocessed_location_template if defined?(@preprocessed_location_template) @@ -83,21 +91,7 @@ def preprocessed_location_template message = "#{self.class}##{__method__} did not find #{derivative_rodeo_candidate.inspect} to exist. " \ "Moving on to check the #{file_set.class}#import_url of #{file_set.import_url.inspect}" Rails.logger.warn(message) - # If the DerivativeRodeo doesn't know about the adapter for the import, this will raise - # an error. - # - # Since the file was not pre-processed, we're likely now going to be downloading that - # file and running all of the derivatives locally. - if rodeo_conformant_uri_exists?(file_set.import_url) - message = "#{self.class}##{__method__} found #{file_set.class}#import_url of #{file_set.import_url.inspect} to exist. " \ - "Perhaps there was a problem in SpaceStone downloading the file? Things should be okay." - Rails.logger.info(message) - file_set.import_url - else - message = "#{self.class}##{__method__} expected #{file_set.import_url.inspect} as specified " \ - "by #{file_set.class}#import_url to exist at remote location, but it did not." - raise MissingFileError, message - end + handle_original_file_not_in_derivative_rodeo else message = "#{self.class}##{__method__} could not find an existing file at #{derivative_rodeo_candidate} " \ "nor a remote_url for #{file_set.class} ID=#{file_set.id}. Returning `nil' as we have no possible preprocess. " \ @@ -106,9 +100,36 @@ def preprocessed_location_template nil end end - # rubocop:enable Metrics/AbcSize # rubocop:enable Metrics/MethodLength + ## + # @api private + # + # When the file does not exist in the pre-processed location (e.g. "SpaceStone") we need to + # ensure that we have something locally. We copy the {FileSet#import_url} to the {#input_uri} + # location. + # + # @return [String] should be the {#input_uri} + # @raise [DerivativeRodeo::Errors::FileMissingError] when the input_uri does not exist + def handle_original_file_not_in_derivative_rodeo + # A quick short-circuit. Don't attempt to copy. Likely already covered by the DerivativeRodeo::Generators::CopyGenerator + return input_uri if rodeo_conformant_uri_exists?(input_uri) + + message = "#{self.class}##{__method__} found #{file_set.class}#import_url of #{file_set.import_url.inspect} to exist. " \ + "Perhaps there was a problem in SpaceStone downloading the file? " \ + "Regardless, we'll use DerivativeRodeo::Generators::CopyGenerator to ensure #{input_uri.inspect} exists. " \ + "However, we'll almost certainly be generating child pages locally." + Rails.logger.info(message) + + # This ensures that we have a copy of the file_set.import_uri at the input_uri location; + # we likely have this. + DerivativeRodeo::Generators::CopyGenerator.new( + input_uris: [file_set.import_url], + output_location_template: input_uri + ).generated_uris.first + end + # private :handle_original_file_not_in_derivative_rodeo + def rodeo_conformant_uri_exists?(uri) DerivativeRodeo::StorageLocations::BaseLocation.from_uri(uri).exist? end diff --git a/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb b/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb index 5822bf5b..548044cc 100644 --- a/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb +++ b/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb @@ -18,7 +18,7 @@ it { is_expected.to respond_to(:call) } end - subject(:instance) { described_class.new(filename, file_set: file_set) } + subject(:instance) { described_class.new(filename, file_set: file_set, output_tmp_dir: Dir.tmpdir) } let(:generator) { double(DerivativeRodeo::Generators::PdfSplitGenerator, generated_files: []) } before do @@ -46,22 +46,25 @@ end end - context 'when the s3 file does not exist in the rodeo and the file sets import url exists' do + context 'when the s3 file does not exist in the rodeo and we have the local file' do it 'is the import_url' do + expect_any_instance_of(DerivativeRodeo::Generators::CopyGenerator).not_to receive(:generated_uris) file_set.import_url = import_url expect(instance).to receive(:rodeo_conformant_uri_exists?).with(derivative_rodeo_preprocessed_file).and_return(false) - expect(instance).to receive(:rodeo_conformant_uri_exists?).with(file_set.import_url).and_return(true) - expect(subject).to eq(file_set.import_url) + expect(instance).to receive(:rodeo_conformant_uri_exists?).with(instance.input_uri).and_return(true) + expect(subject).to eq(instance.input_uri) end end - context 'when the s3 file does not exist and the given import url does NOT exist' do - it 'will raise a IiifPrint::MissingFileError' do + context 'when the s3 file does not exist and we do not have the input URI nor the given import url does NOT exist' do + let(:generator) { double(DerivativeRodeo::Generators::CopyGenerator, generated_uris: ["file:///generated/uri"]) } + it 'will invoke the DerivativeRodeo::Generators::CopyGenerator to bring the file locally' do + allow(DerivativeRodeo::Generators::CopyGenerator).to receive(:new).and_return(generator) file_set.import_url = import_url expect(instance).to receive(:rodeo_conformant_uri_exists?).with(derivative_rodeo_preprocessed_file).and_return(false) - expect(instance).to receive(:rodeo_conformant_uri_exists?).with(file_set.import_url).and_return(false) + expect(instance).to receive(:rodeo_conformant_uri_exists?).with(instance.input_uri).and_return(false) - expect { subject }.to raise_error(IiifPrint::MissingFileError) + expect(subject).to eq(generator.generated_uris.first) end end