Skip to content

Commit

Permalink
🐛 Leverage CopyGenerator to ensure we have local copy
Browse files Browse the repository at this point in the history
We need to address the case where SpaceStone failed to copy the file
into S3.  When the file does not exist in S3, we can check if we have
the file locally and use that as the pre-processed location.  Otherwise,
we can copy the original remote file and proceed.

Related to:

- #283
- #282
  • Loading branch information
jeremyf committed Nov 8, 2023
1 parent 56069b5 commit a544833
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 27 deletions.
2 changes: 2 additions & 0 deletions lib/iiif_print/split_pdfs/base_splitter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ class BaseSplitter
#
# @note We're including the ** args to provide method conformity; other services require
# additional information (such as the FileSet)
#
# @see IiifPrint::SplitPdfs::DerivativeRodeoSplitter
def self.call(path, **)
new(path).to_a
end
Expand Down
59 changes: 40 additions & 19 deletions lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,21 @@ class DerivativeRodeoSplitter
# @param filename [String] the local path to the PDFDerivativeServicele
# @param file_set [FileSet] file set containing the PDF file to split
#
# @return [Array] paths to images split from each page of PDF file
# @return [Array<String>] paths to images split from each page of PDF file
#
# @see IiifPrint::SplitPdfs::BaseSplitter
def self.call(filename, file_set:)
new(filename, file_set: file_set).split_files
end

def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
##
# @param filename [String] path to the original file. Note that we use {#filename} to
# derivate {#input_uri}
# @param file_set [FileSet] the container for the original file and its derivatives.
#
# @param output_tmp_dir [String] where we will be writing things. In using `Dir.mktmpdir`
# we're creating a sudirectory on `Dir.tmpdir`
def initialize(filename, file_set:, output_tmp_dir: Dir.mktmpdir)
@filename = filename
@file_set = file_set

Expand Down Expand Up @@ -68,7 +77,6 @@ def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
# @return [String]
#
# @see https://github.com/scientist-softserv/space_stone-serverless/blob/7f46dd5b218381739cd1c771183f95408a4e0752/awslambda/handler.rb#L58-L63
# rubocop:disable Metrics/AbcSize
# rubocop:disable Metrics/MethodLength
def preprocessed_location_template
return @preprocessed_location_template if defined?(@preprocessed_location_template)
Expand All @@ -83,21 +91,7 @@ def preprocessed_location_template
message = "#{self.class}##{__method__} did not find #{derivative_rodeo_candidate.inspect} to exist. " \
"Moving on to check the #{file_set.class}#import_url of #{file_set.import_url.inspect}"
Rails.logger.warn(message)
# If the DerivativeRodeo doesn't know about the adapter for the import, this will raise
# an error.
#
# Since the file was not pre-processed, we're likely now going to be downloading that
# file and running all of the derivatives locally.
if rodeo_conformant_uri_exists?(file_set.import_url)
message = "#{self.class}##{__method__} found #{file_set.class}#import_url of #{file_set.import_url.inspect} to exist. " \
"Perhaps there was a problem in SpaceStone downloading the file? Things should be okay."
Rails.logger.info(message)
file_set.import_url
else
message = "#{self.class}##{__method__} expected #{file_set.import_url.inspect} as specified " \
"by #{file_set.class}#import_url to exist at remote location, but it did not."
raise MissingFileError, message
end
handle_original_file_not_in_derivative_rodeo
else
message = "#{self.class}##{__method__} could not find an existing file at #{derivative_rodeo_candidate} " \
"nor a remote_url for #{file_set.class} ID=#{file_set.id}. Returning `nil' as we have no possible preprocess. " \
Expand All @@ -106,9 +100,36 @@ def preprocessed_location_template
nil
end
end
# rubocop:enable Metrics/AbcSize
# rubocop:enable Metrics/MethodLength

##
# @api private
#
# When the file does not exist in the pre-processed location (e.g. "SpaceStone") we need to
# ensure that we have something locally. We copy the {FileSet#import_url} to the {#input_uri}
# location.
#
# @return [String] should be the {#input_uri}
# @raise [DerivativeRodeo::Errors::FileMissingError] when the input_uri does not exist
def handle_original_file_not_in_derivative_rodeo
# A quick short-circuit. Don't attempt to copy. Likely already covered by the DerivativeRodeo::Generators::CopyGenerator
return input_uri if rodeo_conformant_uri_exists?(input_uri)

message = "#{self.class}##{__method__} found #{file_set.class}#import_url of #{file_set.import_url.inspect} to exist. " \
"Perhaps there was a problem in SpaceStone downloading the file? " \
"Regardless, we'll use DerivativeRodeo::Generators::CopyGenerator to ensure #{input_uri.inspect} exists. " \
"However, we'll almost certainly be generating child pages locally."
Rails.logger.info(message)

# This ensures that we have a copy of the file_set.import_uri at the input_uri location;
# we likely have this.
DerivativeRodeo::Generators::CopyGenerator.new(
input_uris: [file_set.import_url],
output_location_template: input_uri
).generated_uris.first
end
# private :handle_original_file_not_in_derivative_rodeo

def rodeo_conformant_uri_exists?(uri)
DerivativeRodeo::StorageLocations::BaseLocation.from_uri(uri).exist?
end
Expand Down
19 changes: 11 additions & 8 deletions spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
it { is_expected.to respond_to(:call) }
end

subject(:instance) { described_class.new(filename, file_set: file_set) }
subject(:instance) { described_class.new(filename, file_set: file_set, output_tmp_dir: Dir.tmpdir) }
let(:generator) { double(DerivativeRodeo::Generators::PdfSplitGenerator, generated_files: []) }

before do
Expand Down Expand Up @@ -46,22 +46,25 @@
end
end

context 'when the s3 file does not exist in the rodeo and the file sets import url exists' do
context 'when the s3 file does not exist in the rodeo and we have the local file' do
it 'is the import_url' do
expect_any_instance_of(DerivativeRodeo::Generators::CopyGenerator).not_to receive(:generated_uris)
file_set.import_url = import_url
expect(instance).to receive(:rodeo_conformant_uri_exists?).with(derivative_rodeo_preprocessed_file).and_return(false)
expect(instance).to receive(:rodeo_conformant_uri_exists?).with(file_set.import_url).and_return(true)
expect(subject).to eq(file_set.import_url)
expect(instance).to receive(:rodeo_conformant_uri_exists?).with(instance.input_uri).and_return(true)
expect(subject).to eq(instance.input_uri)
end
end

context 'when the s3 file does not exist and the given import url does NOT exist' do
it 'will raise a IiifPrint::MissingFileError' do
context 'when the s3 file does not exist and we do not have the input URI nor the given import url does NOT exist' do
let(:generator) { double(DerivativeRodeo::Generators::CopyGenerator, generated_uris: ["file:///generated/uri"]) }
it 'will invoke the DerivativeRodeo::Generators::CopyGenerator to bring the file locally' do
allow(DerivativeRodeo::Generators::CopyGenerator).to receive(:new).and_return(generator)
file_set.import_url = import_url
expect(instance).to receive(:rodeo_conformant_uri_exists?).with(derivative_rodeo_preprocessed_file).and_return(false)
expect(instance).to receive(:rodeo_conformant_uri_exists?).with(file_set.import_url).and_return(false)
expect(instance).to receive(:rodeo_conformant_uri_exists?).with(instance.input_uri).and_return(false)

expect { subject }.to raise_error(IiifPrint::MissingFileError)
expect(subject).to eq(generator.generated_uris.first)
end
end

Expand Down

0 comments on commit a544833

Please sign in to comment.