From a544833b32dcf54545230253edbdf5cb16e9e060 Mon Sep 17 00:00:00 2001
From: Jeremy Friesen <jeremy.n.friesen@gmail.com>
Date: Wed, 8 Nov 2023 16:49:19 -0500
Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=90=9B=20Leverage=20CopyGenerator=20t?=
 =?UTF-8?q?o=20ensure=20we=20have=20local=20copy?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We need to address the case where SpaceStone failed to copy the file
into S3.  When the file does not exist in S3, we can check if we have
the file locally and use that as the pre-processed location.  Otherwise,
we can copy the original remote file and proceed.

Related to:

- https://github.com/scientist-softserv/iiif_print/pull/283
- https://github.com/scientist-softserv/iiif_print/issues/282
---
 lib/iiif_print/split_pdfs/base_splitter.rb    |  2 +
 .../split_pdfs/derivative_rodeo_splitter.rb   | 59 +++++++++++++------
 .../derivative_rodeo_splitter_spec.rb         | 19 +++---
 3 files changed, 53 insertions(+), 27 deletions(-)

diff --git a/lib/iiif_print/split_pdfs/base_splitter.rb b/lib/iiif_print/split_pdfs/base_splitter.rb
index 42524149..4b859d8d 100644
--- a/lib/iiif_print/split_pdfs/base_splitter.rb
+++ b/lib/iiif_print/split_pdfs/base_splitter.rb
@@ -21,6 +21,8 @@ class BaseSplitter
       #
       # @note We're including the ** args to provide method conformity; other services require
       #       additional information (such as the FileSet)
+      #
+      # @see IiifPrint::SplitPdfs::DerivativeRodeoSplitter
       def self.call(path, **)
         new(path).to_a
       end
diff --git a/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb b/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb
index e019873d..1f9b9fd9 100644
--- a/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb
+++ b/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb
@@ -14,12 +14,21 @@ class DerivativeRodeoSplitter
       # @param filename [String] the local path to the PDFDerivativeServicele
       # @param file_set [FileSet] file set containing the PDF file to split
       #
-      # @return [Array] paths to images split from each page of PDF file
+      # @return [Array<String>] paths to images split from each page of PDF file
+      #
+      # @see IiifPrint::SplitPdfs::BaseSplitter
       def self.call(filename, file_set:)
         new(filename, file_set: file_set).split_files
       end
 
-      def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
+      ##
+      # @param filename [String] path to the original file.  Note that we use {#filename} to
+      #        derivate {#input_uri}
+      # @param file_set [FileSet] the container for the original file and its derivatives.
+      #
+      # @param output_tmp_dir [String] where we will be writing things.  In using `Dir.mktmpdir`
+      #        we're creating a sudirectory on `Dir.tmpdir`
+      def initialize(filename, file_set:, output_tmp_dir: Dir.mktmpdir)
         @filename = filename
         @file_set = file_set
 
@@ -68,7 +77,6 @@ def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
       # @return [String]
       #
       # @see https://github.com/scientist-softserv/space_stone-serverless/blob/7f46dd5b218381739cd1c771183f95408a4e0752/awslambda/handler.rb#L58-L63
-      # rubocop:disable Metrics/AbcSize
       # rubocop:disable Metrics/MethodLength
       def preprocessed_location_template
         return @preprocessed_location_template if defined?(@preprocessed_location_template)
@@ -83,21 +91,7 @@ def preprocessed_location_template
             message = "#{self.class}##{__method__} did not find #{derivative_rodeo_candidate.inspect} to exist.  " \
                       "Moving on to check the #{file_set.class}#import_url of #{file_set.import_url.inspect}"
             Rails.logger.warn(message)
-            # If the DerivativeRodeo doesn't know about the adapter for the import, this will raise
-            # an error.
-            #
-            # Since the file was not pre-processed, we're likely now going to be downloading that
-            # file and running all of the derivatives locally.
-            if rodeo_conformant_uri_exists?(file_set.import_url)
-              message = "#{self.class}##{__method__} found #{file_set.class}#import_url of #{file_set.import_url.inspect} to exist.  " \
-                        "Perhaps there was a problem in SpaceStone downloading the file?  Things should be okay."
-              Rails.logger.info(message)
-              file_set.import_url
-            else
-              message = "#{self.class}##{__method__} expected #{file_set.import_url.inspect} as specified " \
-                        "by #{file_set.class}#import_url to exist at remote location, but it did not."
-              raise MissingFileError, message
-            end
+            handle_original_file_not_in_derivative_rodeo
           else
             message = "#{self.class}##{__method__} could not find an existing file at #{derivative_rodeo_candidate} " \
                       "nor a remote_url for #{file_set.class} ID=#{file_set.id}.  Returning `nil' as we have no possible preprocess.  " \
@@ -106,9 +100,36 @@ def preprocessed_location_template
             nil
           end
       end
-      # rubocop:enable Metrics/AbcSize
       # rubocop:enable Metrics/MethodLength
 
+      ##
+      # @api private
+      #
+      # When the file does not exist in the pre-processed location (e.g. "SpaceStone") we need to
+      # ensure that we have something locally.  We copy the {FileSet#import_url} to the {#input_uri}
+      # location.
+      #
+      # @return [String] should be the {#input_uri}
+      # @raise [DerivativeRodeo::Errors::FileMissingError] when the input_uri does not exist
+      def handle_original_file_not_in_derivative_rodeo
+        # A quick short-circuit.  Don't attempt to copy.  Likely already covered by the DerivativeRodeo::Generators::CopyGenerator
+        return input_uri if rodeo_conformant_uri_exists?(input_uri)
+
+        message = "#{self.class}##{__method__} found #{file_set.class}#import_url of #{file_set.import_url.inspect} to exist.  " \
+                  "Perhaps there was a problem in SpaceStone downloading the file?  " \
+                  "Regardless, we'll use DerivativeRodeo::Generators::CopyGenerator to ensure #{input_uri.inspect} exists.  " \
+                  "However, we'll almost certainly be generating child pages locally."
+        Rails.logger.info(message)
+
+        # This ensures that we have a copy of the file_set.import_uri at the input_uri location;
+        # we likely have this.
+        DerivativeRodeo::Generators::CopyGenerator.new(
+          input_uris: [file_set.import_url],
+          output_location_template: input_uri
+        ).generated_uris.first
+      end
+      # private :handle_original_file_not_in_derivative_rodeo
+
       def rodeo_conformant_uri_exists?(uri)
         DerivativeRodeo::StorageLocations::BaseLocation.from_uri(uri).exist?
       end
diff --git a/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb b/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb
index 5822bf5b..548044cc 100644
--- a/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb
+++ b/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb
@@ -18,7 +18,7 @@
     it { is_expected.to respond_to(:call) }
   end
 
-  subject(:instance) { described_class.new(filename, file_set: file_set) }
+  subject(:instance) { described_class.new(filename, file_set: file_set, output_tmp_dir: Dir.tmpdir) }
   let(:generator) { double(DerivativeRodeo::Generators::PdfSplitGenerator, generated_files: []) }
 
   before do
@@ -46,22 +46,25 @@
       end
     end
 
-    context 'when the s3 file does not exist in the rodeo and the file sets import url exists' do
+    context 'when the s3 file does not exist in the rodeo and we have the local file' do
       it 'is the import_url' do
+        expect_any_instance_of(DerivativeRodeo::Generators::CopyGenerator).not_to receive(:generated_uris)
         file_set.import_url = import_url
         expect(instance).to receive(:rodeo_conformant_uri_exists?).with(derivative_rodeo_preprocessed_file).and_return(false)
-        expect(instance).to receive(:rodeo_conformant_uri_exists?).with(file_set.import_url).and_return(true)
-        expect(subject).to eq(file_set.import_url)
+        expect(instance).to receive(:rodeo_conformant_uri_exists?).with(instance.input_uri).and_return(true)
+        expect(subject).to eq(instance.input_uri)
       end
     end
 
-    context 'when the s3 file does not exist and the given import url does NOT exist' do
-      it 'will raise a IiifPrint::MissingFileError' do
+    context 'when the s3 file does not exist and we do not have the input URI nor the given import url does NOT exist' do
+      let(:generator) { double(DerivativeRodeo::Generators::CopyGenerator, generated_uris: ["file:///generated/uri"]) }
+      it 'will invoke the DerivativeRodeo::Generators::CopyGenerator to bring the file locally' do
+        allow(DerivativeRodeo::Generators::CopyGenerator).to receive(:new).and_return(generator)
         file_set.import_url = import_url
         expect(instance).to receive(:rodeo_conformant_uri_exists?).with(derivative_rodeo_preprocessed_file).and_return(false)
-        expect(instance).to receive(:rodeo_conformant_uri_exists?).with(file_set.import_url).and_return(false)
+        expect(instance).to receive(:rodeo_conformant_uri_exists?).with(instance.input_uri).and_return(false)
 
-        expect { subject }.to raise_error(IiifPrint::MissingFileError)
+        expect(subject).to eq(generator.generated_uris.first)
       end
     end
 

From 505a37fa1639d7dcd9b33318aad918044e088488 Mon Sep 17 00:00:00 2001
From: Jeremy Friesen <jeremy.n.friesen@gmail.com>
Date: Thu, 9 Nov 2023 16:53:39 -0500
Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=A7=B9=20Add=20more=20logging=20infor?=
 =?UTF-8?q?mation?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/services/iiif_print/derivative_rodeo_service.rb | 13 +++++++++++--
 .../split_pdfs/derivative_rodeo_splitter.rb         |  2 +-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/app/services/iiif_print/derivative_rodeo_service.rb b/app/services/iiif_print/derivative_rodeo_service.rb
index fed02b26..410abebe 100644
--- a/app/services/iiif_print/derivative_rodeo_service.rb
+++ b/app/services/iiif_print/derivative_rodeo_service.rb
@@ -130,22 +130,31 @@ def self.derivative_rodeo_uri(file_set:, filename: nil, extension: nil, adapter_
     def self.derivative_rodeo_preprocessed_directory_for(file_set:, filename:)
       # In the case of a page split from a PDF, we need to know the grandparent's identifier to
       # find the file(s) in the DerivativeRodeo.
+      ancestor_type = nil
       ancestor = if DerivativeRodeo::Generators::PdfSplitGenerator.filename_for_a_derived_page_from_a_pdf?(filename: filename)
+                   ancestor_type = :grandparent
                    IiifPrint.grandparent_for(file_set)
                  else
+                   ancestor_type = :parent
                    IiifPrint.parent_for(file_set)
                  end
       # Why might we not have an ancestor?  In the case of grandparent_for, we may not yet have run
       # the create relationships job.  We could sneak a peak in the table to maybe glean some insight.
       # However, read further the `else` clause to see the novel approach.
       if ancestor
-        ancestor.public_send(parent_work_identifier_property_name)
+        message = "#{self.class}.#{__method__} #{file_set.class} ID=#{file_set.id} and filename: #{filename.inspect}" \
+                  "has #{ancestor_type} of #{ancestor.class} ID=#{ancestor.id}"
+        Rails.logger.info(message)
+        ancestor.public_send(parent_work_identifier_property_name) ||
+          raise("Expected #{ancestor.class} ID=#{ancestor.id} (#{ancestor_type} of #{file_set.class} ID=#{file_set.id}) " \
+                "to have a present #{parent_work_identifier_property_name.inspect}")
       else
         # HACK: This makes critical assumptions about how we're creating the title for the file_set;
         # but we don't have much to fall-back on.  Consider making this a configurable function.  Or
         # perhaps this entire method should be more configurable.
         # TODO: Revisit this implementation.
-        file_set.title.first.split(".").first
+        file_set.title.first.split(".").first ||
+          raise("#{file_set.class} ID=#{file_set.id} has title #{file_set.title.first} from which we cannot infer information.")
       end
     end
 
diff --git a/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb b/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb
index 1f9b9fd9..286ed315 100644
--- a/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb
+++ b/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb
@@ -28,7 +28,7 @@ def self.call(filename, file_set:)
       #
       # @param output_tmp_dir [String] where we will be writing things.  In using `Dir.mktmpdir`
       #        we're creating a sudirectory on `Dir.tmpdir`
-      def initialize(filename, file_set:, output_tmp_dir: Dir.mktmpdir)
+      def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
         @filename = filename
         @file_set = file_set
 

From 45954c41f9c64e1278577e5ab29c7c7d9aa206a2 Mon Sep 17 00:00:00 2001
From: Rob Kaufman <rob@notch8.com>
Date: Tue, 14 Nov 2023 20:20:28 -0800
Subject: [PATCH 3/3] rubocop fixes

---
 .../iiif_print/derivative_rodeo_service.rb    | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/app/services/iiif_print/derivative_rodeo_service.rb b/app/services/iiif_print/derivative_rodeo_service.rb
index 410abebe..093982ff 100644
--- a/app/services/iiif_print/derivative_rodeo_service.rb
+++ b/app/services/iiif_print/derivative_rodeo_service.rb
@@ -110,6 +110,20 @@ def self.derivative_rodeo_uri(file_set:, filename: nil, extension: nil, adapter_
     end
     # rubocop:enable Metrics/MethodLength
 
+    ##
+    # @api public
+    #
+    # Figure out the ancestor type and ancestor
+    def self.get_ancestor(filename: nil, file_set:)
+      # In the case of a page split from a PDF, we need to know the grandparent's identifier to
+      # find the file(s) in the DerivativeRodeo.
+      if DerivativeRodeo::Generators::PdfSplitGenerator.filename_for_a_derived_page_from_a_pdf?(filename: filename)
+        [IiifPrint.grandparent_for(file_set), :grandparent]
+      else
+        [IiifPrint.parent_for(file_set), :parent]
+      end
+    end
+
     ##
     # @api public
     #
@@ -128,19 +142,12 @@ def self.derivative_rodeo_uri(file_set:, filename: nil, extension: nil, adapter_
     # @param filename [String]
     # @return [String] the dirname (without any "/" we hope)
     def self.derivative_rodeo_preprocessed_directory_for(file_set:, filename:)
-      # In the case of a page split from a PDF, we need to know the grandparent's identifier to
-      # find the file(s) in the DerivativeRodeo.
-      ancestor_type = nil
-      ancestor = if DerivativeRodeo::Generators::PdfSplitGenerator.filename_for_a_derived_page_from_a_pdf?(filename: filename)
-                   ancestor_type = :grandparent
-                   IiifPrint.grandparent_for(file_set)
-                 else
-                   ancestor_type = :parent
-                   IiifPrint.parent_for(file_set)
-                 end
+      ancestor, ancestor_type = get_ancestor(filename: filename, file_set: file_set)
+
       # Why might we not have an ancestor?  In the case of grandparent_for, we may not yet have run
       # the create relationships job.  We could sneak a peak in the table to maybe glean some insight.
       # However, read further the `else` clause to see the novel approach.
+      # rubocop:disable Style/GuardClause
       if ancestor
         message = "#{self.class}.#{__method__} #{file_set.class} ID=#{file_set.id} and filename: #{filename.inspect}" \
                   "has #{ancestor_type} of #{ancestor.class} ID=#{ancestor.id}"
@@ -156,6 +163,7 @@ def self.derivative_rodeo_preprocessed_directory_for(file_set:, filename:)
         file_set.title.first.split(".").first ||
           raise("#{file_set.class} ID=#{file_set.id} has title #{file_set.title.first} from which we cannot infer information.")
       end
+      # rubocop:enable Style/GuardClause
     end
 
     def initialize(file_set)