From 69347ecb349073f043993aa2ddbd0a8fc4093583 Mon Sep 17 00:00:00 2001 From: Samuel Sciolla Date: Fri, 26 Apr 2024 14:40:31 -0400 Subject: [PATCH 1/4] Add num objects setting and incorporate in process_arch_instance --- example.env | 2 ++ lib/config.rb | 6 +++++- run_dark_blue.rb | 5 +++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/example.env b/example.env index c9845fd..47fc562 100644 --- a/example.env +++ b/example.env @@ -13,6 +13,8 @@ SETTINGS_DRY_RUN=false # Limit for the size (in bytes) of objects to be processed (optional) # This is useful for development or in environments where larger files cannot be processed. SETTINGS_OBJECT_SIZE_LIMIT= +# Number of objects per repository (e.g. Archivematica instance) to process in a run (optional) +SETTINGS_NUM_OBJECTS_PER_REPOSITORY= # Repository # Name that will be used for the repository at the beginning of bag identifiers diff --git a/lib/config.rb b/lib/config.rb index edae1ad..87e1293 100644 --- a/lib/config.rb +++ b/lib/config.rb @@ -109,6 +109,7 @@ def get_subset_by_key_stem(stem) :remove_export, :dry_run, :object_size_limit, + :num_objects_per_repo, keyword_init: true ) @@ -287,7 +288,10 @@ def self.create_config(data) remove_export: data.get_value(key: "SETTINGS_REMOVE_EXPORT", checks: [BOOLEAN_CHECK]) == "true", object_size_limit: data.get_value( key: "SETTINGS_OBJECT_SIZE_LIMIT", checks: [IntegerCheck.new], optional: true - )&.to_i + )&.to_i, + num_objects_per_repo: data.get_value( + key: SETTINGS_NUM_OBJECTS_PER_REPOSITORY, checks: [IntegerCheck.new], optional: true + ) ), repository: RepositoryConfig.new( name: data.get_value(key: "REPOSITORY_NAME"), diff --git a/run_dark_blue.rb b/run_dark_blue.rb index 93763bb..04f08e5 100644 --- a/run_dark_blue.rb +++ b/run_dark_blue.rb @@ -140,6 +140,11 @@ def process_arch_instance(arch_config) stored_date: max_updated_at, **(@object_size_limit ? {package_filter: Archivematica::SizePackageFilter.new(@object_size_limit)} : {}) ) + + if @settings.num_objects_per_repo && package_data_objs.length > @settings.num_objects_per_repo + package_data_objs = package_data_objs.take(@settings.num_objects_per_repo) + end + package_data_objs.each do |package_data| logger.debug(package_data) created = @package_repo.create( From 85b5f35f800ac642ffac4a58982d6bf66f959b5e Mon Sep 17 00:00:00 2001 From: Samuel Sciolla Date: Fri, 26 Apr 2024 14:45:15 -0400 Subject: [PATCH 2/4] Tweak language --- example.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example.env b/example.env index 47fc562..2925000 100644 --- a/example.env +++ b/example.env @@ -13,7 +13,7 @@ SETTINGS_DRY_RUN=false # Limit for the size (in bytes) of objects to be processed (optional) # This is useful for development or in environments where larger files cannot be processed. SETTINGS_OBJECT_SIZE_LIMIT= -# Number of objects per repository (e.g. Archivematica instance) to process in a run (optional) +# Number of objects per repository (e.g. Archivematica instance) to process at once (optional) SETTINGS_NUM_OBJECTS_PER_REPOSITORY= # Repository From 3e8093a4215a382ff4b903a6b10e6fb80857963d Mon Sep 17 00:00:00 2001 From: Samuel Sciolla Date: Fri, 26 Apr 2024 15:33:51 -0400 Subject: [PATCH 3/4] Add explicit sorting; update/improve service tests --- lib/archivematica.rb | 4 ++- test/test_archivematica.rb | 55 +++++++++++++++++++------------------- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/lib/archivematica.rb b/lib/archivematica.rb index 1ba573c..3b76081 100644 --- a/lib/archivematica.rb +++ b/lib/archivematica.rb @@ -184,7 +184,9 @@ def get_package_data_objects(stored_date:, package_filter: AllPackageFilter.new) logger.debug("Archivematica instance: #{@name}") packages = @api.get_packages(location_uuid: @location_uuid, stored_date: stored_date) filtered_packages = package_filter.filter(packages) - filtered_packages.map { |package| create_package_data_object(package) } + filtered_packages + .map { |package| create_package_data_object(package) } + .sort_by(&:stored_time) end end end diff --git a/test/test_archivematica.rb b/test/test_archivematica.rb index 3f3379e..3a54175 100644 --- a/test/test_archivematica.rb +++ b/test/test_archivematica.rb @@ -200,20 +200,21 @@ def setup @location_uuid = SecureRandom.uuid @stored_date = Time.utc(2024, 2, 17) - @test_packages = [ - Package.new( - uuid: "0948e2ae-eb24-4984-a71b-43bc440534d0", - path: "/storage/0948/e2ae/eb24/4984/a71b/43bc/4405/34d0/identifier-one-0948e2ae-eb24-4984-a71b-43bc440534d0", - size: 200000, - stored_date: "2024-02-18T00:00:00.000000" - ), - Package.new( - uuid: "0baa468e-dd42-49ff-ba90-5dedc30c8541", - path: "/storage/0baa/468e/dd42/49ff/ba90/5ded/c30c/8541/identifier-two-0baa468e-dd42-49ff-ba90-5dedc30c8541", - size: 500000000, - stored_date: "2024-02-19T00:00:00.000000" - ) - ] + @first_package = Package.new( + uuid: "0948e2ae-eb24-4984-a71b-43bc440534d0", + path: "/storage/0948/e2ae/eb24/4984/a71b/43bc/4405/34d0/identifier-one-0948e2ae-eb24-4984-a71b-43bc440534d0", + size: 200000, + stored_date: "2024-02-18T00:00:00.000000" + ) + @second_package = Package.new( + uuid: "0baa468e-dd42-49ff-ba90-5dedc30c8541", + path: "/storage/0baa/468e/dd42/49ff/ba90/5ded/c30c/8541/identifier-two-0baa468e-dd42-49ff-ba90-5dedc30c8541", + size: 500000000, + stored_date: "2024-02-19T00:00:00.000000" + ) + # We expect packages to be in ascending order by stored date, + # but switching it here to ensure our code corrects it if another order occurs + @test_packages = [@second_package, @first_package] @service = ArchivematicaService.new( name: "test", @@ -223,30 +224,31 @@ def setup end def test_get_package_data_objects_with_no_filter - @mock_api.expect(:get_packages, @test_packages, location_uuid: @location_uuid, stored_date: @stored_date) + @mock_api.expect( + :get_packages, @test_packages, location_uuid: @location_uuid, stored_date: @stored_date + ) package_data_objs = @service.get_package_data_objects(stored_date: @stored_date) @mock_api.verify # No objects are filtered out - first_package, second_package = @test_packages expected = [ RepositoryPackageData.new( - remote_path: first_package.path, + remote_path: @first_package.path, dir_name: "identifier-one-0948e2ae-eb24-4984-a71b-43bc440534d0", metadata: ObjectMetadata.new( - id: first_package.uuid, - title: "#{first_package.uuid} / identifier-one", + id: @first_package.uuid, + title: "#{@first_package.uuid} / identifier-one", creator: "Not available", description: "Not available" ), stored_time: Time.utc(2024, 2, 18) ), RepositoryPackageData.new( - remote_path: second_package.path, + remote_path: @second_package.path, dir_name: "identifier-two-0baa468e-dd42-49ff-ba90-5dedc30c8541", metadata: ObjectMetadata.new( - id: second_package.uuid, - title: "#{second_package.uuid} / identifier-two", + id: @second_package.uuid, + title: "#{@second_package.uuid} / identifier-two", creator: "Not available", description: "Not available" ), @@ -266,16 +268,15 @@ def test_get_package_data_objects_with_size_filter # Larger object is filtered out assert_equal 1, package_data_objs.length - assert_equal package_data_objs[0].metadata.id, @test_packages[0].uuid + assert_equal @first_package.uuid, package_data_objs[0].metadata.id end def test_get_package_data_object_when_exists - first_package = @test_packages[0] - @mock_api.expect(:get_package, first_package, [first_package.uuid]) - package_data_obj = @service.get_package_data_object(first_package.uuid) + @mock_api.expect(:get_package, @first_package, [@first_package.uuid]) + package_data_obj = @service.get_package_data_object(@first_package.uuid) @mock_api.verify assert package_data_obj.is_a?(RepositoryPackageData) - assert_equal first_package.path, package_data_obj.remote_path + assert_equal @first_package.path, package_data_obj.remote_path end def test_get_package_data_object_when_does_not_exist From e7c670db888626aa85b49f2b4d71be9dba808293 Mon Sep 17 00:00:00 2001 From: Samuel Sciolla Date: Fri, 26 Apr 2024 15:54:30 -0400 Subject: [PATCH 4/4] Fix bugs --- lib/config.rb | 4 ++-- run_dark_blue.rb | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/config.rb b/lib/config.rb index 87e1293..d9fbb53 100644 --- a/lib/config.rb +++ b/lib/config.rb @@ -290,8 +290,8 @@ def self.create_config(data) key: "SETTINGS_OBJECT_SIZE_LIMIT", checks: [IntegerCheck.new], optional: true )&.to_i, num_objects_per_repo: data.get_value( - key: SETTINGS_NUM_OBJECTS_PER_REPOSITORY, checks: [IntegerCheck.new], optional: true - ) + key: "SETTINGS_NUM_OBJECTS_PER_REPOSITORY", checks: [IntegerCheck.new], optional: true + )&.to_i ), repository: RepositoryConfig.new( name: data.get_value(key: "REPOSITORY_NAME"), diff --git a/run_dark_blue.rb b/run_dark_blue.rb index 04f08e5..6c86d7e 100644 --- a/run_dark_blue.rb +++ b/run_dark_blue.rb @@ -27,8 +27,9 @@ module ExtraBagInfoData def initialize(config) @package_repo = RepositoryPackageRepository::RepositoryPackageRepositoryFactory.for(use_db: DB) + @settings = config.settings @dispatcher = Dispatcher::APTrustDispatcher.new( - settings: config.settings, + settings: @settings, repository: config.repository, target_client: RemoteClient::RemoteClientFactory.from_config( type: config.aptrust.remote.type,