Add setting to only process a limited number of objects per run (DBAI-51) #46

Merged 4 commits on Apr 26, 2024
2 changes: 2 additions & 0 deletions example.env
@@ -13,6 +13,8 @@ SETTINGS_DRY_RUN=false
# Limit for the size (in bytes) of objects to be processed (optional)
# This is useful for development or in environments where larger files cannot be processed.
SETTINGS_OBJECT_SIZE_LIMIT=
# Number of objects per repository (e.g. Archivematica instance) to process at once (optional)
SETTINGS_NUM_OBJECTS_PER_REPOSITORY=

# Repository
# Name that will be used for the repository at the beginning of bag identifiers
4 changes: 3 additions & 1 deletion lib/archivematica.rb
@@ -184,7 +184,9 @@ def get_package_data_objects(stored_date:, package_filter: AllPackageFilter.new)
logger.debug("Archivematica instance: #{@name}")
packages = @api.get_packages(location_uuid: @location_uuid, stored_date: stored_date)
filtered_packages = package_filter.filter(packages)
filtered_packages.map { |package| create_package_data_object(package) }
filtered_packages
.map { |package| create_package_data_object(package) }
.sort_by(&:stored_time)
Review comment from the PR author:
As noted in the modified tests, my testing indicates that the API already returns packages in ascending order by stored_date, but I've added this sort as a precaution. Batching would be severely broken (i.e. we'd miss packages) if that ordering ever changed, and the sort is a low computational cost for at most a few hundred hashes/objects. (See the sketch after this file's diff for why the ordering matters.)

end
end
end
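To illustrate why the ascending order matters for batching, here is a minimal, self-contained Ruby sketch (not the application's code). It assumes, as the run script change below suggests, that each run takes the first N objects and that the next run resumes from the latest stored time it processed; the `Package` struct and the values are made up for the example.

```ruby
# Minimal sketch (not the application's code) of why the ascending sort
# matters for batching. Assume each run takes the first N objects and then
# resumes the next run from the latest stored_time it processed.
Package = Struct.new(:name, :stored_time, keyword_init: true)

packages = [
  Package.new(name: "b", stored_time: Time.utc(2024, 2, 19)),
  Package.new(name: "a", stored_time: Time.utc(2024, 2, 18)),
  Package.new(name: "c", stored_time: Time.utc(2024, 2, 20))
]

batch = packages.sort_by(&:stored_time).take(2)
cursor = batch.map(&:stored_time).max

puts batch.map(&:name).inspect  # => ["a", "b"]
puts cursor                     # => 2024-02-19 00:00:00 UTC
# With the sort, only "c" remains for the next run. Without it, a batch of
# ["b", "c"] would move the cursor past "a", and "a" would never be processed.
```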
4 changes: 4 additions & 0 deletions lib/config.rb
@@ -109,6 +109,7 @@ def get_subset_by_key_stem(stem)
:remove_export,
:dry_run,
:object_size_limit,
:num_objects_per_repo,
keyword_init: true
)

@@ -287,6 +288,9 @@ def self.create_config(data)
remove_export: data.get_value(key: "SETTINGS_REMOVE_EXPORT", checks: [BOOLEAN_CHECK]) == "true",
object_size_limit: data.get_value(
key: "SETTINGS_OBJECT_SIZE_LIMIT", checks: [IntegerCheck.new], optional: true
)&.to_i,
num_objects_per_repo: data.get_value(
key: "SETTINGS_NUM_OBJECTS_PER_REPOSITORY", checks: [IntegerCheck.new], optional: true
)&.to_i
),
repository: RepositoryConfig.new(
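For context, a minimal sketch of the parsing pattern used above: an optional setting is validated as an integer when present and left as `nil` when absent, so downstream code can treat "unset" as "no limit". The `fetch_optional_int` helper below is hypothetical and only stands in for the repository's `get_value`/`IntegerCheck` machinery, whose internals are not shown in this diff.

```ruby
# Hypothetical helper mirroring the pattern above: validate an optional
# integer setting if present, return nil if it is unset or blank.
def fetch_optional_int(env, key)
  raw = env[key]
  return nil if raw.nil? || raw.strip.empty?
  raise ArgumentError, "#{key} must be an integer, got #{raw.inspect}" unless raw.match?(/\A\d+\z/)
  raw.to_i
end

fetch_optional_int({"SETTINGS_NUM_OBJECTS_PER_REPOSITORY" => "25"}, "SETTINGS_NUM_OBJECTS_PER_REPOSITORY")  # => 25
fetch_optional_int({}, "SETTINGS_NUM_OBJECTS_PER_REPOSITORY")                                               # => nil
```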
8 changes: 7 additions & 1 deletion run_dark_blue.rb
@@ -27,8 +27,9 @@ module ExtraBagInfoData

def initialize(config)
@package_repo = RepositoryPackageRepository::RepositoryPackageRepositoryFactory.for(use_db: DB)
@settings = config.settings
@dispatcher = Dispatcher::APTrustDispatcher.new(
settings: config.settings,
settings: @settings,
repository: config.repository,
target_client: RemoteClient::RemoteClientFactory.from_config(
type: config.aptrust.remote.type,
@@ -140,6 +141,11 @@ def process_arch_instance(arch_config)
stored_date: max_updated_at,
**(@object_size_limit ? {package_filter: Archivematica::SizePackageFilter.new(@object_size_limit)} : {})
)

if @settings.num_objects_per_repo && package_data_objs.length > @settings.num_objects_per_repo
package_data_objs = package_data_objs.take(@settings.num_objects_per_repo)
end

package_data_objs.each do |package_data|
logger.debug(package_data)
created = @package_repo.create(
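The guard added above can be read as the following behavior. The `limit_batch` helper is hypothetical and extracted purely for illustration; in the PR the logic stays inline in `process_arch_instance`.

```ruby
# Hypothetical extraction of the guard added in process_arch_instance:
# a nil limit (setting unset) or a limit larger than the batch leaves the
# batch unchanged; otherwise only the first N objects are kept.
def limit_batch(objects, limit)
  return objects unless limit && objects.length > limit
  objects.take(limit)
end

objs = %w[pkg1 pkg2 pkg3]
limit_batch(objs, nil)  # => ["pkg1", "pkg2", "pkg3"]  (no limit configured)
limit_batch(objs, 5)    # => ["pkg1", "pkg2", "pkg3"]  (limit larger than the batch)
limit_batch(objs, 2)    # => ["pkg1", "pkg2"]
```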
55 changes: 28 additions & 27 deletions test/test_archivematica.rb
@@ -200,20 +200,21 @@ def setup
@location_uuid = SecureRandom.uuid
@stored_date = Time.utc(2024, 2, 17)

@test_packages = [
Package.new(
uuid: "0948e2ae-eb24-4984-a71b-43bc440534d0",
path: "/storage/0948/e2ae/eb24/4984/a71b/43bc/4405/34d0/identifier-one-0948e2ae-eb24-4984-a71b-43bc440534d0",
size: 200000,
stored_date: "2024-02-18T00:00:00.000000"
),
Package.new(
uuid: "0baa468e-dd42-49ff-ba90-5dedc30c8541",
path: "/storage/0baa/468e/dd42/49ff/ba90/5ded/c30c/8541/identifier-two-0baa468e-dd42-49ff-ba90-5dedc30c8541",
size: 500000000,
stored_date: "2024-02-19T00:00:00.000000"
)
]
@first_package = Package.new(
uuid: "0948e2ae-eb24-4984-a71b-43bc440534d0",
path: "/storage/0948/e2ae/eb24/4984/a71b/43bc/4405/34d0/identifier-one-0948e2ae-eb24-4984-a71b-43bc440534d0",
size: 200000,
stored_date: "2024-02-18T00:00:00.000000"
)
@second_package = Package.new(
uuid: "0baa468e-dd42-49ff-ba90-5dedc30c8541",
path: "/storage/0baa/468e/dd42/49ff/ba90/5ded/c30c/8541/identifier-two-0baa468e-dd42-49ff-ba90-5dedc30c8541",
size: 500000000,
stored_date: "2024-02-19T00:00:00.000000"
)
# We expect packages to be in ascending order by stored_date,
# but we reverse the order here to verify that the code corrects it if the API ever returns another order
@test_packages = [@second_package, @first_package]

@service = ArchivematicaService.new(
name: "test",
@@ -223,30 +224,31 @@ def setup
end

def test_get_package_data_objects_with_no_filter
@mock_api.expect(:get_packages, @test_packages, location_uuid: @location_uuid, stored_date: @stored_date)
@mock_api.expect(
:get_packages, @test_packages, location_uuid: @location_uuid, stored_date: @stored_date
)
package_data_objs = @service.get_package_data_objects(stored_date: @stored_date)
@mock_api.verify

# No objects are filtered out
first_package, second_package = @test_packages
expected = [
RepositoryPackageData.new(
remote_path: first_package.path,
remote_path: @first_package.path,
dir_name: "identifier-one-0948e2ae-eb24-4984-a71b-43bc440534d0",
metadata: ObjectMetadata.new(
id: first_package.uuid,
title: "#{first_package.uuid} / identifier-one",
id: @first_package.uuid,
title: "#{@first_package.uuid} / identifier-one",
creator: "Not available",
description: "Not available"
),
stored_time: Time.utc(2024, 2, 18)
),
RepositoryPackageData.new(
remote_path: second_package.path,
remote_path: @second_package.path,
dir_name: "identifier-two-0baa468e-dd42-49ff-ba90-5dedc30c8541",
metadata: ObjectMetadata.new(
id: second_package.uuid,
title: "#{second_package.uuid} / identifier-two",
id: @second_package.uuid,
title: "#{@second_package.uuid} / identifier-two",
creator: "Not available",
description: "Not available"
),
Expand All @@ -266,16 +268,15 @@ def test_get_package_data_objects_with_size_filter

# Larger object is filtered out
assert_equal 1, package_data_objs.length
assert_equal package_data_objs[0].metadata.id, @test_packages[0].uuid
assert_equal @first_package.uuid, package_data_objs[0].metadata.id
end

def test_get_package_data_object_when_exists
first_package = @test_packages[0]
@mock_api.expect(:get_package, first_package, [first_package.uuid])
package_data_obj = @service.get_package_data_object(first_package.uuid)
@mock_api.expect(:get_package, @first_package, [@first_package.uuid])
package_data_obj = @service.get_package_data_object(@first_package.uuid)
@mock_api.verify
assert package_data_obj.is_a?(RepositoryPackageData)
assert_equal first_package.path, package_data_obj.remote_path
assert_equal @first_package.path, package_data_obj.remote_path
end

def test_get_package_data_object_when_does_not_exist