Skip to content

Commit bcbcf40

Browse files
authored
Direct upload of large files (#550)
* for direct upload to s3 * for direct upload (and download) to (and from) s3 * add the file_actor_decorator with the s3 upload code in it... 1GB threshold * we've already subclassed file_set_indexer so leave decorator alone for one-line change * pulled over * debug import url errors * turn off puma-hang-on * more debug for importUrlJob * remove rescue get error * remove url_job_override, update Rob's code to fit the Hyrax version we are using * add some debug to downloads controller to see what's up with filename * don't use the S3 URL to name the file * file_set.filename not file_set.file_name :/ * debug * remove debug, using @file_set.label for file.file_name when doing the external file attaching thing
1 parent 5274278 commit bcbcf40

File tree

8 files changed

+95
-21
lines changed

8 files changed

+95
-21
lines changed
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
require 'digest'

module Hyrax
  module Actors
    # Actions for a file identified by file_set and relation (maps to use predicate)
    # @note Spawns asynchronous jobs
    #
    # OVERRIDE: files at or above DIRECT_UPLOAD_THRESHOLD bypass Fedora and are
    # uploaded directly to S3, keyed by their sha1 digest; the digest is stored
    # on the file_set (s3_only) so downloads can locate the object later.
    module FileActorDecorator
      # Size at or above which a file is stored in S3 instead of Fedora.
      DIRECT_UPLOAD_THRESHOLD = 1.gigabyte

      # Persists the uploaded file — directly to S3 for large files, via
      # Hydra::Works into Fedora otherwise — then versions the repository
      # file and enqueues characterization.
      #
      # @param io [JobIoWrapper] wrapper exposing #size, #path and #uploaded_file
      # @return [false, void] false when the file_set fails to save
      def ingest_file(io)
        Rails.logger.info("[FileActor] starting write for #{file_set.id}")
        if io.size.to_i >= DIRECT_UPLOAD_THRESHOLD
          Rails.logger.info("[FileActor] Uploading directly to S3 for file_set #{file_set.id}")
          # Compute the checksum in-process. The previous `sha1sum #{io.path}`
          # shell-out was a command-injection risk (unescaped path) and slower.
          digest = Digest::SHA1.file(io.path).hexdigest
          file_set.s3_only = digest
          s3_object = Aws::S3::Object.new(ENV['AWS_BUCKET'], digest)
          s3_object.upload_file(io.path) unless s3_object.exists?
          Hydra::Works::AddExternalFileToFileSet.call(file_set, s3_object.public_url, relation)
          # NOTE(review): the sha is expected to be indexed from file_set.s3_only
          # by the file set indexer — confirm digest_ssim appears in Solr.
        else
          Rails.logger.info("[FileActor] writing to fcrepo #{file_set.id}")
          # Skip versioning because versions will be minted by VersionCommitter
          # as necessary during save_characterize_and_record_committer.
          Hydra::Works::AddFileToFileSet.call(file_set,
                                              io,
                                              relation,
                                              versioning: false)
        end
        return false unless file_set.save

        repository_file = related_file
        Hyrax::VersioningService.create(repository_file, user)
        # Hint the local path in case the next worker is on the same filesystem.
        pathhint = io.uploaded_file.uploader.path if io.uploaded_file
        CharacterizeJob.perform_later(file_set, repository_file.id, pathhint || io.path)
      end
    end
  end
end

Hyrax::Actors::FileActor.prepend(Hyrax::Actors::FileActorDecorator)

app/controllers/hyrax/downloads_controller.rb

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,29 @@ def item_identifier_for_irus_analytics
4141
# OVERRIDE Hyrax 2.9.6 allow downloading directly from S3
#
# Redirects to a time-limited presigned S3 URL when S3 downloads are
# enabled and the object exists in the bucket; otherwise falls back to
# streaming the file from Fedora exactly as stock Hyrax does.
def send_file_contents
  if ENV['S3_DOWNLOADS']
    # Files uploaded directly to S3 carry their sha1 in s3_only; for
    # everything else the Fedora digest (minus its urn prefix) is the key.
    s3_key = if asset.respond_to?(:s3_only) && asset.s3_only
               asset.s3_only
             else
               file.digest.first.to_s.gsub('urn:sha1:', '')
             end
    s3_object = Aws::S3::Object.new(ENV['AWS_BUCKET'], s3_key)
    if s3_object.exists?
      # Quote (and escape quotes in) the filename so names containing
      # spaces or semicolons do not break the Content-Disposition header.
      safe_name = file.original_name.to_s.gsub('"', '\\"')
      redirect_to s3_object.presigned_url(
        :get,
        expires_in: 3600,
        response_content_disposition: "attachment; filename=\"#{safe_name}\""
      )
      return
    end
  end
  # from here on this is effectively `super` if this was a decorator
  # will fall back to streaming object via fedora
  self.status = 200
  prepare_file_headers
  stream_body file.stream
end
5268

5369
# Override the Hydra::Controller::DownloadBehavior#content_options so that

app/indexers/file_set_indexer.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ def generate_solr_document
33

44
super.tap do |solr_doc|
55
solr_doc['hasFormat_ssim'] = object.rendering_ids
6+
solr_doc['digest_ssim'] = "urn:sha1:#{object.s3_only}" if object.s3_only.present?
67
end
78

89
rescue Ldp::HttpError => exception

app/models/file_set.rb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,12 @@
33
# Generated by hyrax:models:install
class FileSet < ActiveFedora::Base
  include Ubiquity::UniversalMetadata

  # sha1 digest of a file that was uploaded directly to S3 instead of
  # Fedora; its presence signals that the bytes live only in S3 (the
  # downloads controller uses this value as the S3 object key). Indexed
  # as stored_searchable + facetable.
  property :s3_only,
           predicate: ::RDF::URI("https://hykucommons.org/terms/s3_only"),
           multiple: false do |index|
    index.as :stored_searchable, :facetable
  end

  include ::Hyrax::FileSetBehavior
end

bin/web

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ if ENV['GOOGLE_OAUTH_PRIVATE_KEY_VALUE'] && !ENV['GOOGLE_OAUTH_PRIVATE_KEY_VALUE
33
%x{echo #{ENV['GOOGLE_OAUTH_PRIVATE_KEY_VALUE']} | base64 --decode > prod-cred.p12}
44
end
55

6-
exec "bundle exec puma -v -b tcp://0.0.0.0:3000"
6+
exec "bundle exec puma -v -b tcp://0.0.0.0:3000"

config/environments/production.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848

4949
# Use the lowest log level to ensure availability of diagnostic information
5050
# when problems arise.
51-
config.log_level = :info
51+
config.log_level = :debug
5252

5353
# Prepend all log lines with the following tags.
5454
config.log_tags = [ :request_id ]
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# OVERRIDE Hydra-works 2.0.0 to deal with fcrepo + s3's inability to upload empty files
module Hydra
  module Works
    module UpdaterDecorator
      # Populates current_file's attributes for a file whose bytes live
      # externally (in S3) rather than in Fedora.
      #
      # @param external_file_url [String] public URL of the external file
      # @param filename [String, nil] ignored — the file_set label is used
      #   instead, because naming the file after an S3-key URL is
      #   problematic for humans
      def attach_attributes(external_file_url, filename = nil)
        # Fedora/S3 reject empty bodies, so store a one-byte placeholder.
        placeholder = StringIO.new('-')
        current_file.content = placeholder
        current_file.original_name = @file_set.label
        current_file.mime_type = "message/external-body; access-type=URL; URL=\"#{external_file_url}\""
      end
    end
  end
end

Hydra::Works::AddExternalFileToFileSet::Updater.prepend(Hydra::Works::UpdaterDecorator)

ops/provision/main.tf

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -38,31 +38,30 @@ data "local_file" "efs_name" {
3838
filename = "efs_name"
3939
}
4040

41+
resource "helm_release" "aws-load-balancer" {
42+
chart = "aws-load-balancer-controller"
43+
name = "aws-load-balancer-controller"
44+
namespace = "kube-system"
45+
repository = "https://aws.github.io/eks-charts"
46+
set {
47+
name = "clusterName"
48+
value = "r2-bl"
49+
}
50+
}
51+
4152
# ingress-nginx controller, pinned to chart 4.5.2; values come from the
# checked-in k8s/ingress-nginx-values.yaml.
resource "helm_release" "ingress-nginx" {
  name             = "ingress-nginx"
  namespace        = "ingress-nginx"
  create_namespace = true
  version          = "4.5.2"
  repository       = "https://kubernetes.github.io/ingress-nginx"
  chart            = "ingress-nginx"
  # The controller's Service needs the AWS load balancer controller in
  # place before it can be reconciled.
  depends_on = [helm_release.aws-load-balancer]
  values = [
    file("k8s/ingress-nginx-values.yaml")
  ]
}
5264

53-
resource "helm_release" "eks_efs_csi_driver" {
54-
chart = "aws-efs-csi-driver"
55-
name = "efs"
56-
namespace = "storage"
57-
create_namespace = true
58-
repository = "https://kubernetes-sigs.github.io/aws-efs-csi-driver/"
59-
60-
set {
61-
name = "image.repository"
62-
value = "602401143452.dkr.ecr.${var.region}.amazonaws.com/eks/aws-efs-csi-driver"
63-
}
64-
}
65-
6665
resource "kubernetes_storage_class" "storage_class" {
6766
storage_provisioner = "efs.csi.aws.com"
6867

0 commit comments

Comments
 (0)