Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,38 @@
!.env.development
!.env.test
*~undo-tree~

# Ignore Gemfile.lock
Gemfile.lock

# Ignore coverage reports
coverage/*

# Ignore bundler config.
.bundle

# Ignore JetBrains IDE folder
.idea/*

# Ignore the default SQLite database.
/db/*.sqlite3
/db/*.sqlite3-journal

# Ignore all logfiles and tempfiles.
/log/*
/tmp/*
!/log/.keep
!/tmp/.keep
fits.log

# Ignore Byebug command history file.
.byebug_history

# ignore gem builds
*.gem
pkg/
*~undo-tree~
.DS_Store

# ignore local config settings
.vscode/settings.json
14 changes: 2 additions & 12 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,5 @@
source 'https://rubygems.org'
git_source(:github) { |repo| "git@github.com:#{repo}.git" }

gem 'aws-sdk-s3'
gem 'aws-sdk-sqs'
gem 'dotenv'
gem 'httparty'
gem 'nokogiri'
gem 'pry', group: %i[development test]
gem 'rake', group: %i[development test]
gem 'rspec', group: %i[development test]
gem 'rubocop', group: %i[development test]
gem 'rubocop-rake', group: %i[development test]
gem 'rubocop-rspec', group: %i[development test]
gem 'rubyzip'
# Specify your gem's dependencies in space_stone.gemspec
gemspec
30 changes: 16 additions & 14 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,10 +1,21 @@
PATH
remote: .
specs:
space_stone (0.1.0)
aws-sdk-s3
aws-sdk-sqs
dotenv
httparty
nokogiri
rubyzip

GEM
remote: https://rubygems.org/
specs:
ast (2.4.2)
aws-eventstream (1.2.0)
aws-partitions (1.635.0)
aws-sdk-core (3.153.0)
aws-partitions (1.636.0)
aws-sdk-core (3.154.0)
aws-eventstream (~> 1, >= 1.0.2)
aws-partitions (~> 1, >= 1.525.0)
aws-sigv4 (~> 1.1)
Expand Down Expand Up @@ -34,19 +45,15 @@ GEM
mime-types-data (~> 3.2015)
mime-types-data (3.2022.0105)
multi_xml (0.6.0)
nokogiri (1.13.8-arm64-darwin)
racc (~> 1.4)
nokogiri (1.13.8-x86_64-darwin)
racc (~> 1.4)
nokogiri (1.13.8-x86_64-linux)
nokogiri (1.14.2-arm64-darwin)
racc (~> 1.4)
parallel (1.22.1)
parser (3.1.2.1)
ast (~> 2.4.1)
pry (0.14.1)
coderay (~> 1.1)
method_source (~> 1.0)
racc (1.6.0)
racc (1.6.2)
rainbow (3.1.1)
rake (13.0.6)
regexp_parser (2.5.0)
Expand Down Expand Up @@ -89,18 +96,13 @@ PLATFORMS
x86_64-linux

DEPENDENCIES
aws-sdk-s3
aws-sdk-sqs
dotenv
httparty
nokogiri
pry
rake
rspec
rubocop
rubocop-rake
rubocop-rspec
rubyzip
space_stone!

BUNDLED WITH
2.2.26
12 changes: 12 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
# frozen_string_literal: true

# Rails.application.load_tasks

# Load the bundle first, so that a missing Bundler yields the hint below before
# anything else tries to use it.
begin
  require 'bundler/setup'
rescue LoadError
  puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
end

# Requiring bundler/gem_tasks already calls Bundler::GemHelper.install_tasks.
# Calling it a second time would register every build/install/release action
# twice, so it is intentionally not repeated here.
require 'bundler/gem_tasks'

begin
require 'rspec/core/rake_task'

Expand Down
15 changes: 15 additions & 0 deletions bin/console
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

require 'bundler/setup'
require 'space_stone'

# Put fixtures or set-up code here to make experimenting with the gem easier.
#
# To use Pry instead of IRB, make sure pry is available in the bundle and then
# replace the IRB lines below with:
# require 'pry'
# Pry.start

require 'irb'
IRB.start(__FILE__)
164 changes: 111 additions & 53 deletions lib/space_stone.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,71 +2,129 @@

require 'json'
require 'dotenv'
require_relative "./space_stone/version"
require_relative './space_stone/env'
require_relative './space_stone/ia_download'
require_relative './space_stone/ocrcelot'
require_relative './space_stone/s3_service'
require_relative './space_stone/sqs_service'

# Invokers
def download(event:, context:) # rubocop:disable Lint/UnusedMethodArgument
puts "event: #{event.inspect}" unless SpaceStone::Env.test?
ia_ids = get_event_body(event: event)
results = {}

ia_ids.each do |ia_id|
jp2s = process_ia_id(ia_id.strip)
results[ia_id] = jp2s.map { |v| v.sub('/tmp/', '') }
puts %x{rm -rf /tmp/#{ia_id}}
end
send_results(results)
end
module SpaceStone
module Invoker
##
# Look up the invoker registered for +command+ and call it with the given keyword arguments.
#
# @param command [Symbol] The name of the command you want to invoke
# @param env [#invoker_for] The configuration used to resolve the invoker for this instance of
#   SpaceStone.
# @param scope [Module] The place where we'll look for a sub-module invoker.
# @param kwargs [Hash<Symbol,Object>] Forwarded untouched to the resolved invoker's `call`.
# @return [Object] whatever the resolved invoker's `call` returns.
#
# @note Why this arrangement? By the construction of a SpaceStone, you need methods in the
#   global namespace (e.g. in the Kernel). And each project will have its own set of
#   invokers. Some of those invokers would be re-used, for example the OCR invoker.
#   However, the invoker for downloading is perhaps unique. What this allows for is a
#   common repository to house scripts that might be generally useful and repurposable.
def self.invoke(command, scope: self, env: SpaceStone::Env, **kwargs)
  invoker = env.invoker_for(command, scope: scope)
  invoker.call(**kwargs)
end

def ocr(event:, context:) # rubocop:disable Lint/UnusedMethodArgument
puts "event: #{event.inspect}" unless SpaceStone::Env.test?
s3_locations = get_event_body(event: event)
results = []
s3_locations.each do |s3_location|
path = SpaceStone::S3Service.download(s3_location)
ocr_path = SpaceStone::Ocrcelot.new(path: path).ocr
results << ocr_path
SpaceStone::S3Service.upload(ocr_path)
puts "remove tmp files:"
puts %x{rm -v #{path} #{ocr_path}}
rescue Aws::S3::Errors::NotFound
puts "file #{s3_location} not found. skipping"
end
# @abstract
class Base
# Convenience entry point: build an invoker for the given event/context and run it.
#
# @param event [Object] handed to the constructor unchanged
# @param context [Object] handed to the constructor unchanged
# @return [Object] the result of the instance-level #call
def self.call(event:, context:)
  invoker = new(event: event, context: context)
  invoker.call
end

send_results(results)
end
# Stores the raw event and context, then derives the request body from the event.
#
# @param event [Hash] the incoming event; read by body_from
# @param context [Object] kept as-is and exposed through the reader
def initialize(event:, context:)
  @event = event
  @context = context
  @body = body_from(event: event)
end
attr_reader :event, :context, :body

# Runs #invoke and wraps whatever it returns using response_for.
#
# @return [Object] the value produced by response_for
def call
  results = invoke
  response_for(body: results)
end

# Performs the invoker's actual work; this base implementation only raises, so
# concrete subclasses are expected to override it. The value returned by an
# override is what #call passes to response_for as the body.
#
# @raise [NotImplementedError] always, in this base implementation
def invoke
raise NotImplementedError
end

private

# Builds the JSON response string returned by #call.
#
# The parameter is a keyword because #call invokes this as `response_for(body: ...)`.
# With a positional parameter that call bound the whole `{ body: ... }` hash to
# `body`, nesting the results one level too deep in the serialized response.
#
# NOTE(review): `headers` is serialized as an array holding one hash, exactly as
# before; confirm that consumers of this response expect that shape.
#
# @param body [Object] JSON-serializable results of the invocation
# @return [String] JSON document with statusCode, headers and body
def response_for(body:)
  {
    statusCode: 200,
    headers: [{ 'Content-Type' => 'application/json' }],
    body: body
  }.to_json
end

# Helpers
def process_ia_id(ia_id)
FileUtils.mkdir_p("/tmp/#{ia_id}")
# download zip file
ia_download = SpaceStone::IaDownload.new(id: ia_id)
downloads = ia_download.download_jp2s
downloads += ia_download.dataset_files
downloads.each do |path|
SpaceStone::S3Service.upload(path)
SpaceStone::SqsService.add(message: path.sub('/tmp/', ''), queue: 'ocr') if path.match(/jp2$/)
# Extracts the request payload from an event.
#
# * With a 'Records' entry, each record's 'body' is parsed as JSON and the
#   parsed values are flattened into a single array.
# * With a truthy 'isBase64Encoded', 'body' is Base64-decoded and parsed as JSON.
# * Otherwise 'body' is returned untouched.
#
# @param event [Hash] the incoming event
# @return [Object] the extracted payload
def body_from(event:)
  records = event['Records']
  return records.map { |record| JSON.parse(record['body']) }.flatten if records
  return JSON.parse(Base64.decode64(event['body'])) if event['isBase64Encoded']

  event['body']
end
end

# Invoker that treats the event body as a list of Internet Archive identifiers.
# For each one it fetches the item's files through SpaceStone::IaDownload,
# uploads them with SpaceStone::S3Service and queues every path matching
# /jp2$/ on the 'ocr' queue.
class DownloadInternetArchive < Base
  # @return [Hash{String=>Array<String>}] keyed by each identifier exactly as it
  #   was received; the values are the processed paths with '/tmp/' removed.
  def invoke
    puts "event: #{event.inspect}" unless SpaceStone::Env.test?

    body.each_with_object({}) do |ia_id, results|
      identifier = ia_id.strip
      paths = process_ia_id(identifier)
      results[ia_id] = paths.map { |path| path.sub('/tmp/', '') }
      remove_work_dir(identifier)
    end
  end

  private

  # Downloads the files for one identifier into /tmp/<ia_id>, uploads each of
  # them and queues the JP2s for OCR.
  #
  # @param ia_id [String] identifier, already stripped of surrounding whitespace
  # @return [Array<String>] every downloaded path (JP2s plus dataset files)
  def process_ia_id(ia_id)
    FileUtils.mkdir_p("/tmp/#{ia_id}")
    # download zip file
    ia_download = SpaceStone::IaDownload.new(id: ia_id)
    downloads = ia_download.download_jp2s + ia_download.dataset_files
    downloads.each do |path|
      SpaceStone::S3Service.upload(path)
      SpaceStone::SqsService.add(message: path.sub('/tmp/', ''), queue: 'ocr') if path.match?(/jp2$/)
    end
    downloads
  end

  # Deletes the working directory created by process_ia_id.
  #
  # The removal is done in-process, not through an interpolated `rm -rf`
  # shell string: an identifier with surrounding whitespace (" abc" became
  # `rm -rf /tmp/ abc`) or shell metacharacters could delete or execute far more
  # than intended. A blank identifier is skipped so "/tmp/" itself is never removed.
  def remove_work_dir(identifier)
    return if identifier.empty?

    FileUtils.rm_rf("/tmp/#{identifier}")
  end
end

# Invoker that treats the event body as a list of S3 locations. Each one is
# downloaded with SpaceStone::S3Service, run through SpaceStone::Ocrcelot, and
# the OCR output is uploaded again. Locations missing from S3 are reported and
# skipped.
class Ocr < Base
  # @return [Array] the OCR output path for every location that was processed
  def invoke
    puts "event: #{event.inspect}" unless SpaceStone::Env.test?

    body.each_with_object([]) do |s3_location, results|
      path = SpaceStone::S3Service.download(s3_location)
      ocr_path = SpaceStone::Ocrcelot.new(path: path).ocr
      results << ocr_path
      SpaceStone::S3Service.upload(ocr_path)
      puts "remove tmp files:"
      # Pass the paths as separate arguments so no shell ever parses them:
      # event-derived names containing spaces or metacharacters can neither
      # break the command nor inject another one. `--` stops a leading '-'
      # in a path from being read as an option.
      system('rm', '-v', '--', path.to_s, ocr_path.to_s)
    rescue Aws::S3::Errors::NotFound
      puts "file #{s3_location} not found. skipping"
    end
  end
end
end
end

def get_event_body(event:)
if event['Records']
event['Records'].map { |r| JSON.parse(r['body']) }.flatten
elsif event['isBase64Encoded']
JSON.parse(Base64.decode64(event['body']))
else
event['body']
end
# Invokers
# Top-level entry point for the :download command; hands the event and context
# to SpaceStone::Invoker.invoke and returns its result.
def download(event:, context:)
  invocation_arguments = { event: event, context: context }
  SpaceStone::Invoker.invoke(:download, **invocation_arguments)
end

def send_results(results)
{
statusCode: 200,
headers: [{ 'Content-Type' => 'application/json' }],
body: results
}.to_json
# Top-level entry point for the :ocr command; hands the event and context to
# SpaceStone::Invoker.invoke and returns its result.
def ocr(event:, context:)
  invocation_arguments = { event: event, context: context }
  SpaceStone::Invoker.invoke(:ocr, **invocation_arguments)
end
23 changes: 23 additions & 0 deletions lib/space_stone/env.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,29 @@
module SpaceStone
# Load env files for various lambda envs
module Env
##
# Resolves the invoker constant for a command. The constant name comes from the
# INVOKER__DOWNLOAD / INVOKER__OCR environment variables when they are set, and
# otherwise falls back to DownloadInternetArchive / Ocr.
#
# @param command [Symbol] either :download or :ocr
# @param scope [Module] the module the constant is looked up in
# @return [Object] the constant found in +scope+
# @raise [RuntimeError] when +command+ is neither :download nor :ocr
def invoker_for(command, scope:)
  configured_name =
    case command
    when :download then ENV.fetch('INVOKER__DOWNLOAD', :DownloadInternetArchive)
    when :ocr then ENV.fetch('INVOKER__OCR', :Ocr)
    else raise "Unexpected command #{command}"
    end

  scope.const_get(configured_name.to_sym)
end

# The project this deployment is associated with, read from SPACE_STONE_PROJECT.
#
# The original "space stone" project was "nnp"; it remains the default for
# backwards compatibility, so existing set-ups see the least disruption.
def project
  ENV.fetch('SPACE_STONE_PROJECT', 'nnp')
end

# The current stage, read from STAGE_ENV; 'development' when it is not set.
def stage
  ENV.fetch('STAGE_ENV', 'development')
end
Expand Down
5 changes: 5 additions & 0 deletions lib/space_stone/version.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# frozen_string_literal: true

module SpaceStone
  # Version string of the space_stone gem.
  VERSION = '0.1.0'
end
Loading