Skip to content

Commit

Permalink
Report indexing status to SDR during geo indexing
Browse files Browse the repository at this point in the history
See #1299
  • Loading branch information
thatbudakguy committed Jan 24, 2024
1 parent e0491c8 commit 5658c54
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 19 deletions.
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,4 @@ gem 'activesupport', '~> 7.0'
gem 'slop'

gem 'factory_bot', '~> 6.2'
gem 'dor-event-client'
14 changes: 14 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,17 @@ GEM
public_suffix (>= 2.0.2, < 6.0)
airbrussh (1.5.1)
sshkit (>= 1.6.1, != 1.7.0)
amq-protocol (2.3.2)
ast (2.4.2)
base64 (0.2.0)
bigdecimal (3.1.6)
builder (3.2.4)
bundler-audit (0.9.1)
bundler (>= 1.2.0, < 3)
thor (~> 1.0)
bunny (2.22.0)
amq-protocol (~> 2.3, >= 2.3.1)
sorted_set (~> 1, >= 1.0.2)
capistrano (3.18.0)
airbrussh (>= 1.0.0)
i18n
Expand Down Expand Up @@ -81,6 +85,10 @@ GEM
capistrano-shared_configs
docile (1.4.0)
domain_name (0.6.20240107)
dor-event-client (1.0.0)
activesupport (>= 4.2, < 8)
bunny (~> 2.17)
zeitwerk (~> 2.1)
dor-rights-auth (1.8.0)
nokogiri
dot-properties (0.1.4)
Expand Down Expand Up @@ -228,6 +236,7 @@ GEM
zeitwerk (~> 2.6)
rainbow (3.1.1)
rake (13.1.0)
rbtree (0.4.6)
rdoc (6.6.2)
psych (>= 4.0.0)
regexp_parser (2.9.0)
Expand Down Expand Up @@ -266,13 +275,17 @@ GEM
ruby-progressbar (1.13.0)
ruby2_keywords (0.0.5)
scrub_rb (1.0.1)
set (1.1.0)
simplecov (0.22.0)
docile (~> 1.1)
simplecov-html (~> 0.11)
simplecov_json_formatter (~> 0.1)
simplecov-html (0.12.3)
simplecov_json_formatter (0.1.4)
slop (4.10.1)
sorted_set (1.0.3)
rbtree
set (~> 1.0)
sshkit (1.22.0)
mutex_m
net-scp (>= 1.1.2)
Expand Down Expand Up @@ -336,6 +349,7 @@ DEPENDENCIES
debouncer
debug
dlss-capistrano
dor-event-client
dor-rights-auth
factory_bot (~> 6.2)
honeybadger
Expand Down
8 changes: 8 additions & 0 deletions config/settings.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
kafka:
hosts: localhost:9092

sdr_events:
enabled: true
mq:
vhost: /
hostname: localhost
username: guest
password: guest

environments:
folio_test:
database_url: <%= ENV['FOLIO_TEST_DATABASE_URL'] || ENV['DATABASE_URL'] %>
Expand Down
56 changes: 56 additions & 0 deletions lib/sdr_events.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# frozen_string_literal: true

require 'socket'

# Reports indexing events to SDR via message queue
# See: https://github.com/sul-dlss/dor-event-client
# See also the HTTP API: https://sul-dlss.github.io/dor-services-app/#tag/events
class SdrEvents
  # Sets up the AMQP connection used by Dor::Event::Client.
  # Each setting prefers an environment variable and falls back to ::Settings.
  def self.configure(
    hostname: ENV.fetch('SDR_EVENTS_MQ_HOSTNAME', ::Settings.sdr_events.mq.hostname),
    vhost: ENV.fetch('SDR_EVENTS_MQ_VHOST', ::Settings.sdr_events.mq.vhost),
    username: ENV.fetch('SDR_EVENTS_MQ_USERNAME', ::Settings.sdr_events.mq.username),
    password: ENV.fetch('SDR_EVENTS_MQ_PASSWORD', ::Settings.sdr_events.mq.password)
  )
    Dor::Event::Client.configure(hostname:, vhost:, username:, password:)
  end

  # Event reporting is opt-in: anything other than an explicit `true` disables it.
  def self.enabled?
    ::Settings.sdr_events.enabled == true
  end

  # Item was added/updated in the index
  def self.report_indexing_success(druid)
    create_event(druid:, type: 'indexing_success')
  end

  # Item was removed from the index (e.g. via unrelease)
  def self.report_indexing_deleted(druid)
    create_event(druid:, type: 'indexing_deleted')
  end

  # Item has missing or inappropriately formatted metadata
  def self.report_indexing_skipped(druid, message:)
    create_event(druid:, type: 'indexing_skipped', data: { message: })
  end

  # Exception was raised during indexing; provides optional context
  def self.report_indexing_errored(druid, message:, context: nil)
    create_event(druid:, type: 'indexing_errored', data: { message:, context: }.compact)
  end

  # Generic event creation; prefer the more specific report_* methods.
  # Silently does nothing when reporting is disabled.
  def self.create_event(druid:, type:, data: {})
    return unless enabled?

    Dor::Event::Client.create(type:, druid:, data: base_data.merge(data))
  end

  # Fields attached to every event; memoized since they are fixed per process.
  def self.base_data
    @base_data ||= {
      host: Socket.gethostname,
      invoked_by: 'indexer'
    }
  end
end
53 changes: 34 additions & 19 deletions lib/traject/config/geo_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ def log_skip(context)

indexer = self

SdrEvents.configure

# rubocop:disable Metrics/BlockLength
settings do
provide 'writer_class_name', 'Traject::SolrBetterJsonWriter'
Expand Down Expand Up @@ -97,10 +99,13 @@ def log_skip(context)
provide 'solr_json_writer.http_client', (HTTPClient.new.tap { |x| x.receive_timeout = 600 })
provide 'solr_json_writer.skippable_exceptions', [HTTPClient::TimeoutError, StandardError]

# Error handler for traject mapping failures: notify Honeybadger and SDR,
# then fall through to traject's default rescue behavior.
provide 'mapping_rescue', (lambda do |traject_context, err|
  # Shared diagnostic context sent to both Honeybadger and the SDR event.
  context = { record: traject_context.record_inspect, index_step: traject_context.index_step.inspect }

  Honeybadger.notify(err, context:)
  SdrEvents.report_indexing_errored(traject_context.source_record_id, message: err.message, context:)

  indexer.send(:default_mapping_rescue).call(traject_context, err)
end)
end
# rubocop:enable Metrics/BlockLength
Expand Down Expand Up @@ -148,26 +153,35 @@ def geoserver_url(record)
context.clipboard[:benchmark_start_time] = Time.now
end

##
# Skip records that have a delete field, reporting the removal to SDR
each_record do |record, context|
  next unless record.is_a?(Hash) && record[:delete]

  # A delete record is a bare Hash (not a PublicXmlRecord), so it has no
  # #druid method; derive the bare druid from the :id field instead.
  druid = record[:id].sub('druid:', '')
  context.output_hash['id'] = [druid]
  SdrEvents.report_indexing_deleted(druid)
  context.skip!('Delete')
end

# Skip records with no public XML, reporting the skip to SDR
each_record do |record, context|
  next if record.public_xml?

  message = 'Item is in processing or does not exist'
  SdrEvents.report_indexing_skipped(record.druid, message:)
  context.skip!(message)
end

# Skip records with content types that we can't index, reporting the skip to SDR
each_record do |record, context|
  next if %w[image map book geo file].include?(record.dor_content_type) || record.collection?

  message = "This content type: #{record.dor_content_type} is not supported"
  SdrEvents.report_indexing_skipped(record.druid, message:)
  context.skip!(message)
end

to_field 'dc_identifier_s' do |record, accumulator|
accumulator << "#{settings['purl.url']}/#{record.druid}"
end

to_field 'dc_title_s', stanford_mods(:sw_short_title, default: '[Untitled]')
Expand Down Expand Up @@ -426,23 +440,24 @@ def geoserver_url(record)
end
end

# Collapse solr_geom to a single value, or skip (and report) when absent
each_record do |record, context|
  # Make sure that this field is single valued. GeoBlacklight at the moment only
  # supports single valued srpt
  if context.output_hash['solr_geom'].present?
    context.output_hash['solr_geom'] = context.output_hash['solr_geom'].first
  else
    message = "No ENVELOPE available for #{context.output_hash['id']}"
    SdrEvents.report_indexing_skipped(record.druid, message:)
    context.skip!(message)
  end
end

# Log per-record timing and report successful indexing to SDR
each_record do |record, context|
  t0 = context.clipboard[:benchmark_start_time]
  t1 = Time.now

  logger.debug('geo_config.rb') { "Processed #{context.output_hash['id']} (#{t1 - t0}s)" }
  SdrEvents.report_indexing_success(record.druid)
end

# rubocop:disable Metrics/MethodLength
Expand Down

0 comments on commit 5658c54

Please sign in to comment.