From 5658c545c7ac3f93ac0401a58a4e550610c1391c Mon Sep 17 00:00:00 2001 From: Nick Budak Date: Tue, 23 Jan 2024 16:15:16 -0800 Subject: [PATCH] Report indexing status to SDR during geo indexing See #1299 --- Gemfile | 1 + Gemfile.lock | 14 ++++++++ config/settings.yml | 8 +++++ lib/sdr_events.rb | 56 ++++++++++++++++++++++++++++++++ lib/traject/config/geo_config.rb | 53 +++++++++++++++++++----------- 5 files changed, 113 insertions(+), 19 deletions(-) create mode 100644 lib/sdr_events.rb diff --git a/Gemfile b/Gemfile index 7a896bdd8..0d880a590 100644 --- a/Gemfile +++ b/Gemfile @@ -43,3 +43,4 @@ gem 'activesupport', '~> 7.0' gem 'slop' gem 'factory_bot', '~> 6.2' +gem 'dor-event-client' diff --git a/Gemfile.lock b/Gemfile.lock index 27449751f..014bce88f 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -31,6 +31,7 @@ GEM public_suffix (>= 2.0.2, < 6.0) airbrussh (1.5.1) sshkit (>= 1.6.1, != 1.7.0) + amq-protocol (2.3.2) ast (2.4.2) base64 (0.2.0) bigdecimal (3.1.6) @@ -38,6 +39,9 @@ GEM bundler-audit (0.9.1) bundler (>= 1.2.0, < 3) thor (~> 1.0) + bunny (2.22.0) + amq-protocol (~> 2.3, >= 2.3.1) + sorted_set (~> 1, >= 1.0.2) capistrano (3.18.0) airbrussh (>= 1.0.0) i18n @@ -81,6 +85,10 @@ GEM capistrano-shared_configs docile (1.4.0) domain_name (0.6.20240107) + dor-event-client (1.0.0) + activesupport (>= 4.2, < 8) + bunny (~> 2.17) + zeitwerk (~> 2.1) dor-rights-auth (1.8.0) nokogiri dot-properties (0.1.4) @@ -228,6 +236,7 @@ GEM zeitwerk (~> 2.6) rainbow (3.1.1) rake (13.1.0) + rbtree (0.4.6) rdoc (6.6.2) psych (>= 4.0.0) regexp_parser (2.9.0) @@ -266,6 +275,7 @@ GEM ruby-progressbar (1.13.0) ruby2_keywords (0.0.5) scrub_rb (1.0.1) + set (1.1.0) simplecov (0.22.0) docile (~> 1.1) simplecov-html (~> 0.11) @@ -273,6 +283,9 @@ GEM simplecov-html (0.12.3) simplecov_json_formatter (0.1.4) slop (4.10.1) + sorted_set (1.0.3) + rbtree + set (~> 1.0) sshkit (1.22.0) mutex_m net-scp (>= 1.1.2) @@ -336,6 +349,7 @@ DEPENDENCIES debouncer debug dlss-capistrano + dor-event-client dor-rights-auth factory_bot (~> 6.2) honeybadger diff --git a/config/settings.yml b/config/settings.yml index 69eb89b94..f0ecabaee 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -1,6 +1,14 @@ kafka: hosts: localhost:9092 +sdr_events: + enabled: true + mq: + vhost: / + hostname: localhost + username: guest + password: guest + environments: folio_test: database_url: <%= ENV['FOLIO_TEST_DATABASE_URL'] || ENV['DATABASE_URL'] %> diff --git a/lib/sdr_events.rb b/lib/sdr_events.rb new file mode 100644 index 000000000..32163fc11 --- /dev/null +++ b/lib/sdr_events.rb @@ -0,0 +1,56 @@ +# frozen_string_literal: true + +require 'socket' + +# Reports indexing events to SDR via message queue +# See: https://github.com/sul-dlss/dor-event-client +# See also the HTTP API: https://sul-dlss.github.io/dor-services-app/#tag/events +class SdrEvents + class << self + def configure( + hostname: ENV.fetch('SDR_EVENTS_MQ_HOSTNAME', ::Settings.sdr_events.mq.hostname), + vhost: ENV.fetch('SDR_EVENTS_MQ_VHOST', ::Settings.sdr_events.mq.vhost), + username: ENV.fetch('SDR_EVENTS_MQ_USERNAME', ::Settings.sdr_events.mq.username), + password: ENV.fetch('SDR_EVENTS_MQ_PASSWORD', ::Settings.sdr_events.mq.password) + ) + Dor::Event::Client.configure(hostname:, vhost:, username:, password:) + end + + def enabled? + ::Settings.sdr_events.enabled == true + end + + # Item was added/updated in the index + def report_indexing_success(druid) + create_event(type: 'indexing_success', druid:) + end + + # Item was removed from the index (e.g. via unrelease) + def report_indexing_deleted(druid) + create_event(type: 'indexing_deleted', druid:) + end + + # Item has missing or inappropriately formatted metadata + def report_indexing_skipped(druid, message:) + create_event(type: 'indexing_skipped', druid:, data: { message: }) + end + + # Exception was raised during indexing; provides optional context + def report_indexing_errored(druid, message:, context: nil) + create_event(type: 'indexing_errored', druid:, data: { message:, context: }.compact) + end + + # Generic event creation; prefer more specific methods + def create_event(druid:, type:, data: {}) + Dor::Event::Client.create(type:, druid:, data: base_data.merge(data)) if enabled? + end + + # Logged with every event + def base_data + @base_data ||= { + host: Socket.gethostname, + invoked_by: 'indexer' + } + end + end +end diff --git a/lib/traject/config/geo_config.rb b/lib/traject/config/geo_config.rb index 2dbdf0d9c..fe93f37bb 100644 --- a/lib/traject/config/geo_config.rb +++ b/lib/traject/config/geo_config.rb @@ -65,6 +65,8 @@ def log_skip(context) indexer = self +SdrEvents.configure + # rubocop:disable Metrics/BlockLength settings do provide 'writer_class_name', 'Traject::SolrBetterJsonWriter' @@ -97,10 +99,13 @@ def log_skip(context) provide 'solr_json_writer.http_client', (HTTPClient.new.tap { |x| x.receive_timeout = 600 }) provide 'solr_json_writer.skippable_exceptions', [HTTPClient::TimeoutError, StandardError] - provide 'mapping_rescue', (lambda do |context, e| - Honeybadger.notify(e, context: { record: context.record_inspect, index_step: context.index_step.inspect }) + provide 'mapping_rescue', (lambda do |traject_context, err| + context = { record: traject_context.record_inspect, index_step: traject_context.index_step.inspect } + + Honeybadger.notify(err, context:) + SdrEvents.report_indexing_errored(traject_context.source_record_id, message: err.message, context:) - indexer.send(:default_mapping_rescue).call(context, e) + indexer.send(:default_mapping_rescue).call(traject_context, err) end) end # rubocop:enable Metrics/BlockLength @@ -148,26 +153,35 @@ def geoserver_url(record) context.clipboard[:benchmark_start_time] = Time.now end -## # Skip records that have a delete field each_record do |record, context| - if record.is_a?(Hash) && record[:delete] - context.output_hash['id'] = [record[:id].sub('druid:', '')] - context.skip!('Delete') - end + next unless record.is_a?(Hash) && record[:delete] + + context.output_hash['id'] = [record[:id].sub('druid:', '')] + SdrEvents.report_indexing_deleted(record.druid) + context.skip!('Delete') end -to_field 'dc_identifier_s' do |record, accumulator| - accumulator << "#{settings['purl.url']}/#{record.druid}" +# Skip records with no public XML +each_record do |record, context| + next if record.public_xml? + + message = 'Item is in processing or does not exist' + SdrEvents.report_indexing_skipped(record.druid, message:) + context.skip!(message) end +# Skip records with content types that we can't index each_record do |record, context| - context.skip!('This item is in processing or does not exist') unless record.public_xml? next if %w[image map book geo file].include?(record.dor_content_type) || record.collection? - context.skip!( - "This content type: #{record.dor_content_type} is not supported" - ) + message = "This content type: #{record.dor_content_type} is not supported" + SdrEvents.report_indexing_skipped(record.druid, message:) + context.skip!(message) +end + +to_field 'dc_identifier_s' do |record, accumulator| + accumulator << "#{settings['purl.url']}/#{record.druid}" end to_field 'dc_title_s', stanford_mods(:sw_short_title, default: '[Untitled]') @@ -426,23 +440,24 @@ def geoserver_url(record) end end -each_record do |_record, context| +each_record do |record, context| # Make sure that this field is single valued. GeoBlacklight at the moment only # supports single valued srpt if context.output_hash['solr_geom'].present? context.output_hash['solr_geom'] = context.output_hash['solr_geom'].first else - context.skip!( - "No ENVELOPE available for #{context.output_hash['id']}" - ) + message = "No ENVELOPE available for #{context.output_hash['id']}" + SdrEvents.report_indexing_skipped(record.druid, message:) + context.skip!(message) end end -each_record do |_record, context| +each_record do |record, context| t0 = context.clipboard[:benchmark_start_time] t1 = Time.now logger.debug('geo_config.rb') { "Processed #{context.output_hash['id']} (#{t1 - t0}s)" } + SdrEvents.report_indexing_success(record.druid) end # rubocop:disable Metrics/MethodLength