diff --git a/lib/traject/config/geo_config.rb b/lib/traject/config/geo_config.rb index d3a0bbbf3..ca6f68c85 100644 --- a/lib/traject/config/geo_config.rb +++ b/lib/traject/config/geo_config.rb @@ -1,3 +1,8 @@ +# frozen_string_literal: true + +# rubocop:disable Style/GlobalVars +# rubocop:disable Style/CombinableLoops + $LOAD_PATH << File.expand_path('../..', __dir__) require 'traject' @@ -12,6 +17,7 @@ require 'digest/md5' require 'active_support' +# Mappings for Dublin Core field values class GeoAuthorities def self.formats { @@ -20,6 +26,7 @@ def self.formats } end + # rubocop:disable Metrics/MethodLength def self.subjects { 'farming' => 'Farming', @@ -43,6 +50,7 @@ def self.subjects 'economy' => 'Economy' } end + # rubocop:enable Metrics/MethodLength def self.geometry_types { @@ -55,7 +63,10 @@ def self.geometry_types end Utils.logger = logger + +# rubocop:disable Style/MixinUsage extend Traject::SolrBetterJsonWriter::IndexerPatch +# rubocop:enable Style/MixinUsage def log_skip(context) writer.put(context) @@ -65,28 +76,39 @@ def log_skip(context) indexer = self +# rubocop:disable Metrics/BlockLength settings do provide 'writer_class_name', 'Traject::SolrBetterJsonWriter' provide 'solr.url', ENV['SOLR_URL'] - provide 'purl_fetcher.skip_catkey', false provide 'solr_better_json_writer.debounce_timeout', 5 + + # These parameters are expected on the command line if you want to connect to a kafka topic: + # provide 'kafka.topic' + # provide 'kafka.consumer_group_id' if self['kafka.topic'] - provide "reader_class_name", "Traject::KafkaPurlFetcherReader" + provide 'reader_class_name', 'Traject::KafkaPurlFetcherReader' consumer = Utils.kafka.consumer(group_id: self['kafka.consumer_group_id'] || 'traject', fetcher_max_queue_size: 15) consumer.subscribe(self['kafka.topic']) provide 'kafka.consumer', consumer else - provide "reader_class_name", "Traject::DruidReader" + provide 'reader_class_name', 'Traject::DruidReader' end - provide 'purl_fetcher.target', ENV.fetch('PURL_FETCHER_TARGET', 'Earthworks') provide 'purl.url', ENV.fetch('PURL_URL', 'https://purl.stanford.edu') + provide 'stacks.url', ENV.fetch('STACKS_URL', 'https://stacks.stanford.edu') + provide 'geoserver.pub_url', ENV.fetch('GEOSERVER_PUB_URL', 'https://geowebservices.stanford.edu/geoserver') + provide 'geoserver.stan_url', ENV.fetch('GEOSERVER_STAN_URL', 'https://geowebservices-restricted.stanford.edu/geoserver') + + provide 'purl_fetcher.target', ENV.fetch('PURL_FETCHER_TARGET', 'Earthworks') + provide 'purl_fetcher.skip_catkey', ENV['PURL_FETCHER_SKIP_CATKEY'] + self['purl_fetcher.skip_catkey'] = self['purl_fetcher.skip_catkey'] != 'false' + provide 'solr_writer.commit_on_close', true if defined?(JRUBY_VERSION) require 'traject/manticore_http_client' provide 'solr_json_writer.http_client', Traject::ManticoreHttpClient.new else - provide 'solr_json_writer.http_client', HTTPClient.new.tap { |x| x.receive_timeout = 600 } + provide 'solr_json_writer.http_client', (HTTPClient.new.tap { |x| x.receive_timeout = 600 }) end provide 'solr_json_writer.skippable_exceptions', [HTTPClient::TimeoutError, StandardError] @@ -96,6 +118,7 @@ def log_skip(context) indexer.send(:default_mapping_rescue).call(context, e) end) end +# rubocop:enable Metrics/BlockLength def stanford_mods(method, *args, default: nil) lambda do |resource, accumulator, _context| @@ -120,7 +143,7 @@ def mods_display(method, *args, default: nil) data = Array(resource.mods_display.public_send(method, *args)) data.each do |v| - v.values.each do |v2| + v.each_value do |v2| accumulator << v2.to_s end end @@ -129,14 +152,14 @@ def mods_display(method, *args, default: nil) end end -module Constants - GEOWEBSERVICES = { - 'Public': 'https://geowebservices.stanford.edu/geoserver', - 'Restricted': 'https://geowebservices-restricted.stanford.edu/geoserver' - } +# Get the right geoserver url for a record given its access rights +def geoserver_url(record) + return settings['geoserver.stan_url'] if record.stanford_only? + + settings['geoserver.pub_url'] end -each_record do |record, context| +each_record do |_record, context| context.clipboard[:benchmark_start_time] = Time.now end @@ -150,14 +173,16 @@ module Constants end to_field 'dc_identifier_s' do |record, accumulator| - accumulator << "http://purl.stanford.edu/#{record.druid}" + accumulator << "#{settings['purl.url']}/#{record.druid}" end each_record do |record, context| context.skip!('This item is in processing or does not exist') unless record.public_xml? + next if %w[image map book geo file].include?(record.dor_content_type) || record.is_collection + context.skip!( "This content type: #{record.dor_content_type} is not supported" - ) unless (%w[image map book geo file].include?(record.dor_content_type) || record.is_collection) + ) end to_field 'dc_title_s', stanford_mods(:sw_short_title, default: '[Untitled]') @@ -179,7 +204,7 @@ module Constants end end -to_field 'layer_geom_type_s', mods_xpath('mods:extension[@displayLabel="geo"]//dc:type') do |record, accumulator| +to_field 'layer_geom_type_s', mods_xpath('mods:extension[@displayLabel="geo"]//dc:type') do |_record, accumulator| data = accumulator.flatten.select { |v| v.text =~ /#/ }.map { |v| v.text.split('#', 2).last }.slice(0..0) data.map! { |v| GeoAuthorities.geometry_types.fetch(v, v) } accumulator.replace(data) @@ -196,12 +221,12 @@ module Constants accumulator << record.public_xml_doc.root.attr('published') end -to_field 'dct_issued_s', mods_xpath('mods:originInfo/mods:dateIssued') do |record, accumulator| +to_field 'dct_issued_s', mods_xpath('mods:originInfo/mods:dateIssued') do |_record, accumulator| data = accumulator.flatten.map(&:text).slice(0..0) accumulator.replace(data) end -to_field 'dc_type_s', mods_xpath('mods:extension[@displayLabel="geo"]//dc:type') do |record, accumulator| +to_field 'dc_type_s', mods_xpath('mods:extension[@displayLabel="geo"]//dc:type') do |_record, accumulator| data = accumulator.flatten.map(&:text).uniq.map { |v| v.split('#', 2).first }.slice(0..0) accumulator.replace(data) end @@ -212,9 +237,11 @@ module Constants accumulator << 'Image' if %w[image map book].include?(record.dor_content_type) end -to_field 'dc_format_s', mods_xpath('mods:extension[@displayLabel="geo"]//dc:format') do |record, accumulator| - data = accumulator.flatten.map(&:text).select { |v| v =~ /format=/ }.map { |v| v.split('format=', 2).last }.slice(0..0) - if (data.present?) +to_field 'dc_format_s', mods_xpath('mods:extension[@displayLabel="geo"]//dc:format') do |_record, accumulator| + data = accumulator.flatten.map(&:text) + .select { |v| v =~ /format=/ } + .map { |v| v.split('format=', 2).last }.slice(0..0) + if data.present? accumulator.replace(data.uniq.map { |v| GeoAuthorities.formats.fetch(v, v) }) else accumulator.uniq! @@ -230,10 +257,10 @@ module Constants end to_field 'dc_language_s', stanford_mods(:sw_language_facet), first_only -to_field 'dc_subject_sm', stanford_mods(:subject_other_search) do |record, accumulator| - accumulator.map! { |val| val.sub(/[\\,;]$/, '').strip if val } +to_field 'dc_subject_sm', stanford_mods(:subject_other_search) do |_record, accumulator| + accumulator.map! { |val| val&.sub(/[\\,;]$/, '')&.strip } end -to_field 'dc_subject_sm', mods_xpath('mods:subject/mods:topic') do |record, accumulator| +to_field 'dc_subject_sm', mods_xpath('mods:subject/mods:topic') do |_record, accumulator| accumulator.flatten! accumulator.map! do |val| if val.attr('authority') =~ /ISO19115topicCategory/i @@ -264,50 +291,68 @@ module Constants to_field 'layer_availability_score_f', literal(1.0) to_field 'geoblacklight_version', literal('1.0') + +# rubocop:disable Metrics/BlockLength +# rubocop:disable Layout/LineLength to_field 'dct_references_s' do |record, accumulator, context| references = { - 'http://schema.org/url' => "https://purl.stanford.edu/#{record.druid}", - 'http://www.loc.gov/mods/v3' => "https://purl.stanford.edu/#{record.druid}.mods", + 'http://schema.org/url' => "#{settings['purl.url']}/#{record.druid}", + 'http://www.loc.gov/mods/v3' => "#{settings['purl.url']}/#{record.druid}.mods" } case record.dor_content_type when 'file' - references.merge!({ - 'https://oembed.com' => "https://purl.stanford.edu/embed.json?&hide_title=true&url=https://purl.stanford.edu/#{record.druid}", - }) + references.merge!( + { + 'https://oembed.com' => "#{settings['purl.url']}/embed.json?&hide_title=true&url=#{settings['purl.url']}/#{record.druid}" + } + ) when 'image', 'map', 'book' - references.merge!({ - 'https://oembed.com' => "https://purl.stanford.edu/embed.json?&hide_title=true&url=https://purl.stanford.edu/#{record.druid}", - 'http://iiif.io/api/presentation#manifest' => "https://purl.stanford.edu/#{record.druid}/iiif/manifest" - }) + references.merge!( + { + 'https://oembed.com' => "#{settings['purl.url']}/embed.json?&hide_title=true&url=#{settings['purl.url']}/#{record.druid}", + 'http://iiif.io/api/presentation#manifest' => "#{settings['purl.url']}/#{record.druid}/iiif/manifest" + } + ) when 'geo' formats = context.output_hash['dc_format_s'] || [] - references.merge!({ - 'http://schema.org/downloadUrl' => "https://stacks.stanford.edu/file/druid:#{record.druid}/data.zip", - 'http://www.opengis.net/def/serviceType/ogc/wms' => "#{Constants::GEOWEBSERVICES[context.output_hash['dc_rights_s'][0].to_sym]}/wms", - 'http://www.isotc211.org/schemas/2005/gmd/' => "https://raw.githubusercontent.com/OpenGeoMetadata/edu.stanford.purl/master/#{record.druid_tree}/iso19139.xml", - }) + references.merge!( + { + 'http://schema.org/downloadUrl' => "#{settings['stacks.url']}/file/druid:#{record.druid}/data.zip", + 'http://www.opengis.net/def/serviceType/ogc/wms' => "#{geoserver_url(record)}/wms", + 'http://www.isotc211.org/schemas/2005/gmd/' => "https://raw.githubusercontent.com/OpenGeoMetadata/edu.stanford.purl/master/#{record.druid_tree}/iso19139.xml" + } + ) if formats.include?('Shapefile') - references.merge!({ - 'http://www.opengis.net/def/serviceType/ogc/wfs' => "#{Constants::GEOWEBSERVICES[context.output_hash['dc_rights_s'][0].to_sym]}/wfs", - }) + references.merge!( + { + 'http://www.opengis.net/def/serviceType/ogc/wfs' => "#{geoserver_url(record)}/wfs" + } + ) elsif formats.include?('GeoTIFF') || formats.include?('ArcGRID') - references.merge!({ - 'http://www.opengis.net/def/serviceType/ogc/wcs' => "#{Constants::GEOWEBSERVICES[context.output_hash['dc_rights_s'][0].to_sym]}/wcs", - }) + references.merge!( + { + 'http://www.opengis.net/def/serviceType/ogc/wcs' => "#{geoserver_url(record)}/wcs" + } + ) end - index_map = record.public_xml_doc.xpath('//file[@id="index_map.json"]').length > 0 + index_map = !record.public_xml_doc.xpath('//file[@id="index_map.json"]').empty? if index_map - references.merge!({ - 'https://openindexmaps.org' => "https://stacks.stanford.edu/file/druid:#{record.druid}/index_map.json" - }) + references.merge!( + { + 'https://openindexmaps.org' => "#{settings['stacks.url']}/file/druid:#{record.druid}/index_map.json" + } + ) end end accumulator << references.to_json end +# rubocop:enable Metrics/BlockLength +# rubocop:enable Layout/LineLength + to_field 'solr_geom', stanford_mods(:geo_extensions_as_envelope) to_field 'solr_geom', stanford_mods(:coordinates_as_envelope) to_field 'layer_slug_s' do |record, accumulator| @@ -373,24 +418,25 @@ module Constants to_field 'dc_source_sm' do |record, accumulator| next unless record.dor_content_type == 'geo' - next unless record.collections && record.collections.any? + next unless record.collections&.any? record.collections.uniq.each do |collection| accumulator << "stanford-#{collection.druid}" end end -to_field 'dct_isPartOf_sm', mods_xpath('mods:relatedItem[@type="host"]/mods:titleInfo/mods:title') do |record, accumulator| +to_field 'dct_isPartOf_sm', + mods_xpath('mods:relatedItem[@type="host"]/mods:titleInfo/mods:title') do |_record, accumulator| accumulator.flatten! accumulator.map!(&:text) accumulator.uniq! end -each_record do |record, context| +each_record do |record, _context| $druid_title_cache[record.druid] = record.label if record.is_collection end -each_record do |record, context| +each_record do |_record, context| context.output_hash.select { |k, _v| k =~ /_struct$/ }.each do |k, v| context.output_hash[k] = Array(v).map { |x| JSON.generate(x) } end @@ -408,24 +454,23 @@ module Constants end end -each_record do |record, context| +each_record do |_record, context| t0 = context.clipboard[:benchmark_start_time] t1 = Time.now logger.debug('geo_config.rb') { "Processed #{context.output_hash['id']} (#{t1 - t0}s)" } end - +# rubocop:disable Metrics/MethodLength def trim_punctuation_when_preceded_by_two_word_characters_or_some_other_stuff(str) previous_str = nil until str == previous_str previous_str = str - str = str.strip.gsub(/ *([,\/;:])$/, '') - .sub(/(\w\w)\.$/, '\1') - .sub(/(\p{L}\p{L})\.$/, '\1') - .sub(/(\w\p{InCombiningDiacriticalMarks}?\w\p{InCombiningDiacriticalMarks}?)\.$/, '\1') - + str = str.strip.gsub(%r{ *([,/;:])$}, '') + .sub(/(\w\w)\.$/, '\1') + .sub(/(\p{L}\p{L})\.$/u, '\1') + .sub(/(\w\p{InCombiningDiacriticalMarks}?\w\p{InCombiningDiacriticalMarks}?)\.$/u, '\1') # single square bracket characters if they are the start and/or end # chars and there are no internal square brackets. @@ -438,3 +483,7 @@ def trim_punctuation_when_preceded_by_two_word_characters_or_some_other_stuff(st str end +# rubocop:enable Metrics/MethodLength + +# rubocop:enable Style/GlobalVars +# rubocop:enable Style/CombinableLoops diff --git a/spec/integration/geo_config_spec.rb b/spec/integration/geo_config_spec.rb index e8b29fa4d..6c2771040 100644 --- a/spec/integration/geo_config_spec.rb +++ b/spec/integration/geo_config_spec.rb @@ -35,7 +35,7 @@ def stub_mods_request(druid, body) stub_purl_request(druid, File.read(file_fixture("#{druid}.xml").to_s)) end it 'maps things to the right places' do - expect(result).to include 'dc_identifier_s' => ['http://purl.stanford.edu/dc482zx1528'], + expect(result).to include 'dc_identifier_s' => ['https://purl.stanford.edu/dc482zx1528'], 'dc_title_s' => ['Jōshū Kusatsu Onsenzu'], 'dc_rights_s' => ['Public'], 'layer_geom_type_s' => ['Image'], @@ -110,7 +110,7 @@ def stub_mods_request(druid, body) stub_purl_request(druid, File.read(file_fixture("#{druid}.xml").to_s)) end it 'maps the metadata' do - expect(result).to include 'dc_identifier_s' => ['http://purl.stanford.edu/vv853br8653'], + expect(result).to include 'dc_identifier_s' => ['https://purl.stanford.edu/vv853br8653'], 'dc_title_s' => ['Abundance Estimates of the Pacific Salmon Conservation Assessment Database, 1978-2008'], 'dct_provenance_s' => ['Stanford'], 'layer_geom_type_s' => ['Polygon'], @@ -231,7 +231,7 @@ def stub_mods_request(druid, body) end it 'has expected fields' do - expect(result).to include 'dc_identifier_s' => ['http://purl.stanford.edu/bq589tv8583'], + expect(result).to include 'dc_identifier_s' => ['https://purl.stanford.edu/bq589tv8583'], 'layer_geom_type_s' => ['Collection'] end it 'does not include a layer_id_s' do @@ -246,7 +246,7 @@ def stub_mods_request(druid, body) end it 'has expected fields' do - expect(result).to include 'dc_identifier_s' => ['http://purl.stanford.edu/pq479rm6462'], + expect(result).to include 'dc_identifier_s' => ['https://purl.stanford.edu/pq479rm6462'], 'dc_format_s' => ['Geodatabase'], 'layer_geom_type_s' => ['Mixed'] end