Skip to content

Commit

Permalink
Merge pull request #744 from sul-dlss/ew-stage-indexing
Browse files Browse the repository at this point in the history
Make seams in geo config for pointing to staging servers
  • Loading branch information
thatbudakguy authored Jan 25, 2023
2 parents 98acf81 + 12c7c48 commit b2175d2
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 62 deletions.
165 changes: 107 additions & 58 deletions lib/traject/config/geo_config.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# frozen_string_literal: true

# rubocop:disable Style/GlobalVars
# rubocop:disable Style/CombinableLoops

$LOAD_PATH << File.expand_path('../..', __dir__)

require 'traject'
Expand All @@ -12,6 +17,7 @@
require 'digest/md5'
require 'active_support'

# Mappings for Dublin Core field values
class GeoAuthorities
def self.formats
{
Expand All @@ -20,6 +26,7 @@ def self.formats
}
end

# rubocop:disable Metrics/MethodLength
def self.subjects
{
'farming' => 'Farming',
Expand All @@ -43,6 +50,7 @@ def self.subjects
'economy' => 'Economy'
}
end
# rubocop:enable Metrics/MethodLength

def self.geometry_types
{
Expand All @@ -55,7 +63,10 @@ def self.geometry_types
end

Utils.logger = logger

# rubocop:disable Style/MixinUsage
extend Traject::SolrBetterJsonWriter::IndexerPatch
# rubocop:enable Style/MixinUsage

def log_skip(context)
writer.put(context)
Expand All @@ -65,28 +76,39 @@ def log_skip(context)

indexer = self

# rubocop:disable Metrics/BlockLength
settings do
provide 'writer_class_name', 'Traject::SolrBetterJsonWriter'
provide 'solr.url', ENV['SOLR_URL']
provide 'purl_fetcher.skip_catkey', false
provide 'solr_better_json_writer.debounce_timeout', 5

# These parameters are expected on the command line if you want to connect to a kafka topic:
# provide 'kafka.topic'
# provide 'kafka.consumer_group_id'
if self['kafka.topic']
provide "reader_class_name", "Traject::KafkaPurlFetcherReader"
provide 'reader_class_name', 'Traject::KafkaPurlFetcherReader'
consumer = Utils.kafka.consumer(group_id: self['kafka.consumer_group_id'] || 'traject', fetcher_max_queue_size: 15)
consumer.subscribe(self['kafka.topic'])
provide 'kafka.consumer', consumer
else
provide "reader_class_name", "Traject::DruidReader"
provide 'reader_class_name', 'Traject::DruidReader'
end

provide 'purl_fetcher.target', ENV.fetch('PURL_FETCHER_TARGET', 'Earthworks')
provide 'purl.url', ENV.fetch('PURL_URL', 'https://purl.stanford.edu')
provide 'stacks.url', ENV.fetch('STACKS_URL', 'https://stacks.stanford.edu')
provide 'geoserver.pub_url', ENV.fetch('GEOSERVER_PUB_URL', 'https://geowebservices.stanford.edu/geoserver')
provide 'geoserver.stan_url', ENV.fetch('GEOSERVER_STAN_URL', 'https://geowebservices-restricted.stanford.edu/geoserver')

provide 'purl_fetcher.target', ENV.fetch('PURL_FETCHER_TARGET', 'Earthworks')
provide 'purl_fetcher.skip_catkey', ENV['PURL_FETCHER_SKIP_CATKEY']
self['purl_fetcher.skip_catkey'] = self['purl_fetcher.skip_catkey'] != 'false'

provide 'solr_writer.commit_on_close', true
if defined?(JRUBY_VERSION)
require 'traject/manticore_http_client'
provide 'solr_json_writer.http_client', Traject::ManticoreHttpClient.new
else
provide 'solr_json_writer.http_client', HTTPClient.new.tap { |x| x.receive_timeout = 600 }
provide 'solr_json_writer.http_client', (HTTPClient.new.tap { |x| x.receive_timeout = 600 })
end
provide 'solr_json_writer.skippable_exceptions', [HTTPClient::TimeoutError, StandardError]

Expand All @@ -96,6 +118,7 @@ def log_skip(context)
indexer.send(:default_mapping_rescue).call(context, e)
end)
end
# rubocop:enable Metrics/BlockLength

def stanford_mods(method, *args, default: nil)
lambda do |resource, accumulator, _context|
Expand All @@ -120,7 +143,7 @@ def mods_display(method, *args, default: nil)
data = Array(resource.mods_display.public_send(method, *args))

data.each do |v|
v.values.each do |v2|
v.each_value do |v2|
accumulator << v2.to_s
end
end
Expand All @@ -129,14 +152,14 @@ def mods_display(method, *args, default: nil)
end
end

module Constants
GEOWEBSERVICES = {
'Public': 'https://geowebservices.stanford.edu/geoserver',
'Restricted': 'https://geowebservices-restricted.stanford.edu/geoserver'
}
# Choose the geoserver base URL for a record: the 'geoserver.stan_url' setting
# when the record reports itself as Stanford-only, otherwise 'geoserver.pub_url'.
def geoserver_url(record)
  setting_key = record.stanford_only? ? 'geoserver.stan_url' : 'geoserver.pub_url'
  settings[setting_key]
end

# Stash the time processing of this record began on the context clipboard.
# The record itself is not needed here, hence the underscore-prefixed param.
each_record do |_record, context|
  context.clipboard[:benchmark_start_time] = Time.now
end

Expand All @@ -150,14 +173,16 @@ module Constants
end

# The identifier is the record's PURL, built from the 'purl.url' setting so the
# host can be overridden. Only one value is emitted.
to_field 'dc_identifier_s' do |record, accumulator|
  accumulator << "#{settings['purl.url']}/#{record.druid}"
end

# Skip records that have no public XML, and records whose content type is not
# one of the supported ones (collections are always allowed through).
each_record do |record, context|
  context.skip!('This item is in processing or does not exist') unless record.public_xml?
  # Supported content types and collections need no further checks.
  next if %w[image map book geo file].include?(record.dor_content_type) || record.is_collection

  context.skip!(
    "This content type: #{record.dor_content_type} is not supported"
  )
end

to_field 'dc_title_s', stanford_mods(:sw_short_title, default: '[Untitled]')
Expand All @@ -179,7 +204,7 @@ module Constants
end
end

to_field 'layer_geom_type_s', mods_xpath('mods:extension[@displayLabel="geo"]//dc:type') do |record, accumulator|
to_field 'layer_geom_type_s', mods_xpath('mods:extension[@displayLabel="geo"]//dc:type') do |_record, accumulator|
data = accumulator.flatten.select { |v| v.text =~ /#/ }.map { |v| v.text.split('#', 2).last }.slice(0..0)
data.map! { |v| GeoAuthorities.geometry_types.fetch(v, v) }
accumulator.replace(data)
Expand All @@ -196,12 +221,12 @@ module Constants
accumulator << record.public_xml_doc.root.attr('published')
end

# Reduce the matched mods:dateIssued nodes to the text of the first one
# (an empty accumulator stays empty).
to_field 'dct_issued_s', mods_xpath('mods:originInfo/mods:dateIssued') do |_record, accumulator|
  data = accumulator.flatten.map(&:text).slice(0..0)
  accumulator.replace(data)
end

# From the distinct dc:type texts in the geo extension, keep the portion before
# any '#' and emit only the first resulting value.
to_field 'dc_type_s', mods_xpath('mods:extension[@displayLabel="geo"]//dc:type') do |_record, accumulator|
  data = accumulator.flatten.map(&:text).uniq.map { |v| v.split('#', 2).first }.slice(0..0)
  accumulator.replace(data)
end
Expand All @@ -212,9 +237,11 @@ module Constants
accumulator << 'Image' if %w[image map book].include?(record.dor_content_type)
end

to_field 'dc_format_s', mods_xpath('mods:extension[@displayLabel="geo"]//dc:format') do |record, accumulator|
data = accumulator.flatten.map(&:text).select { |v| v =~ /format=/ }.map { |v| v.split('format=', 2).last }.slice(0..0)
if (data.present?)
to_field 'dc_format_s', mods_xpath('mods:extension[@displayLabel="geo"]//dc:format') do |_record, accumulator|
data = accumulator.flatten.map(&:text)
.select { |v| v =~ /format=/ }
.map { |v| v.split('format=', 2).last }.slice(0..0)
if data.present?
accumulator.replace(data.uniq.map { |v| GeoAuthorities.formats.fetch(v, v) })
else
accumulator.uniq!
Expand All @@ -230,10 +257,10 @@ module Constants
end

to_field 'dc_language_s', stanford_mods(:sw_language_facet), first_only
to_field 'dc_subject_sm', stanford_mods(:subject_other_search) do |record, accumulator|
accumulator.map! { |val| val.sub(/[\\,;]$/, '').strip if val }
to_field 'dc_subject_sm', stanford_mods(:subject_other_search) do |_record, accumulator|
accumulator.map! { |val| val&.sub(/[\\,;]$/, '')&.strip }
end
to_field 'dc_subject_sm', mods_xpath('mods:subject/mods:topic') do |record, accumulator|
to_field 'dc_subject_sm', mods_xpath('mods:subject/mods:topic') do |_record, accumulator|
accumulator.flatten!
accumulator.map! do |val|
if val.attr('authority') =~ /ISO19115topicCategory/i
Expand Down Expand Up @@ -264,50 +291,68 @@ module Constants

to_field 'layer_availability_score_f', literal(1.0)
to_field 'geoblacklight_version', literal('1.0')

# rubocop:disable Metrics/BlockLength
# rubocop:disable Layout/LineLength
# Build the JSON-encoded map of reference URLs for a record.
# PURL, stacks and geoserver hosts are read from settings ('purl.url',
# 'stacks.url', and 'geoserver.*' via geoserver_url) rather than hard-coded.
# Which references are added depends on the record's content type.
to_field 'dct_references_s' do |record, accumulator, context|
  references = {
    'http://schema.org/url' => "#{settings['purl.url']}/#{record.druid}",
    'http://www.loc.gov/mods/v3' => "#{settings['purl.url']}/#{record.druid}.mods"
  }
  case record.dor_content_type
  when 'file'
    references.merge!(
      {
        'https://oembed.com' => "#{settings['purl.url']}/embed.json?&hide_title=true&url=#{settings['purl.url']}/#{record.druid}"
      }
    )
  when 'image', 'map', 'book'
    references.merge!(
      {
        'https://oembed.com' => "#{settings['purl.url']}/embed.json?&hide_title=true&url=#{settings['purl.url']}/#{record.druid}",
        'http://iiif.io/api/presentation#manifest' => "#{settings['purl.url']}/#{record.druid}/iiif/manifest"
      }
    )
  when 'geo'
    # Formats already mapped into dc_format_s decide which OGC services are added.
    formats = context.output_hash['dc_format_s'] || []

    references.merge!(
      {
        'http://schema.org/downloadUrl' => "#{settings['stacks.url']}/file/druid:#{record.druid}/data.zip",
        'http://www.opengis.net/def/serviceType/ogc/wms' => "#{geoserver_url(record)}/wms",
        'http://www.isotc211.org/schemas/2005/gmd/' => "https://raw.githubusercontent.com/OpenGeoMetadata/edu.stanford.purl/master/#{record.druid_tree}/iso19139.xml"
      }
    )

    if formats.include?('Shapefile')
      references.merge!(
        {
          'http://www.opengis.net/def/serviceType/ogc/wfs' => "#{geoserver_url(record)}/wfs"
        }
      )
    elsif formats.include?('GeoTIFF') || formats.include?('ArcGRID')
      references.merge!(
        {
          'http://www.opengis.net/def/serviceType/ogc/wcs' => "#{geoserver_url(record)}/wcs"
        }
      )
    end

    # An index map reference is added only when the public XML lists an index_map.json file.
    index_map = !record.public_xml_doc.xpath('//file[@id="index_map.json"]').empty?

    if index_map
      references.merge!(
        {
          'https://openindexmaps.org' => "#{settings['stacks.url']}/file/druid:#{record.druid}/index_map.json"
        }
      )
    end
  end
  accumulator << references.to_json
end
# rubocop:enable Metrics/BlockLength
# rubocop:enable Layout/LineLength

to_field 'solr_geom', stanford_mods(:geo_extensions_as_envelope)
to_field 'solr_geom', stanford_mods(:coordinates_as_envelope)
to_field 'layer_slug_s' do |record, accumulator|
Expand Down Expand Up @@ -373,24 +418,25 @@ module Constants

# For 'geo' records only: emit one "stanford-<druid>" value per distinct
# collection the record belongs to. Nothing is emitted when collections is
# nil or empty.
to_field 'dc_source_sm' do |record, accumulator|
  next unless record.dor_content_type == 'geo'
  next unless record.collections&.any?

  record.collections.uniq.each do |collection|
    accumulator << "stanford-#{collection.druid}"
  end
end

# Titles of host related items: flatten the matched nodes, take their text,
# and drop duplicates (all in place on the accumulator).
to_field 'dct_isPartOf_sm',
         mods_xpath('mods:relatedItem[@type="host"]/mods:titleInfo/mods:title') do |_record, accumulator|
  accumulator.flatten!
  accumulator.map!(&:text)
  accumulator.uniq!
end

# Remember the label of every collection record, keyed by druid, in the
# global title cache. The context is not used here.
each_record do |record, _context|
  $druid_title_cache[record.druid] = record.label if record.is_collection
end

each_record do |record, context|
each_record do |_record, context|
context.output_hash.select { |k, _v| k =~ /_struct$/ }.each do |k, v|
context.output_hash[k] = Array(v).map { |x| JSON.generate(x) }
end
Expand All @@ -408,24 +454,23 @@ module Constants
end
end

# Log, at debug level, the elapsed time since :benchmark_start_time was placed
# on the clipboard for this record. The record itself is not used.
each_record do |_record, context|
  t0 = context.clipboard[:benchmark_start_time]
  t1 = Time.now

  logger.debug('geo_config.rb') { "Processed #{context.output_hash['id']} (#{t1 - t0}s)" }
end


# rubocop:disable Metrics/MethodLength
def trim_punctuation_when_preceded_by_two_word_characters_or_some_other_stuff(str)
previous_str = nil
until str == previous_str
previous_str = str

str = str.strip.gsub(/ *([,\/;:])$/, '')
.sub(/(\w\w)\.$/, '\1')
.sub(/(\p{L}\p{L})\.$/, '\1')
.sub(/(\w\p{InCombiningDiacriticalMarks}?\w\p{InCombiningDiacriticalMarks}?)\.$/, '\1')

str = str.strip.gsub(%r{ *([,/;:])$}, '')
.sub(/(\w\w)\.$/, '\1')
.sub(/(\p{L}\p{L})\.$/u, '\1')
.sub(/(\w\p{InCombiningDiacriticalMarks}?\w\p{InCombiningDiacriticalMarks}?)\.$/u, '\1')

# single square bracket characters if they are the start and/or end
# chars and there are no internal square brackets.
Expand All @@ -438,3 +483,7 @@ def trim_punctuation_when_preceded_by_two_word_characters_or_some_other_stuff(st

str
end
# rubocop:enable Metrics/MethodLength

# rubocop:enable Style/GlobalVars
# rubocop:enable Style/CombinableLoops
8 changes: 4 additions & 4 deletions spec/integration/geo_config_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def stub_mods_request(druid, body)
stub_purl_request(druid, File.read(file_fixture("#{druid}.xml").to_s))
end
it 'maps things to the right places' do
expect(result).to include 'dc_identifier_s' => ['http://purl.stanford.edu/dc482zx1528'],
expect(result).to include 'dc_identifier_s' => ['https://purl.stanford.edu/dc482zx1528'],
'dc_title_s' => ['Jōshū Kusatsu Onsenzu'],
'dc_rights_s' => ['Public'],
'layer_geom_type_s' => ['Image'],
Expand Down Expand Up @@ -110,7 +110,7 @@ def stub_mods_request(druid, body)
stub_purl_request(druid, File.read(file_fixture("#{druid}.xml").to_s))
end
it 'maps the metadata' do
expect(result).to include 'dc_identifier_s' => ['http://purl.stanford.edu/vv853br8653'],
expect(result).to include 'dc_identifier_s' => ['https://purl.stanford.edu/vv853br8653'],
'dc_title_s' => ['Abundance Estimates of the Pacific Salmon Conservation Assessment Database, 1978-2008'],
'dct_provenance_s' => ['Stanford'],
'layer_geom_type_s' => ['Polygon'],
Expand Down Expand Up @@ -231,7 +231,7 @@ def stub_mods_request(druid, body)
end

it 'has expected fields' do
expect(result).to include 'dc_identifier_s' => ['http://purl.stanford.edu/bq589tv8583'],
expect(result).to include 'dc_identifier_s' => ['https://purl.stanford.edu/bq589tv8583'],
'layer_geom_type_s' => ['Collection']
end
it 'does not include a layer_id_s' do
Expand All @@ -246,7 +246,7 @@ def stub_mods_request(druid, body)
end

it 'has expected fields' do
expect(result).to include 'dc_identifier_s' => ['http://purl.stanford.edu/pq479rm6462'],
expect(result).to include 'dc_identifier_s' => ['https://purl.stanford.edu/pq479rm6462'],
'dc_format_s' => ['Geodatabase'],
'layer_geom_type_s' => ['Mixed']
end
Expand Down

0 comments on commit b2175d2

Please sign in to comment.