Skip to content

Commit

Permalink
HYC-1391: List unmappable affiliations & Test HycIndexer (#765)
Browse files Browse the repository at this point in the history
  • Loading branch information
maxkadel authored Mar 14, 2022
1 parent 42d336c commit eabcd74
Show file tree
Hide file tree
Showing 10 changed files with 642 additions and 176 deletions.
7 changes: 7 additions & 0 deletions app/jobs/list_unmappable_affiliations_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Background job that writes the CSV report of affiliations which cannot be
# mapped by DepartmentsService.
class ListUnmappableAffiliationsJob < Hyrax::ApplicationJob
  queue_as(:long_running_jobs)

  # Hands the whole crawl-and-write operation over to HycCrawlerService.
  def perform
    HycCrawlerService.create_csv_of_umappable_affiliations
  end
end
2 changes: 1 addition & 1 deletion app/services/departments_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def self.identifier(term)
# Looks up the 'term' stored in the authority for the given id.
#
# @param id [Object] identifier passed straight to `authority.find`
# @return [Object, nil] the value stored under 'term', or nil when the lookup
#   raises any StandardError (for example, when the found entry has no 'term' key)
def self.label(id)
  authority.find(id).fetch('term')
rescue StandardError
  # Report the failed lookup once at warn level; the message is built a single
  # time so the logger and the console output can never drift apart.
  message = "DepartmentsService: cannot find '#{id}'"
  Rails.logger.warn message
  puts message # for migration log
  nil
end
Expand Down
74 changes: 74 additions & 0 deletions app/services/hyc_crawler_service.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Searches Solr in chunks for works that have person records, collects the
# affiliations attached to those people, and can write the affiliations that
# DepartmentsService cannot resolve to a CSV report.
module HycCrawlerService
  # @return [Array<Symbol>] attributes of a work that may hold person objects
  def self.person_fields
    [:advisors, :arrangers, :composers, :contributors, :creators, :project_directors,
     :researchers, :reviewers, :translators]
  end

  # @return [Array<Class>] work classes that are crawled for person affiliations
  def self.person_classes
    [Article, Artwork, DataSet, Dissertation, General, HonorsThesis, Journal, MastersPaper]
  end

  # Crawls every class in {person_classes}, passing the given block through to
  # {search_by_class}.
  #
  # @yield [id, url, affiliations] see {search_by_class}
  def self.crawl_for_affiliations(&block)
    Rails.logger.info('Beginning to search for affiliations')
    person_classes.each { |klass| search_by_class(klass, &block) }
  end

  # Finds every object of +klass+ with a person label in Solr and yields the
  # ones that have at least one affiliation.
  #
  # @param klass [Class] work class responding to `search_in_batches` and `find`
  # @yieldparam id [String] id of the object
  # @yieldparam url [String] url generated for the object
  # @yieldparam affiliations [Array<String>] sorted affiliations of all people on the object
  def self.search_by_class(klass)
    # search_in_batches returns RSolr::Response::PaginatedDocSet, each object in group is a hash of a solr response
    klass.search_in_batches('person_label_tesim:*') do |group|
      ids = group.map { |solr_doc| solr_doc['id'] }
      Rails.logger.info("Finding affiliations for group of #{klass} with ids: #{ids}")
      # Iterating purely for the side effect of yielding, so `each` rather than `map`
      ids.each do |id|
        object = klass.find(id)

        url = Rails.application.routes.url_helpers.url_for(object)
        # Sort so that we can test csv line more easily
        affiliations = all_person_affiliations(object).sort
        yield(id, url, affiliations) unless affiliations.empty?
      end
    end
  end

  # Ensures the reports directory exists and returns the path of the CSV inside it.
  #
  # @return [Pathname] location of the report
  # @raise [KeyError] when the DATA_STORAGE environment variable is not set
  def self.csv_file_path
    # ENV.fetch names the missing variable instead of failing later on a nil path segment
    csv_directory = Rails.root.join(ENV.fetch('DATA_STORAGE'), 'reports')
    FileUtils.mkdir_p(csv_directory)
    csv_directory.join('umappable_affiliations.csv')
  end

  # @return [Array<String>] header row of the report
  def self.csv_headers
    ['object_id', 'url', 'affiliations']
  end

  # Appends one row per object that has at least one affiliation
  # DepartmentsService cannot resolve. The file is opened in append mode, so the
  # header row is written only when the file is missing or empty; otherwise
  # every run would insert another header line in the middle of the data.
  def self.create_csv_of_umappable_affiliations
    path = csv_file_path
    # File.size? is nil both for a missing file and for a zero-length one
    needs_header = File.size?(path).nil?

    CSV.open(path, 'a+') do |csv|
      csv << csv_headers if needs_header
      crawl_for_affiliations do |document_id, url, affiliations|
        unmapped = unmappable_affiliations(affiliations)
        next if unmapped.empty?

        Rails.logger.debug("Saving object info to csv. url: #{url}")
        csv << [document_id, url, unmapped]
      end
    end
  end

  # Collects the affiliations of the people stored under one person field.
  #
  # @param object [Object] work that may respond to +person_type+
  # @param person_type [Symbol] one of {person_fields}
  # @return [Array, nil] non-empty affiliation values, or nil when the object
  #   has no people of that type
  def self.person_affiliations_by_type(object, person_type)
    people_object = object.try(person_type)
    return unless people_object && !people_object.empty?

    people_object
      .flat_map { |person| person.attributes['affiliation'].to_a }
      .reject { |affiliation| affiliation.to_s.empty? } # Remove nils and empty strings
  end

  # @param affiliations [Array<String>] affiliation identifiers
  # @return [Array<String>] the affiliations for which DepartmentsService.label
  #   returns nothing (nil entries are dropped)
  def self.unmappable_affiliations(affiliations)
    affiliations.reject { |affil| affil.nil? || DepartmentsService.label(affil) }
  end

  # @param object [Object] work to inspect
  # @return [Array<String>] affiliations gathered across every field in {person_fields}
  def self.all_person_affiliations(object)
    person_fields.flat_map { |field| person_affiliations_by_type(object, field) || [] }
  end
end
2 changes: 2 additions & 0 deletions app/services/tasks/proquest_ingest_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,8 @@ def proquest_metadata(metadata_file)
end

department = metadata.xpath('//DISS_description/DISS_institution/DISS_inst_contact').text.strip
# TODO: Currently, if the parsed department can't be mapped, the affiliation is just saved as whatever was parsed from the XML
# These affiliations then cannot be indexed to Solr or displayed, since they don't align with the controlled vocabulary
affiliation = ProquestDepartmentMappingsService.standard_department_name(department) || department

date_issued = metadata.xpath('//DISS_description/DISS_dates/DISS_comp_date').text
Expand Down
4 changes: 4 additions & 0 deletions lib/tasks/list_affiliations.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
desc 'Create a csv of affiliations that do not map to solr and their associated object ids'
task list_affiliations: :environment do
  # Enqueue the job instead of running the crawl inline in the rake process
  ListUnmappableAffiliationsJob.perform_later
end
Loading

0 comments on commit eabcd74

Please sign in to comment.