Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Strip out all special characters from csv headers #915

Closed
wants to merge 11 commits into from
72 changes: 72 additions & 0 deletions app/assets/javascripts/bulkrax/datatables.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
Blacklight.onLoad(function() {
if($('#importer-show-table').length) {
$('#importer-show-table').DataTable( {
'processing': true,
'serverSide': true,
"ajax": window.location.href.replace(/(\/importers\/\d+)/, "$1/entry_table.json"),
"pageLength": 30,
"lengthMenu": [[30, 100, 200], [30, 100, 200]],
"columns": [
{ "data": "identifier" },
{ "data": "id" },
{ "data": "status_message" },
{ "data": "type" },
{ "data": "updated_at" },
{ "data": "errors", "orderable": false },
{ "data": "actions", "orderable": false }
],
"dom": '<"toolbar">frtip',
initComplete: function () {
// Add entry class filter
let entrySelect = document.createElement('select')
entrySelect.id = 'entry-filter'
entrySelect.classList.value = 'form-control input-sm'
entrySelect.style.marginRight = '10px'

entrySelect.add(new Option('Filter by Entry Class', ''))
// Read the options from the footer and add them to the entrySelect
$('#importer-entry-classes').text().split('|').forEach(function (col, i) {
entrySelect.add(new Option(col.trim()))
})
document.querySelector('div#importer-show-table_filter').firstChild.prepend(entrySelect)

// Apply listener for user change in value
entrySelect.addEventListener('change', function () {
var val = entrySelect.value;
this.api()
.search(val ? val : '', false, false)
.draw();
}.bind(this));

// Add status filter
let statusSelect = document.createElement('select');
statusSelect.id = 'status-filter'
statusSelect.classList.value = 'form-control input-sm'
statusSelect.style.marginRight = '10px'

statusSelect.add(new Option('Filter by Status', ''));
statusSelect.add(new Option('Complete'))
statusSelect.add(new Option('Pending'))
statusSelect.add(new Option('Failed'))
document.querySelector('div#importer-show-table_filter').firstChild.prepend(statusSelect)

// Apply listener for user change in value
statusSelect.addEventListener('change', function () {
var val = statusSelect.value;
this.api()
.search(val ? val : '', false, false)
.draw();
}.bind(this));

// Add refresh link
let refreshLink = document.createElement('a');
refreshLink.onclick = function() {
this.api().ajax.reload(null, false)
}.bind(this)
refreshLink.classList.value = 'glyphicon glyphicon-refresh'
refreshLink.style.marginLeft = '10px'
document.querySelector('div#importer-show-table_filter').firstChild.append(refreshLink)
}
} );
}
})
17 changes: 12 additions & 5 deletions app/controllers/bulkrax/importers_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,14 @@ class ImportersController < ::Bulkrax::ApplicationController
include Hyrax::ThemedLayoutController if defined?(::Hyrax)
include Bulkrax::DownloadBehavior
include Bulkrax::API
include Bulkrax::DatatablesBehavior
include Bulkrax::ValidationHelper

protect_from_forgery unless: -> { api_request? }
before_action :token_authenticate!, if: -> { api_request? }, only: [:create, :update, :delete]
before_action :authenticate_user!, unless: -> { api_request? }
before_action :check_permissions
before_action :set_importer, only: [:show, :edit, :update, :destroy]
before_action :set_importer, only: [:show, :entry_table, :edit, :update, :destroy]
with_themed_layout 'dashboard' if defined?(::Hyrax)

# GET /importers
Expand All @@ -34,9 +35,15 @@ def show
add_importer_breadcrumbs
add_breadcrumb @importer.name
end
@work_entries = @importer.entries.where(type: @importer.parser.entry_class.to_s).page(params[:work_entries_page]).per(30)
@collection_entries = @importer.entries.where(type: @importer.parser.collection_entry_class.to_s).page(params[:collections_entries_page]).per(30)
@file_set_entries = @importer.entries.where(type: @importer.parser.file_set_entry_class.to_s).page(params[:file_set_entries_page]).per(30)
@first_entry = @importer.entries.first
end

def entry_table
@entries = @importer.entries.order(table_order).page(table_page).per(table_per_page)
@entries = @entries.where(table_search) if table_search.present?
respond_to do |format|
format.json { render json: format_entries(@entries, @importer) }
end
end

# GET /importers/new
Expand Down Expand Up @@ -210,7 +217,7 @@ def files_for_import(file, cloud_files)

# Use callbacks to share common setup or constraints between actions.
def set_importer
@importer = Importer.find(params[:id])
@importer = Importer.find(params[:id] || params[:importer_id])
end

def importable_params
Expand Down
72 changes: 72 additions & 0 deletions app/controllers/concerns/bulkrax/datatables_behavior.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# frozen_string_literal: true

module Bulkrax
module DatatablesBehavior
extend ActiveSupport::Concern

def table_per_page
per_page = params[:length].to_i
per_page < 1 ? 30 : per_page
end

def order_value(column)
params['columns']&.[](column)&.[]('data')
end

def table_order
"#{order_value(params['order']['0']['column'])} #{params['order']['0']['dir']}" if params['order']['0']['column'].present?
end

# convert offset to page number
def table_page
params[:start].blank? ? 1 : (params[:start].to_i / params[:length].to_i) + 1
end

def table_search
return @table_search if @table_search
return @table_search = false if params['search']&.[]('value').blank?

table_search_value = params['search']&.[]('value')
@table_search = ['identifier', 'id', 'status_message', 'type', 'updated_at'].map do |col|
"cast(bulkrax_entries.#{col} as varchar(255)) ILIKE '%#{table_search_value}%'"
end.join(' OR ')
end

def format_entries(entries, item)
result = entries.map do |e|
{
identifier: view_context.link_to(e.identifier, view_context.item_entry_path(item, e)),
id: e.id,
status_message: entry_status(e),
type: e.type,
updated_at: e.updated_at,
errors: e.latest_status&.error_class&.present? ? view_context.link_to(e.latest_status.error_class, view_context.item_entry_path(item, e), title: e.latest_status.error_message) : "",
actions: util_links(e, item)
}
end
{
data: result,
recordsTotal: item.entries.size,
recordsFiltered: item.entries.size
}
end

def util_links(e, item)
links = []
links << view_context.link_to(view_context.raw('<span class="glyphicon glyphicon-info-sign"></span>'), view_context.item_entry_path(item, e))
links << "<a class='glyphicon glyphicon-repeat' data-toggle='modal' data-target='#bulkraxItemModal' data-entry-id='#{e.id}'></a>" if view_context.an_importer?(item)
links << view_context.link_to(view_context.raw('<span class="glyphicon glyphicon-trash"></span>'), view_context.item_entry_path(item, e), method: :delete, data: { confirm: 'This will delete the entry and any work associated with it. Are you sure?' })
links.join(" ")
end

def entry_status(e)
if e.status_message == "Complete"
"<td><span class='glyphicon glyphicon-ok' style='color: green;'></span> #{e.status_message}</td>"
elsif e.status_message == "Pending"
"<td><span class='glyphicon glyphicon-option-horizontal' style='color: blue;'></span> #{e.status_message}</td>"
else
"<td><span class='glyphicon glyphicon-remove' style='color: #{e.status == 'Deleted' ? 'green' : 'red'};'></span> #{e.status_message}</td>"
end
end
end
end
6 changes: 3 additions & 3 deletions app/jobs/bulkrax/importer_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,18 @@ class ImporterJob < ApplicationJob

def perform(importer_id, only_updates_since_last_import = false)
importer = Importer.find(importer_id)
importer.only_updates = only_updates_since_last_import || false

importer.current_run
unzip_imported_file(importer.parser)
import(importer, only_updates_since_last_import)
import(importer)
update_current_run_counters(importer)
schedule(importer) if importer.schedulable?
rescue ::CSV::MalformedCSVError => e
importer.set_status_info(e)
end

def import(importer, only_updates_since_last_import)
importer.only_updates = only_updates_since_last_import || false
def import(importer)
return unless importer.valid_import?

importer.import_objects
Expand Down
3 changes: 2 additions & 1 deletion app/models/bulkrax/csv_entry.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@ def self.fields_from_data(data)
class_attribute(:csv_read_data_options, default: {})

# there's a risk that this reads the whole file into memory and could cause a memory leak
# we strip any special characters out of the headers. looking at you Excel
def self.read_data(path)
raise StandardError, 'CSV path empty' if path.blank?
options = {
headers: true,
header_converters: ->(h) { h.to_s.strip.to_sym },
header_converters: ->(h) { h.to_s.gsub(/[^\w\d -]+/, '').strip.to_sym },
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want underscore as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jeremyf \w includes underscore.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Today I (re-)learned!

encoding: 'utf-8'
}.merge(csv_read_data_options)

Expand Down
12 changes: 4 additions & 8 deletions app/models/bulkrax/importer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -123,16 +123,16 @@ def last_run
@last_run ||= self.importer_runs.last
end

def failed_entries?
entries.failed.any?
end

def failed_statuses
@failed_statuses ||= Bulkrax::Status.latest_by_statusable
.includes(:statusable)
.where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', self.entries.pluck(:id), 'Bulkrax::Entry', 'Failed')
end

def failed_entries
@failed_entries ||= failed_statuses.map(&:statusable)
end

def failed_messages
failed_statuses.each_with_object({}) do |e, i|
i[e.error_message] ||= []
Expand All @@ -146,10 +146,6 @@ def completed_statuses
.where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', self.entries.pluck(:id), 'Bulkrax::Entry', 'Complete')
end

def completed_entries
@completed_entries ||= completed_statuses.map(&:statusable)
end

def seen
@seen ||= {}
end
Expand Down
6 changes: 5 additions & 1 deletion app/models/bulkrax/status.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

module Bulkrax
class Status < ApplicationRecord
belongs_to :statusable, polymorphic: true
belongs_to :statusable, polymorphic: true, denormalize: { fields: %i[status_message], if: :latest? }
belongs_to :runnable, polymorphic: true
serialize :error_backtrace, Array

Expand All @@ -21,5 +21,9 @@ def self.latest_by_statusable_subtable
status_table.join(latest_status_query.as(latest_status_table.name.to_s), Arel::Nodes::InnerJoin)
.on(status_table[:id].eq(latest_status_table[:latest_status_id]))
end

def latest?
self.id == self.class.where(statusable_id: self.statusable_id, statusable_type: self.statusable_type).order('id desc').pick(:id)
end
end
end
40 changes: 12 additions & 28 deletions app/models/concerns/bulkrax/errored_entries.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,42 +4,26 @@ module Bulkrax
module ErroredEntries
extend ActiveSupport::Concern

def errored_entries
@errored_entries ||= importerexporter.entries.failed
end

def write_errored_entries_file
if @errored_entries.blank?
entry_ids = importerexporter.entries.pluck(:id)
error_statuses = Bulkrax::Status.latest_by_statusable
.includes(:statusable)
.where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', entry_ids, 'Bulkrax::Entry', 'Failed')
@errored_entries = error_statuses.map(&:statusable)
end
return if @errored_entries.blank?
return if errored_entries.blank?

file = setup_errored_entries_file
headers = import_fields
file.puts(headers.to_csv)
@errored_entries.each do |ee|
row = build_errored_entry_row(headers, ee)
file.puts(row)
setup_errored_entries_file do |csv|
errored_entries.find_each do |ee|
csv << ee.raw_metadata
end
end
file.close
true
end

def build_errored_entry_row(headers, errored_entry)
row = {}
# Ensure each header has a value, even if it's just an empty string
headers.each do |h|
row.merge!("#{h}": nil)
end
# Match each value to its corresponding header
row.merge!(errored_entry.raw_metadata.symbolize_keys)

row.values.to_csv
end

def setup_errored_entries_file
FileUtils.mkdir_p(File.dirname(importerexporter.errored_entries_csv_path))
File.open(importerexporter.errored_entries_csv_path, 'w')
CSV.open(importerexporter.errored_entries_csv_path, 'wb', headers: import_fields.map(&:to_s), write_headers: true) do |csv|
yield csv
end
end
end
end
3 changes: 3 additions & 0 deletions app/models/concerns/bulkrax/status_info.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ module StatusInfo
as: :statusable,
class_name: "Bulkrax::Status",
inverse_of: :statusable
scope :failed, -> { where(status_message: 'Failed') }
scope :complete, -> { where(status_message: 'Complete') }
scope :pending, -> { where(status_message: 'Pending') }
end

def current_status
Expand Down
22 changes: 18 additions & 4 deletions app/parsers/bulkrax/csv_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,18 @@ def self.export_supported?
true
end

def file_for_import
@file_for_import ||= only_updates ? parser_fields['partial_import_file_path'] : import_file_path
end

def csv_data
@csv_data ||= entry_class.read_data(file_for_import)
end

def records(_opts = {})
return @records if @records.present?

file_for_import = only_updates ? parser_fields['partial_import_file_path'] : import_file_path
# data for entry does not need source_identifier for csv, because csvs are read sequentially and mapped after raw data is read.
csv_data = entry_class.read_data(file_for_import)
importer.parser_fields['total'] = csv_data.count
importer.save

Expand Down Expand Up @@ -80,9 +86,17 @@ def file_sets_total
file_sets.size
end

# We could use CsvEntry#fields_from_data(data) but that would mean re-reading the data
# Prefer the csv data if it is already loaded
# If the csv data is not loaded and there are no records yet, we need to load the csv data (which is memoized)
# If the csv data is not loaded and there are records, then reading the records form memory is faster than rereading the csv
def import_fields
@import_fields ||= records.inject(:merge).keys.compact.uniq
return @import_fields if @import_fields.present?
# this has to be the instance variable because calling the method will load it no matter what
@import_fields = if @csv_data.present? || @records.blank?
csv_data.headers
else
records.inject(:merge).keys.compact.uniq
end
end

def required_elements?(record)
Expand Down
Loading
Loading