feat(logic): added strip_extra_white field
- Added `strip_extra_white` field and form fields.
- Used `strip_extra_white` to control whitespace stripping.
JVickery-TBS committed May 14, 2024
1 parent 54f87e0 commit 50080ea
Showing 5 changed files with 33 additions and 36 deletions.
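
In short: every datastore field can now carry a boolean strip_extra_white key (defaulting to True). It can be set from the Data Dictionary form or passed to datastore_create, is stored as xloader plugin data, and is read back by the loader through datastore_info to decide, column by column, whether to strip leading and trailing whitespace. A hedged sketch of that round trip (the resource id and field names are placeholders; exact call context depends on the CKAN site):

# Illustrative only: how the new per-field flag flows through the API,
# based on the schema and loader changes in this commit.
from ckan.plugins import toolkit

# Creating (or updating) the table: the flag is accepted on each field
# because update_datastore_create_schema() below adds it to the fields schema.
toolkit.get_action('datastore_create')({'ignore_auth': True}, {
    'resource_id': 'RESOURCE-ID',  # placeholder
    'force': True,
    'fields': [
        {'id': 'name', 'type': 'text', 'strip_extra_white': True},
        {'id': 'code', 'type': 'text', 'strip_extra_white': False},
    ],
})

# Loading: the flag comes back from datastore_info and gates stripping per column.
ds_info = toolkit.get_action('datastore_info')({'ignore_auth': True},
                                               {'id': 'RESOURCE-ID'})
for f in ds_info.get('fields', []):
    print(f['id'], f.get('strip_extra_white', True))
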
26 changes: 17 additions & 9 deletions ckanext/xloader/loader.py
@@ -177,10 +177,13 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
     existing = datastore_resource_exists(resource_id)
     existing_info = {}
     if existing:
-        existing_fields = existing.get('fields', [])
+        ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id})
+        existing_fields = ds_info.get('fields', [])
         existing_info = dict((f['id'], f['info'])
                              for f in existing_fields
                              if 'info' in f)
+        existing_fields_by_headers = dict((f['id'], f)
+                                          for f in existing_fields)
 
         # Column types are either set (overridden) in the Data Dictionary page
         # or default to text type (which is robust)
@@ -195,6 +198,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
         for f in fields:
             if f['id'] in existing_info:
                 f['info'] = existing_info[f['id']]
+                f['strip_extra_white'] = existing_fields_by_headers[f['id']].get('strip_extra_white', True)
 
         '''
         Delete or truncate existing datastore table before proceeding,
@@ -211,7 +215,8 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
     else:
         fields = [
             {'id': header_name,
-             'type': 'text',}
+             'type': 'text',
+             'strip_extra_white': True,}
             for header_name in headers]
 
     logger.info('Fields: %s', fields)
@@ -225,7 +230,7 @@ def strip_white_space_iter():
             for row in super_iter():
                 for _index, _cell in enumerate(row):
                     # only strip white space if strip_extra_white is True
-                    if fields[_index].get('info', {}).get('strip_extra_white', True) and isinstance(_cell, str):
+                    if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str):
                         row[_index] = _cell.strip()
                 yield row
         stream.iter = strip_white_space_iter
@@ -238,7 +243,7 @@ def strip_white_space_iter():
             for row in super_iter():
                 for _index, _cell in enumerate(row):
                     # only strip white space if strip_extra_white is True
-                    if fields[_index].get('info', {}).get('strip_extra_white', True) and isinstance(_cell, str):
+                    if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str):
                         row[_index] = _cell.strip()
                 yield row
         stream.iter = strip_white_space_iter
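
Both load_csv hunks above wrap the stream's iterator so stripping happens row by row as the data is read, with the per-column flag deciding which cells get trimmed. A standalone sketch of that wrapping pattern, with a stub standing in for the real stream object (illustrative, not the loader itself):

# Stand-alone sketch of the iterator wrapping used above; FakeStream is a stub,
# not the tabulator stream the loader actually works with.
class FakeStream:
    def __init__(self, rows):
        self._rows = rows
        self.iter = lambda: iter(self._rows)

fields = [{'id': 'name', 'strip_extra_white': True},
          {'id': 'code', 'strip_extra_white': False}]
stream = FakeStream([['  Alice  ', '  007  '], [' Bob', '008 ']])

super_iter = stream.iter

def strip_white_space_iter():
    for row in super_iter():
        for _index, _cell in enumerate(row):
            # strip only the columns that opted in (the default is to strip)
            if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str):
                row[_index] = _cell.strip()
        yield row

stream.iter = strip_white_space_iter
print(list(stream.iter()))  # [['Alice', '  007  '], ['Bob', '008 ']]
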
@@ -388,10 +393,13 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
     existing = datastore_resource_exists(resource_id)
     existing_info = None
     if existing:
-        existing_fields = existing.get('fields', [])
+        ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id})
+        existing_fields = ds_info.get('fields', [])
         existing_info = dict(
             (f['id'], f['info'])
             for f in existing_fields if 'info' in f)
+        existing_fields_by_headers = dict((f['id'], f)
+                                          for f in existing_fields)
 
     # Some headers might have been converted from strings to floats and such.
     headers = encode_headers(headers)
@@ -403,7 +411,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
 
     TYPES, TYPE_MAPPING = get_types()
     types = type_guess(stream.sample[1:], types=TYPES, strict=True)
-    info = []
+    fields = []
 
     # override with types user requested
     if existing_info:
@@ -415,11 +423,10 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
             }.get(existing_info.get(h, {}).get('type_override'), t)
             for t, h in zip(types, headers)]
         for h in headers:
-            info.append(existing_info.get(h, {}))
-
+            fields.append(existing_fields_by_headers.get(h, {}))
 
     headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()]
-    type_converter = TypeConverter(types=types, info=info)
+    type_converter = TypeConverter(types=types, fields=fields)
 
     with UnknownEncodingStream(table_filepath, file_format, decoding_result,
                                skip_rows=skip_rows,
@@ -440,6 +447,7 @@ def row_iterator():
         for h in headers_dicts:
             if h['id'] in existing_info:
                 h['info'] = existing_info[h['id']]
+                h['strip_extra_white'] = existing_fields_by_headers[h['id']].get('strip_extra_white', True)
                 # create columns with types user requested
                 type_override = existing_info[h['id']].get('type_override')
                 if type_override in list(_TYPE_MAPPING.values()):
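
In load_table the per-column flags travel through the new fields list, which is built in header order so that TypeConverter can look a cell's column up by position; columns with no existing datastore field fall back to an empty dict and therefore to the default of stripping. A small hedged sketch of that alignment (header names are illustrative):

# Illustrative alignment of headers -> fields, mirroring the load_table hunks above.
headers = ['name', 'code', 'brand_new_column']
existing_fields_by_headers = {
    'name': {'id': 'name', 'type': 'text', 'strip_extra_white': True},
    'code': {'id': 'code', 'type': 'text', 'strip_extra_white': False},
}

# New columns get an empty dict, so .get('strip_extra_white', True) defaults to stripping.
fields = [existing_fields_by_headers.get(h, {}) for h in headers]
print([f.get('strip_extra_white', True) for f in fields])  # [True, False, True]
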
8 changes: 4 additions & 4 deletions ckanext/xloader/parser.py
@@ -18,9 +18,9 @@ class TypeConverter:
     as desired.
     """
 
-    def __init__(self, types=None, info=None):
+    def __init__(self, types=None, fields=None):
         self.types = types
-        self.info = info
+        self.fields = fields
 
     def convert_types(self, extended_rows):
         """ Try converting cells to numbers or timestamps if applicable.
@@ -32,9 +32,9 @@ def convert_types(self, extended_rows):
             for cell_index, cell_value in enumerate(row):
                 if cell_value is None:
                     row[cell_index] = ''
-                if self.info:
+                if self.fields:
                     # only strip white space if strip_extra_white is True
-                    if self.info[cell_index].get('strip_extra_white', True) and isinstance(cell_value, str):
+                    if self.fields[cell_index].get('strip_extra_white', True) and isinstance(cell_value, str):
                         cell_value = cell_value.strip()
                         row[cell_index] = cell_value.strip()
                 if not cell_value:
19 changes: 10 additions & 9 deletions ckanext/xloader/plugin.py
@@ -10,7 +10,7 @@
 from ckan.model.resource import Resource
 from ckan.model.package import Package
 
-from . import action, auth, helpers as xloader_helpers, utils, validators
+from . import action, auth, helpers as xloader_helpers, utils
 from ckanext.xloader.utils import XLoaderFormats
 
 try:
@@ -35,7 +35,6 @@ class xloaderPlugin(plugins.SingletonPlugin):
     plugins.implements(plugins.IResourceController, inherit=True)
     plugins.implements(plugins.IClick)
     plugins.implements(plugins.IBlueprint)
-    plugins.implements(plugins.IValidators)
     plugins.implements(IDataDictionaryForm, inherit=True)
 
     # IClick
@@ -210,18 +209,20 @@ def get_helpers(self):
             "is_resource_supported_by_xloader": xloader_helpers.is_resource_supported_by_xloader,
         }
 
-    # IValidators
-
-    def get_validators(self):
-        return {'xloader_datastore_fields_validator': validators.datastore_fields_validator}
-
     # IDataDictionaryForm
 
     def update_datastore_create_schema(self, schema):
-        info_validator = toolkit.get_validator('xloader_datastore_fields_validator')
-        schema['fields']['info'] = [info_validator] + schema['fields']['info']
+        default = toolkit.get_validator('default')
+        boolean_validator = toolkit.get_validator('boolean_validator')
+        to_datastore_plugin_data = toolkit.get_validator('to_datastore_plugin_data')
+        schema['fields']['strip_extra_white'] = [default(True), boolean_validator, to_datastore_plugin_data('xloader')]
         return schema
+
+    def update_datastore_info_field(self, field, plugin_data):
+        # expose all our non-secret plugin data in the field
+        field.update(plugin_data.get('xloader', {}))
+        return field
 
 
 def _should_remove_unsupported_resource_from_datastore(res_dict):
     if not toolkit.asbool(toolkit.config.get('ckanext.xloader.clean_datastore_tables', False)):
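
The plugin now registers strip_extra_white directly in the datastore_create fields schema instead of validating it inside each field's info dict: default(True) fills the value in when it is omitted, boolean_validator coerces submitted values, and to_datastore_plugin_data('xloader') stores it as datastore plugin data so datastore_info can hand it back. A tiny sketch of what update_datastore_info_field does with that stored data (values are illustrative):

# Illustrative: merging stored xloader plugin data back into a field record,
# as update_datastore_info_field() above does for datastore_info responses.
field = {'id': 'code', 'type': 'text'}
plugin_data = {'xloader': {'strip_extra_white': False}}

field.update(plugin_data.get('xloader', {}))
print(field)  # {'id': 'code', 'type': 'text', 'strip_extra_white': False}
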
4 changes: 2 additions & 2 deletions (data dictionary form template snippet)
@@ -3,9 +3,9 @@
 
 {% block additional_fields %}
   {{ super() }}
-  {{ form.select('info__' ~ position ~ '__strip_extra_white',
+  {{ form.select('fields__' ~ position ~ '__strip_extra_white',
     label=_('Strip Extra Leading and Trailing White Space'), options=[
     {'text': 'Yes', 'value': true},
     {'text': 'No', 'value': false},
-    ], selected=field.get('info', {}).get('strip_extra_white')) }}
+    ], selected=field.get('strip_extra_white')) }}
 {% endblock %}
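
The form field is renamed from info__<position>__strip_extra_white to fields__<position>__strip_extra_white so the submitted value lands on the field itself rather than inside its info dict. A hypothetical illustration of how such flattened form names map onto per-field keys (the real unflattening is done by CKAN's data dictionary form handling, not by this snippet):

# Hypothetical illustration only: flattened form names -> per-field keys.
posted = {
    'fields__0__strip_extra_white': 'true',
    'fields__1__strip_extra_white': 'false',
}
fields = [{}, {}]
for key, value in sorted(posted.items()):
    _prefix, position, attr = key.split('__', 2)
    fields[int(position)][attr] = value
print(fields)  # [{'strip_extra_white': 'true'}, {'strip_extra_white': 'false'}]

On the server side, the boolean_validator added in the schema change above then coerces those posted strings to booleans.
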
12 changes: 0 additions & 12 deletions ckanext/xloader/validators.py

This file was deleted.
