Skip to content

Commit

Permalink
feat(logic): added strip_extra_white info field;
Browse files Browse the repository at this point in the history
- Added `strip_extra_white` info field and form fields.
- Added validator for `strip_extra_white`.
- Used `strip_extra_white` to control stripping white space.
  • Loading branch information
JVickery-TBS committed May 14, 2024
1 parent a6ab0a0 commit 54f87e0
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 42 deletions.
64 changes: 35 additions & 29 deletions ckanext/xloader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,33 +170,6 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('Ensuring character coding is UTF8')
f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False)
try:
save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
try:
with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
skip_rows=skip_rows) as stream:
super_iter = stream.iter
def strip_white_space_iter():
for row in super_iter():
for _index, _cell in enumerate(row):
if isinstance(_cell, str):
row[_index] = _cell.strip()
yield row
stream.iter = strip_white_space_iter
stream.save(**save_args)
except (EncodingError, UnicodeDecodeError):
with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING,
skip_rows=skip_rows) as stream:
super_iter = stream.iter
def strip_white_space_iter():
for row in super_iter():
for _index, _cell in enumerate(row):
if isinstance(_cell, str):
row[_index] = _cell.strip()
yield row
stream.iter = strip_white_space_iter
stream.save(**save_args)
csv_filepath = f_write.name

# datastore db connection
engine = get_write_engine()

Expand Down Expand Up @@ -238,11 +211,40 @@ def strip_white_space_iter():
else:
fields = [
{'id': header_name,
'type': 'text'}
'type': 'text',}
for header_name in headers]

logger.info('Fields: %s', fields)

save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
try:
with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
skip_rows=skip_rows) as stream:
super_iter = stream.iter
def strip_white_space_iter():
    """Yield each row from ``super_iter()`` with string cells stripped.

    A string cell is stripped unless the entry of ``fields`` at the same
    position carries ``info['strip_extra_white']`` set to a falsy value.
    Cells that have no matching ``fields`` entry (a row wider than the
    field list) use the default and are stripped instead of raising
    ``IndexError``. Rows are modified in place and then yielded.
    """
    # The per-column flags are the same for every row, so resolve them once.
    # ``or {}`` also tolerates a field whose 'info' is present but None.
    strip_flags = [
        (field.get('info') or {}).get('strip_extra_white', True)
        for field in fields
    ]
    known_columns = len(strip_flags)
    for row in super_iter():
        for _index, _cell in enumerate(row):
            if not isinstance(_cell, str):
                continue
            # only strip white space if strip_extra_white is True;
            # columns beyond the known fields default to True
            if _index >= known_columns or strip_flags[_index]:
                row[_index] = _cell.strip()
        yield row
stream.iter = strip_white_space_iter
stream.save(**save_args)
except (EncodingError, UnicodeDecodeError):
with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING,
skip_rows=skip_rows) as stream:
super_iter = stream.iter
def strip_white_space_iter():
    """Yield rows from ``super_iter()``, stripping string cells per column.

    Each string cell is stripped unless the ``fields`` entry at the same
    index has ``info['strip_extra_white']`` set to a falsy value. When a
    row holds more cells than there are ``fields`` entries, the surplus
    cells are stripped (the default) rather than triggering ``IndexError``.
    The row object is updated in place before being yielded.
    """
    # Column settings do not vary per row; look them up a single time.
    # A field with 'info' set to None is treated like a field with no info.
    should_strip = []
    for field in fields:
        column_info = field.get('info') or {}
        should_strip.append(column_info.get('strip_extra_white', True))

    for row in super_iter():
        for _index, _cell in enumerate(row):
            # only strip white space if strip_extra_white is True;
            # cells past the end of ``fields`` keep the default of True
            wants_strip = should_strip[_index] if _index < len(should_strip) else True
            if wants_strip and isinstance(_cell, str):
                row[_index] = _cell.strip()
        yield row
stream.iter = strip_white_space_iter
stream.save(**save_args)
csv_filepath = f_write.name

# Create table
from ckan import model
context = {'model': model, 'ignore_auth': True}
Expand Down Expand Up @@ -401,6 +403,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):

TYPES, TYPE_MAPPING = get_types()
types = type_guess(stream.sample[1:], types=TYPES, strict=True)
info = []

# override with types user requested
if existing_info:
Expand All @@ -411,9 +414,12 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
'timestamp': datetime.datetime,
}.get(existing_info.get(h, {}).get('type_override'), t)
for t, h in zip(types, headers)]
for h in headers:
info.append(existing_info.get(h, {}))


headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()]
type_converter = TypeConverter(types=types)
type_converter = TypeConverter(types=types, info=info)

with UnknownEncodingStream(table_filepath, file_format, decoding_result,
skip_rows=skip_rows,
Expand Down
13 changes: 7 additions & 6 deletions ckanext/xloader/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ class TypeConverter:
as desired.
"""

def __init__(self, types=None):
def __init__(self, types=None, info=None):
    """Store the per-column settings used later by the converter.

    :param types: optional sequence indexed by cell position
    :param info: optional sequence of per-column dicts, indexed by cell
        position
    """
    self.types, self.info = types, info

def convert_types(self, extended_rows):
""" Try converting cells to numbers or timestamps if applicable.
Expand All @@ -31,11 +32,11 @@ def convert_types(self, extended_rows):
for cell_index, cell_value in enumerate(row):
if cell_value is None:
row[cell_index] = ''
if isinstance(cell_value, str):
# strip white space around cell values
#TODO: condition behind DataDictionary option??
cell_value = cell_value.strip()
row[cell_index] = cell_value.strip()
if self.info:
# only strip white space if strip_extra_white is True
if self.info[cell_index].get('strip_extra_white', True) and isinstance(cell_value, str):
cell_value = cell_value.strip()
row[cell_index] = cell_value.strip()
if not cell_value:
continue
cell_type = self.types[cell_index] if self.types else None
Expand Down
17 changes: 16 additions & 1 deletion ckanext/xloader/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@

from ckan import plugins
from ckan.plugins import toolkit
from ckanext.datastore.interfaces import IDataDictionaryForm

from ckan.model.domain_object import DomainObjectOperation
from ckan.model.resource import Resource
from ckan.model.package import Package

from . import action, auth, helpers as xloader_helpers, utils
from . import action, auth, helpers as xloader_helpers, utils, validators
from ckanext.xloader.utils import XLoaderFormats

try:
Expand All @@ -34,6 +35,8 @@ class xloaderPlugin(plugins.SingletonPlugin):
plugins.implements(plugins.IResourceController, inherit=True)
plugins.implements(plugins.IClick)
plugins.implements(plugins.IBlueprint)
plugins.implements(plugins.IValidators)
plugins.implements(IDataDictionaryForm, inherit=True)

# IClick
def get_commands(self):
Expand Down Expand Up @@ -207,6 +210,18 @@ def get_helpers(self):
"is_resource_supported_by_xloader": xloader_helpers.is_resource_supported_by_xloader,
}

# IValidators

def get_validators(self):
    """Return the validators registered by this plugin, keyed by name."""
    registered = {}
    registered['xloader_datastore_fields_validator'] = validators.datastore_fields_validator
    return registered

# IDataDictionaryForm

def update_datastore_create_schema(self, schema):
    """Put the xloader info validator first in ``schema['fields']['info']``.

    The existing validators for ``info`` are kept and run after it. The
    same ``schema`` object is updated and returned.
    """
    xloader_validator = toolkit.get_validator('xloader_datastore_fields_validator')
    existing_validators = schema['fields']['info']
    # Build a new list so the original validator list object is not mutated.
    schema['fields']['info'] = [xloader_validator] + existing_validators
    return schema


def _should_remove_unsupported_resource_from_datastore(res_dict):
if not toolkit.asbool(toolkit.config.get('ckanext.xloader.clean_datastore_tables', False)):
Expand Down
11 changes: 11 additions & 0 deletions ckanext/xloader/templates/datastore/snippets/dictionary_form.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{% ckan_extends %}
{% import 'macros/form.html' as form %}

{% block additional_fields %}
  {{ super() }}
  {# Per-column choice stored as info.strip_extra_white; "Yes" is pre-selected when nothing is stored. #}
  {{ form.select('info__' ~ position ~ '__strip_extra_white',
    label=_('Strip Extra Leading and Trailing White Space'), options=[
      {'text': _('Yes'), 'value': true},
      {'text': _('No'), 'value': false},
    ], selected=field.get('info', {}).get('strip_extra_white', true)) }}
{% endblock %}
12 changes: 6 additions & 6 deletions ckanext/xloader/tests/test_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def test_boston_311(self, Session):
None,
u"ONTIME",
u"Open",
u" ",
None, # " " transforms to None
u"Street Light Outages",
u"Public Works Department",
u"Street Lights",
Expand Down Expand Up @@ -259,14 +259,14 @@ def test_boston_311(self, Session):
None,
u"ONTIME",
u"Open",
u" ",
None, # " " transforms to None
u"Graffiti Removal",
u"Property Management",
u"Graffiti",
u"Graffiti Removal",
u"PROP_GRAF_GraffitiRemoval",
u"PROP",
u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg",
u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # strip white spaces
None,
u"522 Saratoga St East Boston MA 02128",
u"1",
Expand All @@ -291,14 +291,14 @@ def test_boston_311(self, Session):
None,
u"ONTIME",
u"Open",
u" ",
None, # " " transforms to None
u"Graffiti Removal",
u"Property Management",
u"Graffiti",
u"Graffiti Removal",
u"PROP_GRAF_GraffitiRemoval",
u"PROP",
u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg",
u"https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", # strip white spaces
None,
u"965 Bennington St East Boston MA 02128",
u"1",
Expand Down Expand Up @@ -1088,7 +1088,7 @@ def test_boston_311(self, Session):
u"Graffiti Removal",
u"PROP_GRAF_GraffitiRemoval",
u"PROP",
u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # strip white spaces
u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # strip white spaces
u"",
u"522 Saratoga St East Boston MA 02128",
Decimal("1"),
Expand Down
12 changes: 12 additions & 0 deletions ckanext/xloader/validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from ckan.plugins.toolkit import asbool


def datastore_fields_validator(value, context):
    """Normalise the ``strip_extra_white`` entry of a field ``info`` dict.

    :param value: the value being validated; expected to be a dict
    :param context: validator context (not used here)
    :returns: ``value`` itself. For a dict, ``strip_extra_white`` is set
        to ``asbool`` of its current value, or to ``True`` when the key is
        absent. Any non-dict value is returned unchanged.

    NOTE(review): ``asbool`` presumably raises for strings it does not
    recognise as booleans -- confirm how callers should surface that.
    """
    if not isinstance(value, dict):
        # Nothing to normalise (e.g. no info supplied). The membership test
        # below would raise TypeError on such values, so hand them back
        # untouched for the remaining validators to deal with.
        return value

    # default to True when absent; always store a real bool
    value['strip_extra_white'] = asbool(value.get('strip_extra_white', True))

    return value

0 comments on commit 54f87e0

Please sign in to comment.