feat(logic): added strip_extra_white info field

- Added `strip_extra_white` info field and form fields.
- Added validator for `strip_extra_white`.
- Used `strip_extra_white` to control stripping white space.
ckan · May 14, 2024 · 54f87e0 · 54f87e0
1 parent a6ab0a0
commit 54f87e0
Show file tree

Hide file tree

Showing 6 changed files with 87 additions and 42 deletions.
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
@@ -170,33 +170,6 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
     logger.info('Ensuring character coding is UTF8')
     f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False)
     try:
-        save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
-        try:
-            with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
-                                       skip_rows=skip_rows) as stream:
-                super_iter = stream.iter
-                def strip_white_space_iter():
-                    for row in super_iter():
-                        for _index, _cell in enumerate(row):
-                            if isinstance(_cell, str):
-                                row[_index] = _cell.strip()
-                        yield row
-                stream.iter = strip_white_space_iter
-                stream.save(**save_args)
-        except (EncodingError, UnicodeDecodeError):
-            with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING,
-                        skip_rows=skip_rows) as stream:
-                super_iter = stream.iter
-                def strip_white_space_iter():
-                    for row in super_iter():
-                        for _index, _cell in enumerate(row):
-                            if isinstance(_cell, str):
-                                row[_index] = _cell.strip()
-                        yield row
-                stream.iter = strip_white_space_iter
-                stream.save(**save_args)
-        csv_filepath = f_write.name
-
         # datastore db connection
         engine = get_write_engine()
 
@@ -238,11 +211,40 @@ def strip_white_space_iter():
         else:
             fields = [
                 {'id': header_name,
-                 'type': 'text'}
+                 'type': 'text',}
                 for header_name in headers]
 
         logger.info('Fields: %s', fields)
 
+        save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
+        try:
+            with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
+                                       skip_rows=skip_rows) as stream:
+                super_iter = stream.iter
+                def strip_white_space_iter():
+                    for row in super_iter():
+                        for _index, _cell in enumerate(row):
+                            # only strip white space if strip_extra_white is True
+                            if fields[_index].get('info', {}).get('strip_extra_white', True) and isinstance(_cell, str):
+                                row[_index] = _cell.strip()
+                        yield row
+                stream.iter = strip_white_space_iter
+                stream.save(**save_args)
+        except (EncodingError, UnicodeDecodeError):
+            with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING,
+                        skip_rows=skip_rows) as stream:
+                super_iter = stream.iter
+                def strip_white_space_iter():
+                    for row in super_iter():
+                        for _index, _cell in enumerate(row):
+                            # only strip white space if strip_extra_white is True
+                            if fields[_index].get('info', {}).get('strip_extra_white', True) and isinstance(_cell, str):
+                                row[_index] = _cell.strip()
+                        yield row
+                stream.iter = strip_white_space_iter
+                stream.save(**save_args)
+        csv_filepath = f_write.name
+
         # Create table
         from ckan import model
         context = {'model': model, 'ignore_auth': True}
@@ -401,6 +403,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
 
     TYPES, TYPE_MAPPING = get_types()
     types = type_guess(stream.sample[1:], types=TYPES, strict=True)
+    info = []
 
     # override with types user requested
     if existing_info:
@@ -411,9 +414,12 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
                 'timestamp': datetime.datetime,
             }.get(existing_info.get(h, {}).get('type_override'), t)
             for t, h in zip(types, headers)]
+        for h in headers:
+            info.append(existing_info.get(h, {}))
+
 
     headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()]
-    type_converter = TypeConverter(types=types)
+    type_converter = TypeConverter(types=types, info=info)
 
     with UnknownEncodingStream(table_filepath, file_format, decoding_result,
                                skip_rows=skip_rows,

diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py
@@ -18,8 +18,9 @@ class TypeConverter:
     as desired.
     """
 
-    def __init__(self, types=None):
+    def __init__(self, types=None, info=None):
+        """Store the per-column settings that ``convert_types`` reads.
+
+        :param types: sequence indexed by cell position giving the type
+            to use for that column, or None
+        :param info: sequence indexed by cell position of dicts whose
+            ``strip_extra_white`` key is consulted before a string cell
+            is stripped, or None
+        """
         self.types = types
+        self.info = info
 
     def convert_types(self, extended_rows):
         """ Try converting cells to numbers or timestamps if applicable.
@@ -31,11 +32,11 @@ def convert_types(self, extended_rows):
             for cell_index, cell_value in enumerate(row):
                 if cell_value is None:
                     row[cell_index] = ''
-                if isinstance(cell_value, str):
-                    # strip white space around cell values
-                    #TODO: condition behind DataDictionary option??
-                    cell_value = cell_value.strip()
-                    row[cell_index] = cell_value.strip()
+                if self.info:
+                    # only strip white space if strip_extra_white is True
+                    if self.info[cell_index].get('strip_extra_white', True) and isinstance(cell_value, str):
+                        cell_value = cell_value.strip()
+                        row[cell_index] = cell_value.strip()
                 if not cell_value:
                     continue
                 cell_type = self.types[cell_index] if self.types else None

diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py
@@ -4,12 +4,13 @@
 
 from ckan import plugins
 from ckan.plugins import toolkit
+from ckanext.datastore.interfaces import IDataDictionaryForm
 
 from ckan.model.domain_object import DomainObjectOperation
 from ckan.model.resource import Resource
 from ckan.model.package import Package
 
-from . import action, auth, helpers as xloader_helpers, utils
+from . import action, auth, helpers as xloader_helpers, utils, validators
 from ckanext.xloader.utils import XLoaderFormats
 
 try:
@@ -34,6 +35,8 @@ class xloaderPlugin(plugins.SingletonPlugin):
     plugins.implements(plugins.IResourceController, inherit=True)
     plugins.implements(plugins.IClick)
     plugins.implements(plugins.IBlueprint)
+    plugins.implements(plugins.IValidators)
+    plugins.implements(IDataDictionaryForm, inherit=True)
 
     # IClick
     def get_commands(self):
@@ -207,6 +210,18 @@ def get_helpers(self):
             "is_resource_supported_by_xloader": xloader_helpers.is_resource_supported_by_xloader,
         }
 
+    # IValidators
+
+    def get_validators(self):
+        """Return this plugin's validators, keyed by the name they are looked up under."""
+        validator_map = {}
+        validator_map['xloader_datastore_fields_validator'] = validators.datastore_fields_validator
+        return validator_map
+
+    # IDataDictionaryForm
+
+    def update_datastore_create_schema(self, schema):
+        """Put the xloader fields validator first in the ``info`` validator list.
+
+        :param schema: datastore create schema; ``schema['fields']['info']``
+            is replaced with a new list
+        :returns: the same ``schema`` object
+        """
+        xloader_validator = toolkit.get_validator('xloader_datastore_fields_validator')
+        field_schema = schema['fields']
+        # Build a new list with the xloader validator listed ahead of the existing ones.
+        field_schema['info'] = [xloader_validator] + field_schema['info']
+        return schema
+
 
 def _should_remove_unsupported_resource_from_datastore(res_dict):
     if not toolkit.asbool(toolkit.config.get('ckanext.xloader.clean_datastore_tables', False)):

diff --git a/ckanext/xloader/templates/datastore/snippets/dictionary_form.html b/ckanext/xloader/templates/datastore/snippets/dictionary_form.html
@@ -0,0 +1,11 @@
+{% ckan_extends %}
+{% import 'macros/form.html' as form %}
+
+{% block additional_fields %}
+  {{ super() }}
+  {# Per-field choice stored as info.strip_extra_white; option labels are translated like the field label. #}
+  {{ form.select('info__' ~ position ~ '__strip_extra_white',
+    label=_('Strip Extra Leading and Trailing White Space'), options=[
+    {'text': _('Yes'), 'value': true},
+    {'text': _('No'), 'value': false},
+    ], selected=field.get('info', {}).get('strip_extra_white')) }}
+{% endblock %}
diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py
@@ -227,7 +227,7 @@ def test_boston_311(self, Session):
                 None,
                 u"ONTIME",
                 u"Open",
-                u" ",
+                None,  # " " transforms to None
                 u"Street Light Outages",
                 u"Public Works Department",
                 u"Street Lights",
@@ -259,14 +259,14 @@ def test_boston_311(self, Session):
                 None,
                 u"ONTIME",
                 u"Open",
-                u" ",
+                None,  # " " transforms to None
                 u"Graffiti Removal",
                 u"Property Management",
                 u"Graffiti",
                 u"Graffiti Removal",
                 u"PROP_GRAF_GraffitiRemoval",
                 u"PROP",
-                u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg",
+                u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg",  # strip white spaces
                 None,
                 u"522 Saratoga St  East Boston  MA  02128",
                 u"1",
@@ -291,14 +291,14 @@ def test_boston_311(self, Session):
                 None,
                 u"ONTIME",
                 u"Open",
-                u" ",
+                None,  # " " transforms to None
                 u"Graffiti Removal",
                 u"Property Management",
                 u"Graffiti",
                 u"Graffiti Removal",
                 u"PROP_GRAF_GraffitiRemoval",
                 u"PROP",
-                u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg",
+                u"https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg",  # strip white spaces
                 None,
                 u"965 Bennington St  East Boston  MA  02128",
                 u"1",
@@ -1088,7 +1088,7 @@ def test_boston_311(self, Session):
                 u"Graffiti Removal",
                 u"PROP_GRAF_GraffitiRemoval",
                 u"PROP",
-                u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg",   # strip white spaces
+                u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg",  # strip white spaces
                 u"",
                 u"522 Saratoga St  East Boston  MA  02128",
                 Decimal("1"),

diff --git a/ckanext/xloader/validators.py b/ckanext/xloader/validators.py
@@ -0,0 +1,12 @@
+from ckan.plugins.toolkit import asbool
+
+
+def datastore_fields_validator(value, context):
+    """Normalise the ``strip_extra_white`` flag inside a field ``info`` dict.
+
+    :param value: the value being validated; presumably the ``info`` dict
+        of a datastore field (the plugin attaches this validator to
+        ``schema['fields']['info']``). A dict is updated in place.
+    :param context: validation context (unused)
+    :returns: ``value``; when it is a dict, ``strip_extra_white`` is set
+        to a bool, defaulting to ``True`` if the key was absent
+    :raises Invalid: if the supplied flag cannot be read as a boolean
+    """
+    # Imported here so the function does not rely on a module-level import.
+    from ckan.plugins.toolkit import Invalid
+
+    if not isinstance(value, dict):
+        # Nothing to normalise (e.g. no info submitted); leave the value
+        # untouched for the validators that follow in the schema.
+        return value
+
+    try:
+        # An absent flag defaults to True, i.e. white space is stripped.
+        value['strip_extra_white'] = asbool(value.get('strip_extra_white', True))
+    except ValueError:
+        # asbool rejects strings that are neither truthy nor falsy; report
+        # that as a validation error rather than an unhandled exception.
+        raise Invalid('strip_extra_white must be a boolean value')
+
+    return value