ckan · ThrawnCA · Jun 25, 2024 · Jan 15, 2024 · Jan 29, 2024 · Feb 12, 2024
diff --git a/ckanext/xloader/config_declaration.yaml b/ckanext/xloader/config_declaration.yaml
@@ -48,6 +48,14 @@ groups:
         type: bool
         required: false
         legacy_key: ckanext.xloader.just_load_with_messytables
+      - key: ckanext.xloader.strict_type_guessing
+        default: True
+        example: False
+        description: |
+            Use with ckanext.xloader.use_type_guessing to enable (True) or disable (False)
+            strict type guessing. If set to False, the types will always fall back to the string type.
+        type: bool
+        required: false
       - key: ckanext.xloader.parse_dates_dayfirst
         default: False
         example: False

diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
@@ -3,6 +3,7 @@
 
 import datetime
 import itertools
+from six import text_type as str, binary_type
 import os
 import os.path
 import tempfile
@@ -266,7 +267,9 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
     skip_rows = list(range(1, header_offset + 2))
 
     TYPES, TYPE_MAPPING = get_types()
-    types = type_guess(stream.sample[1:], types=TYPES, strict=True)
+    strict_guessing = p.toolkit.asbool(
+        config.get('ckanext.xloader.strict_type_guessing', True))
+    types = type_guess(stream.sample[1:], types=TYPES, strict=strict_guessing)
 
     # override with types user requested
     if existing_info:
@@ -333,12 +336,17 @@ def row_iterator():
 
 
 _TYPE_MAPPING = {
+    "<type 'str'>": 'text',
     "<type 'unicode'>": 'text',
+    "<type 'bytes'>": 'text',
     "<type 'bool'>": 'text',
     "<type 'int'>": 'numeric',
     "<type 'float'>": 'numeric',
     "<class 'decimal.Decimal'>": 'numeric',
+    "<type 'datetime.datetime'>": 'timestamp',
     "<class 'str'>": 'text',
+    "<class 'unicode'>": 'text',
+    "<class 'bytes'>": 'text',
     "<class 'bool'>": 'text',
     "<class 'int'>": 'numeric',
     "<class 'float'>": 'numeric',
@@ -347,7 +355,7 @@ def row_iterator():
 
 
 def get_types():
-    _TYPES = [int, bool, str, datetime.datetime, float, Decimal]
+    _TYPES = [int, bool, str, binary_type, datetime.datetime, float, Decimal]
     TYPE_MAPPING = config.get('TYPE_MAPPING', _TYPE_MAPPING)
     return _TYPES, TYPE_MAPPING
 

diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py
@@ -3,13 +3,17 @@
 import json
 import datetime
 
+from six import text_type as str, binary_type
+
 from ckan import model
 from ckan.lib import search
 from collections import defaultdict
 from decimal import Decimal
 
 import ckan.plugins as p
 
+from .job_exceptions import JobError
+
 
 def resource_data(id, resource_id):
 
@@ -149,7 +153,7 @@ def headers_guess(rows, tolerance=1):
     return 0, []
 
 
-TYPES = [int, bool, str, datetime.datetime, float, Decimal]
+TYPES = [int, bool, str, binary_type, datetime.datetime, float, Decimal]
 
 
 def type_guess(rows, types=TYPES, strict=False):
@@ -210,5 +214,7 @@ def type_guess(rows, types=TYPES, strict=False):
         # element in case of a tie
         # See: http://stackoverflow.com/a/6783101/214950
         guesses_tuples = [(t, guess[t]) for t in types if t in guess]
+        if not guesses_tuples:
+            raise JobError('Failed to guess types')
         _columns.append(max(guesses_tuples, key=lambda t_n: t_n[1])[0])
     return _columns