Skip to content

Commit

Permalink
[QOLSVC-2984] handle Latin-1 encoding if UTF-8 fails
Browse files Browse the repository at this point in the history
  • Loading branch information
ThrawnCA committed Oct 23, 2023
1 parent 5898000 commit 1c1f7e5
Showing 1 changed file with 40 additions and 10 deletions.
50 changes: 40 additions & 10 deletions ckanext/xloader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import psycopg2
from six.moves import zip
from tabulator import config as tabulator_config, Stream, TabulatorException
from tabulator import config as tabulator_config, EncodingError, Stream, TabulatorException
from unidecode import unidecode

import ckan.plugins as p
Expand All @@ -31,18 +31,46 @@
tabulator_config.CSV_SAMPLE_LINES = CSV_SAMPLE_LINES


class UnknownEncodingStream(object):
    """ Provides a context manager that wraps a Tabulator stream
    and tries multiple encodings if one fails.

    This is particularly relevant in cases like Latin-1 encoding,
    which is usually ASCII and thus the sample could be sniffed as UTF-8,
    only to run into problems later in the file.
    """

    def __init__(self, filepath, file_format, **kwargs):
        # Path of the file to stream.
        self.filepath = filepath
        # Tabulator format hint (e.g. 'csv', 'xlsx').
        self.file_format = file_format
        # Extra keyword arguments passed through to Stream()
        # (e.g. skip_rows, post_parse).
        self.stream_args = kwargs

    def __enter__(self):
        """ Open the stream, falling back to Latin-1 on encoding failure.

        Note: Stream() does not touch the file at construction time;
        encoding problems only surface when the stream is opened (its
        own __enter__). The open call must therefore be inside the
        try block, otherwise the EncodingError would escape uncaught
        and the latin1 fallback would never be attempted.
        """
        try:
            self.stream = Stream(self.filepath, format=self.file_format,
                                 **self.stream_args).__enter__()
        except (EncodingError, UnicodeDecodeError):
            # Latin-1 maps every byte to a character, so this fallback
            # cannot itself fail with a decode error.
            self.stream = Stream(self.filepath, format=self.file_format,
                                 encoding='latin1',
                                 **self.stream_args).__enter__()
        return self.stream

    def __exit__(self, *args):
        # Delegate cleanup to the underlying stream's context manager.
        return self.stream.__exit__(*args)


def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
'''Loads a CSV into DataStore. Does not create the indexes.'''

# Determine the header row
try:
file_format = os.path.splitext(csv_filepath)[1].strip('.')
with Stream(csv_filepath, format=file_format) as stream:
with UnknownEncodingStream(csv_filepath, file_format) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException:
try:
file_format = mimetype.lower().split('/')[-1]
with Stream(csv_filepath, format=file_format) as stream:
with UnknownEncodingStream(csv_filepath, file_format) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException as e:
raise LoaderError('Tabulator error: {}'.format(e))
Expand Down Expand Up @@ -73,7 +101,8 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('Ensuring character coding is UTF8')
f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False)
try:
with Stream(csv_filepath, format=file_format, skip_rows=skip_rows) as stream:
with UnknownEncodingStream(csv_filepath, file_format,
skip_rows=skip_rows) as stream:
stream.save(target=f_write.name, format='csv', encoding='utf-8',
delimiter=delimiter)
csv_filepath = f_write.name
Expand Down Expand Up @@ -237,14 +266,14 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('Determining column names and types')
try:
file_format = os.path.splitext(table_filepath)[1].strip('.')
with Stream(table_filepath, format=file_format,
post_parse=[TypeConverter().convert_types]) as stream:
with UnknownEncodingStream(table_filepath, file_format,
post_parse=[TypeConverter().convert_types]) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException:
try:
file_format = mimetype.lower().split('/')[-1]
with Stream(table_filepath, format=file_format,
post_parse=[TypeConverter().convert_types]) as stream:
with UnknownEncodingStream(table_filepath, file_format,
post_parse=[TypeConverter().convert_types]) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException as e:
raise LoaderError('Tabulator error: {}'.format(e))
Expand Down Expand Up @@ -281,8 +310,9 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()]
type_converter = TypeConverter(types=types)

with Stream(table_filepath, format=file_format, skip_rows=skip_rows,
post_parse=[type_converter.convert_types]) as stream:
with UnknownEncodingStream(table_filepath, file_format,
skip_rows=skip_rows,
post_parse=[type_converter.convert_types]) as stream:
def row_iterator():
for row in stream:
data_row = {}
Expand Down

0 comments on commit 1c1f7e5

Please sign in to comment.