Use chardet to confidently guess encoding and use it if above 70% confidence

Also fall back to ISO-8859-1 ('latin1') if all else fails.
duttonw committed Nov 1, 2023
1 parent 12d1445 commit f9ef556
Showing 2 changed files with 44 additions and 14 deletions.
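
The change in a nutshell: sample the file with chardet's UniversalDetector, trust its guess only when the reported confidence is above 0.7, otherwise let Tabulator detect the encoding itself, and fall back to ISO-8859-1 ('latin1') if decoding still fails. A minimal sketch of that decision logic follows; the helper names (guess_encoding, choose_encoding) and the threshold constant are illustrative, not the code in this commit.

    from chardet.universaldetector import UniversalDetector

    CONFIDENCE_THRESHOLD = 0.7    # same cut-off this commit uses
    FALLBACK_ENCODING = 'latin1'  # ISO-8859-1, the last resort when decoding fails


    def guess_encoding(path):
        """Feed the file to chardet line by line and stop once it is confident."""
        detector = UniversalDetector()
        with open(path, 'rb') as f:
            for line in f:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        return detector.result  # e.g. {'encoding': 'EUC-JP', 'confidence': 0.99}


    def choose_encoding(path):
        """Return an encoding to hand to Tabulator, or None to let it auto-detect."""
        result = guess_encoding(path)
        if result.get('encoding') and (result.get('confidence') or 0) > CONFIDENCE_THRESHOLD:
            return result['encoding']
        return None  # Tabulator decides; FALLBACK_ENCODING remains the final fallback
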
57 changes: 43 additions & 14 deletions ckanext/xloader/loader.py
@@ -9,6 +9,7 @@
 from decimal import Decimal
 
 import psycopg2
+from chardet.universaldetector import UniversalDetector
 from six.moves import zip
 from tabulator import config as tabulator_config, EncodingError, Stream, TabulatorException
 from unidecode import unidecode
@@ -30,6 +31,8 @@
 MAX_COLUMN_LENGTH = 63
 tabulator_config.CSV_SAMPLE_LINES = CSV_SAMPLE_LINES
 
+ISO_8859_ENCODING = 'latin1'
+
 
 class UnknownEncodingStream(object):
     """ Provides a context manager that wraps a Tabulator stream
@@ -40,36 +43,55 @@ class UnknownEncodingStream(object):
     only to run into problems later in the file.
     """
 
-    def __init__(self, filepath, file_format, **kwargs):
+    def __init__(self, filepath, file_format, decoding_result, **kwargs):
         self.filepath = filepath
         self.file_format = file_format
         self.stream_args = kwargs
+        self.decoding_result = decoding_result  # {'encoding': 'EUC-JP', 'confidence': 0.99}
 
     def __enter__(self):
         try:
-            self.stream = Stream(self.filepath, format=self.file_format,
-                                 **self.stream_args).__enter__()
+
+            if (self.decoding_result and self.decoding_result['confidence'] and self.decoding_result['confidence'] > 0.7):
+                self.stream = Stream(self.filepath, format=self.file_format, encoding=self.decoding_result['encoding'],
+                                     ** self.stream_args).__enter__()
+            else:
+                self.stream = Stream(self.filepath, format=self.file_format, ** self.stream_args).__enter__()
+
         except (EncodingError, UnicodeDecodeError):
             self.stream = Stream(self.filepath, format=self.file_format,
-                                 encoding='latin1', **self.stream_args).__enter__()
+                                 encoding=ISO_8859_ENCODING, **self.stream_args).__enter__()
         return self.stream
 
     def __exit__(self, *args):
         return self.stream.__exit__(*args)
 
 
+def detect_encoding(file_path):
+    detector = UniversalDetector()
+    with open(file_path, 'rb') as file:
+        for line in file:
+            detector.feed(line)
+            if detector.done:
+                break
+    detector.close()
+    return detector.result  # e.g. {'encoding': 'EUC-JP', 'confidence': 0.99}
+
+
 def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
     '''Loads a CSV into DataStore. Does not create the indexes.'''
 
+    decoding_result = detect_encoding(csv_filepath)
+    logger.info("load_csv: Decoded encoding: %s", decoding_result)
     # Determine the header row
     try:
         file_format = os.path.splitext(csv_filepath)[1].strip('.')
-        with UnknownEncodingStream(csv_filepath, file_format) as stream:
+        with UnknownEncodingStream(csv_filepath, file_format, decoding_result) as stream:
             header_offset, headers = headers_guess(stream.sample)
     except TabulatorException:
         try:
             file_format = mimetype.lower().split('/')[-1]
-            with UnknownEncodingStream(csv_filepath, file_format) as stream:
+            with UnknownEncodingStream(csv_filepath, file_format, decoding_result) as stream:
                 header_offset, headers = headers_guess(stream.sample)
         except TabulatorException as e:
             raise LoaderError('Tabulator error: {}'.format(e))
@@ -100,11 +122,16 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
     logger.info('Ensuring character coding is UTF8')
     f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False)
     try:
-        with UnknownEncodingStream(csv_filepath, file_format,
-                                   skip_rows=skip_rows) as stream:
-            stream.save(target=f_write.name, format='csv', encoding='utf-8',
-                        delimiter=delimiter)
-        csv_filepath = f_write.name
+        save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
+        try:
+            with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
+                                       skip_rows=skip_rows) as stream:
+                stream.save(**save_args)
+        except (EncodingError, UnicodeDecodeError):
+            with Stream(csv_filepath, format=file_format, encoding=ISO_8859_ENCODING,
+                        skip_rows=skip_rows) as stream:
+                stream.save(**save_args)
+        csv_filepath = f_write.name
 
         # datastore db connection
         engine = get_write_engine()
@@ -263,15 +290,17 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
 
     # Determine the header row
     logger.info('Determining column names and types')
+    decoding_result = detect_encoding(table_filepath)
+    logger.info("load_table: Decoded encoding: %s", decoding_result)
     try:
         file_format = os.path.splitext(table_filepath)[1].strip('.')
-        with UnknownEncodingStream(table_filepath, file_format,
+        with UnknownEncodingStream(table_filepath, file_format, decoding_result,
                                    post_parse=[TypeConverter().convert_types]) as stream:
             header_offset, headers = headers_guess(stream.sample)
     except TabulatorException:
         try:
             file_format = mimetype.lower().split('/')[-1]
-            with UnknownEncodingStream(table_filepath, file_format,
+            with UnknownEncodingStream(table_filepath, file_format, decoding_result,
                                        post_parse=[TypeConverter().convert_types]) as stream:
                 header_offset, headers = headers_guess(stream.sample)
         except TabulatorException as e:
@@ -309,7 +338,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
     headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()]
     type_converter = TypeConverter(types=types)
 
-    with UnknownEncodingStream(table_filepath, file_format,
+    with UnknownEncodingStream(table_filepath, file_format, decoding_result,
                                skip_rows=skip_rows,
                                post_parse=[type_converter.convert_types]) as stream:
         def row_iterator():
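
A hypothetical use of the two pieces added above (detect_encoding and the extended UnknownEncodingStream); the file path is made up and the printed result only shows the shape of chardet's output.

    from ckanext.xloader.loader import UnknownEncodingStream, detect_encoding

    result = detect_encoding('/tmp/sample.csv')   # hypothetical path
    print(result)   # e.g. {'encoding': 'Windows-1252', 'confidence': 0.73, ...}

    # A guess above 0.7 confidence is passed straight to Tabulator; anything weaker
    # falls back to Tabulator's own detection, and latin1 is the last resort on
    # EncodingError/UnicodeDecodeError.
    with UnknownEncodingStream('/tmp/sample.csv', 'csv', result) as stream:
        print(stream.sample[:2])   # first few parsed rows
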
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,3 +4,4 @@ six>=1.12.0
 tabulator==1.53.5
 Unidecode==1.0.22
 python-dateutil>=2.8.2
+chardet==5.2.0
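
For comparison, the newly pinned chardet also ships a one-shot chardet.detect() helper. The loader uses the incremental UniversalDetector instead so it can stop reading as soon as the guess settles, which avoids scanning very large uploads end to end. A rough one-shot equivalent, with an illustrative path and sample size:

    import chardet

    with open('/tmp/sample.csv', 'rb') as f:   # hypothetical path
        raw = f.read(64 * 1024)                # a bounded sample is usually enough
    print(chardet.detect(raw))                 # e.g. {'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}
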
