From 1c1f7e5ed6e90afdeef0bf80465e2e702155c0c4 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Mon, 23 Oct 2023 15:53:27 +1000
Subject: [PATCH] [QOLSVC-2984] handle Latin-1 encoding if UTF-8 fails

---
 ckanext/xloader/loader.py | 50 +++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 10 deletions(-)

diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
index 4da314a8..9381b85f 100644
--- a/ckanext/xloader/loader.py
+++ b/ckanext/xloader/loader.py
@@ -10,7 +10,7 @@
 
 import psycopg2
 from six.moves import zip
-from tabulator import config as tabulator_config, Stream, TabulatorException
+from tabulator import config as tabulator_config, EncodingError, Stream, TabulatorException
 from unidecode import unidecode
 
 import ckan.plugins as p
@@ -31,18 +31,46 @@
 tabulator_config.CSV_SAMPLE_LINES = CSV_SAMPLE_LINES
 
 
+class UnknownEncodingStream(object):
+    """ Provides a context manager that wraps a Tabulator stream
+    and tries multiple encodings if one fails.
+
+    This is particularly relevant in cases like Latin-1 encoding,
+    which is usually ASCII and thus the sample could be sniffed as UTF-8,
+    only to run into problems later in the file.
+    """
+
+    def __init__(self, filepath, file_format, **kwargs):
+        self.filepath = filepath
+        self.file_format = file_format
+        self.stream_args = kwargs
+
+    def __enter__(self):
+        try:
+            # Open inside the try: Stream() itself does no I/O, open() does.
+            self.stream = Stream(self.filepath, format=self.file_format,
+                                 **self.stream_args).__enter__()
+        except EncodingError:
+            self.stream = Stream(self.filepath, format=self.file_format,
+                                 encoding='latin1', **self.stream_args).__enter__()
+        return self.stream
+
+    def __exit__(self, *args):
+        return self.stream.__exit__(*args)
+
+
 def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
     '''Loads a CSV into DataStore. Does not create the indexes.'''
 
     # Determine the header row
     try:
         file_format = os.path.splitext(csv_filepath)[1].strip('.')
-        with Stream(csv_filepath, format=file_format) as stream:
+        with UnknownEncodingStream(csv_filepath, file_format) as stream:
             header_offset, headers = headers_guess(stream.sample)
     except TabulatorException:
         try:
             file_format = mimetype.lower().split('/')[-1]
-            with Stream(csv_filepath, format=file_format) as stream:
+            with UnknownEncodingStream(csv_filepath, file_format) as stream:
                 header_offset, headers = headers_guess(stream.sample)
         except TabulatorException as e:
             raise LoaderError('Tabulator error: {}'.format(e))
@@ -73,7 +101,8 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
     logger.info('Ensuring character coding is UTF8')
     f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False)
     try:
-        with Stream(csv_filepath, format=file_format, skip_rows=skip_rows) as stream:
+        with UnknownEncodingStream(csv_filepath, file_format,
+                                   skip_rows=skip_rows) as stream:
             stream.save(target=f_write.name, format='csv', encoding='utf-8',
                         delimiter=delimiter)
             csv_filepath = f_write.name
@@ -237,14 +266,14 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
     logger.info('Determining column names and types')
     try:
         file_format = os.path.splitext(table_filepath)[1].strip('.')
-        with Stream(table_filepath, format=file_format,
-                    post_parse=[TypeConverter().convert_types]) as stream:
+        with UnknownEncodingStream(table_filepath, file_format,
+                                   post_parse=[TypeConverter().convert_types]) as stream:
             header_offset, headers = headers_guess(stream.sample)
     except TabulatorException:
         try:
             file_format = mimetype.lower().split('/')[-1]
-            with Stream(table_filepath, format=file_format,
-                        post_parse=[TypeConverter().convert_types]) as stream:
+            with UnknownEncodingStream(table_filepath, file_format,
+                                       post_parse=[TypeConverter().convert_types]) as stream:
                 header_offset, headers = headers_guess(stream.sample)
         except TabulatorException as e:
             raise LoaderError('Tabulator error: {}'.format(e))
@@ -281,8 +310,9 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
     headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()]
     type_converter = TypeConverter(types=types)
 
-    with Stream(table_filepath, format=file_format, skip_rows=skip_rows,
-                post_parse=[type_converter.convert_types]) as stream:
+    with UnknownEncodingStream(table_filepath, file_format,
+                               skip_rows=skip_rows,
+                               post_parse=[type_converter.convert_types]) as stream:
         def row_iterator():
             for row in stream:
                 data_row = {}