diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index aabaefbe..233a46e6 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -9,6 +9,7 @@ from decimal import Decimal import psycopg2 +from chardet.universaldetector import UniversalDetector from six.moves import zip from tabulator import config as tabulator_config, EncodingError, Stream, TabulatorException from unidecode import unidecode @@ -30,6 +31,8 @@ MAX_COLUMN_LENGTH = 63 tabulator_config.CSV_SAMPLE_LINES = CSV_SAMPLE_LINES +ISO_8859_ENCODING = 'latin1' + class UnknownEncodingStream(object): """ Provides a context manager that wraps a Tabulator stream @@ -40,36 +43,55 @@ class UnknownEncodingStream(object): only to run into problems later in the file. """ - def __init__(self, filepath, file_format, **kwargs): + def __init__(self, filepath, file_format, decoding_result, **kwargs): self.filepath = filepath self.file_format = file_format self.stream_args = kwargs + self.decoding_result = decoding_result # {'encoding': 'EUC-JP', 'confidence': 0.99} def __enter__(self): try: - self.stream = Stream(self.filepath, format=self.file_format, - **self.stream_args).__enter__() + + if (self.decoding_result and self.decoding_result['confidence'] and self.decoding_result['confidence'] > 0.7): + self.stream = Stream(self.filepath, format=self.file_format, encoding=self.decoding_result['encoding'], + ** self.stream_args).__enter__() + else: + self.stream = Stream(self.filepath, format=self.file_format, ** self.stream_args).__enter__() + except (EncodingError, UnicodeDecodeError): self.stream = Stream(self.filepath, format=self.file_format, - encoding='latin1', **self.stream_args).__enter__() + encoding=ISO_8859_ENCODING, **self.stream_args).__enter__() return self.stream def __exit__(self, *args): return self.stream.__exit__(*args) +def detect_encoding(file_path): + detector = UniversalDetector() + with open(file_path, 'rb') as file: + for line in file: + detector.feed(line) + if detector.done: + break + detector.close() + return detector.result # e.g. {'encoding': 'EUC-JP', 'confidence': 0.99} + + def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): '''Loads a CSV into DataStore. Does not create the indexes.''' + decoding_result = detect_encoding(csv_filepath) + logger.info("load_csv: Decoded encoding: %s", decoding_result) # Determine the header row try: file_format = os.path.splitext(csv_filepath)[1].strip('.') - with UnknownEncodingStream(csv_filepath, file_format) as stream: + with UnknownEncodingStream(csv_filepath, file_format, decoding_result) as stream: header_offset, headers = headers_guess(stream.sample) except TabulatorException: try: file_format = mimetype.lower().split('/')[-1] - with UnknownEncodingStream(csv_filepath, file_format) as stream: + with UnknownEncodingStream(csv_filepath, file_format, decoding_result) as stream: header_offset, headers = headers_guess(stream.sample) except TabulatorException as e: raise LoaderError('Tabulator error: {}'.format(e)) @@ -100,11 +122,16 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): logger.info('Ensuring character coding is UTF8') f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False) try: - with UnknownEncodingStream(csv_filepath, file_format, - skip_rows=skip_rows) as stream: - stream.save(target=f_write.name, format='csv', encoding='utf-8', - delimiter=delimiter) - csv_filepath = f_write.name + save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter} + try: + with UnknownEncodingStream(csv_filepath, file_format, decoding_result, + skip_rows=skip_rows) as stream: + stream.save(**save_args) + except (EncodingError, UnicodeDecodeError): + with Stream(csv_filepath, format=file_format, encoding=ISO_8859_ENCODING, + skip_rows=skip_rows) as stream: + stream.save(**save_args) + csv_filepath = f_write.name # datastore db connection engine = get_write_engine() @@ -263,15 +290,17 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): # Determine the header row logger.info('Determining column names and types') + decoding_result = detect_encoding(table_filepath) + logger.info("load_table: Decoded encoding: %s", decoding_result) try: file_format = os.path.splitext(table_filepath)[1].strip('.') - with UnknownEncodingStream(table_filepath, file_format, + with UnknownEncodingStream(table_filepath, file_format, decoding_result, post_parse=[TypeConverter().convert_types]) as stream: header_offset, headers = headers_guess(stream.sample) except TabulatorException: try: file_format = mimetype.lower().split('/')[-1] - with UnknownEncodingStream(table_filepath, file_format, + with UnknownEncodingStream(table_filepath, file_format, decoding_result, post_parse=[TypeConverter().convert_types]) as stream: header_offset, headers = headers_guess(stream.sample) except TabulatorException as e: @@ -309,7 +338,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()] type_converter = TypeConverter(types=types) - with UnknownEncodingStream(table_filepath, file_format, + with UnknownEncodingStream(table_filepath, file_format, decoding_result, skip_rows=skip_rows, post_parse=[type_converter.convert_types]) as stream: def row_iterator(): diff --git a/requirements.txt b/requirements.txt index 58540beb..fe92b6d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ six>=1.12.0 tabulator==1.53.5 Unidecode==1.0.22 python-dateutil>=2.8.2 +chardet==5.2.0 \ No newline at end of file