Skip to content

Commit

Permalink
[QOLSVC-2984] handle Latin-1 encoding if UTF-8 fails
Browse files Browse the repository at this point in the history
  • Loading branch information
ThrawnCA committed Oct 23, 2023
1 parent 5898000 commit 1c1f7e5
Showing 1 changed file with 40 additions and 10 deletions.
50 changes: 40 additions & 10 deletions ckanext/xloader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import psycopg2
from six.moves import zip
from tabulator import config as tabulator_config, Stream, TabulatorException
from tabulator import config as tabulator_config, EncodingError, Stream, TabulatorException
from unidecode import unidecode

import ckan.plugins as p
Expand All @@ -31,18 +31,46 @@
tabulator_config.CSV_SAMPLE_LINES = CSV_SAMPLE_LINES


class UnknownEncodingStream(object):
    """ Provides a context manager that wraps a Tabulator stream
    and tries multiple encodings if one fails.

    This is particularly relevant in cases like Latin-1 encoding,
    which is usually ASCII and thus the sample could be sniffed as UTF-8,
    only to run into problems later in the file.
    """

    def __init__(self, filepath, file_format, **kwargs):
        # Path of the file to stream.
        self.filepath = filepath
        # Tabulator format hint (e.g. 'csv', 'xlsx').
        self.file_format = file_format
        # Extra keyword arguments passed through to Stream()
        # (e.g. skip_rows, post_parse).
        self.stream_args = kwargs

    def __enter__(self):
        """ Open the stream, falling back to Latin-1 on encoding failure.

        Note: Stream() does not touch the file at construction time;
        encoding problems only surface when the stream is opened (its
        own __enter__). The open call must therefore be inside the
        try block, otherwise the EncodingError would escape uncaught
        and the latin1 fallback would never be attempted.
        """
        try:
            self.stream = Stream(self.filepath, format=self.file_format,
                                 **self.stream_args).__enter__()
        except (EncodingError, UnicodeDecodeError):
            # Latin-1 maps every byte to a character, so this fallback
            # cannot itself fail with a decode error.
            self.stream = Stream(self.filepath, format=self.file_format,
                                 encoding='latin1',
                                 **self.stream_args).__enter__()
        return self.stream

    def __exit__(self, *args):
        # Delegate cleanup to the underlying stream's context manager.
        return self.stream.__exit__(*args)


def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
'''Loads a CSV into DataStore. Does not create the indexes.'''

# Determine the header row
try:
file_format = os.path.splitext(csv_filepath)[1].strip('.')
with Stream(csv_filepath, format=file_format) as stream:
with UnknownEncodingStream(csv_filepath, file_format) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException:
try:
file_format = mimetype.lower().split('/')[-1]
with Stream(csv_filepath, format=file_format) as stream:
with UnknownEncodingStream(csv_filepath, file_format) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException as e:
raise LoaderError('Tabulator error: {}'.format(e))
Expand Down Expand Up @@ -73,7 +101,8 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('Ensuring character coding is UTF8')
f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False)
try:
with Stream(csv_filepath, format=file_format, skip_rows=skip_rows) as stream:
with UnknownEncodingStream(csv_filepath, file_format,
skip_rows=skip_rows) as stream:
stream.save(target=f_write.name, format='csv', encoding='utf-8',
delimiter=delimiter)
csv_filepath = f_write.name
Expand Down Expand Up @@ -237,14 +266,14 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('Determining column names and types')
try:
file_format = os.path.splitext(table_filepath)[1].strip('.')
with Stream(table_filepath, format=file_format,
post_parse=[TypeConverter().convert_types]) as stream:
with UnknownEncodingStream(table_filepath, file_format,
post_parse=[TypeConverter().convert_types]) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException:
try:
file_format = mimetype.lower().split('/')[-1]
with Stream(table_filepath, format=file_format,
post_parse=[TypeConverter().convert_types]) as stream:
with UnknownEncodingStream(table_filepath, file_format,
post_parse=[TypeConverter().convert_types]) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException as e:
raise LoaderError('Tabulator error: {}'.format(e))
Expand Down Expand Up @@ -281,8 +310,9 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()]
type_converter = TypeConverter(types=types)

with Stream(table_filepath, format=file_format, skip_rows=skip_rows,
post_parse=[type_converter.convert_types]) as stream:
with UnknownEncodingStream(table_filepath, file_format,
skip_rows=skip_rows,
post_parse=[type_converter.convert_types]) as stream:
def row_iterator():
for row in stream:
data_row = {}
Expand Down

0 comments on commit 1c1f7e5

Please sign in to comment.