Use chardet to confidently guess encoding and use it if above 70% confidence

Also fall back to ISO-8859-1 ('latin1') if all else fails.
duttonw committed Nov 1, 2023
1 parent 12d1445 commit f9ef556
Showing 2 changed files with 44 additions and 14 deletions.
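
The change in a nutshell: sample the file with chardet's UniversalDetector, trust its guess only when the reported confidence is above 0.7, otherwise let Tabulator detect the encoding itself, and fall back to ISO-8859-1 ('latin1') if decoding still fails. A minimal sketch of that decision logic follows; the helper names (guess_encoding, choose_encoding) and the threshold constant are illustrative, not the code in this commit.

    from chardet.universaldetector import UniversalDetector

    CONFIDENCE_THRESHOLD = 0.7    # same cut-off this commit uses
    FALLBACK_ENCODING = 'latin1'  # ISO-8859-1, the last resort when decoding fails


    def guess_encoding(path):
        """Feed the file to chardet line by line and stop once it is confident."""
        detector = UniversalDetector()
        with open(path, 'rb') as f:
            for line in f:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        return detector.result  # e.g. {'encoding': 'EUC-JP', 'confidence': 0.99}


    def choose_encoding(path):
        """Return an encoding to hand to Tabulator, or None to let it auto-detect."""
        result = guess_encoding(path)
        if result.get('encoding') and (result.get('confidence') or 0) > CONFIDENCE_THRESHOLD:
            return result['encoding']
        return None  # Tabulator decides; FALLBACK_ENCODING remains the final fallback
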
57 changes: 43 additions & 14 deletions ckanext/xloader/loader.py
@@ -9,6 +9,7 @@
 from decimal import Decimal
 
 import psycopg2
+from chardet.universaldetector import UniversalDetector
 from six.moves import zip
 from tabulator import config as tabulator_config, EncodingError, Stream, TabulatorException
 from unidecode import unidecode
@@ -30,6 +31,8 @@
 MAX_COLUMN_LENGTH = 63
 tabulator_config.CSV_SAMPLE_LINES = CSV_SAMPLE_LINES
 
+ISO_8859_ENCODING = 'latin1'
+
 
 class UnknownEncodingStream(object):
     """ Provides a context manager that wraps a Tabulator stream
@@ -40,36 +43,55 @@ class UnknownEncodingStream(object):
     only to run into problems later in the file.
     """
 
-    def __init__(self, filepath, file_format, **kwargs):
+    def __init__(self, filepath, file_format, decoding_result, **kwargs):
         self.filepath = filepath
         self.file_format = file_format
         self.stream_args = kwargs
+        self.decoding_result = decoding_result  # {'encoding': 'EUC-JP', 'confidence': 0.99}
 
     def __enter__(self):
         try:
-            self.stream = Stream(self.filepath, format=self.file_format,
-                                 **self.stream_args).__enter__()
+
+            if (self.decoding_result and self.decoding_result['confidence'] and self.decoding_result['confidence'] > 0.7):
+                self.stream = Stream(self.filepath, format=self.file_format, encoding=self.decoding_result['encoding'],
+                                     ** self.stream_args).__enter__()
+            else:
+                self.stream = Stream(self.filepath, format=self.file_format, ** self.stream_args).__enter__()
+
         except (EncodingError, UnicodeDecodeError):
             self.stream = Stream(self.filepath, format=self.file_format,
-                                 encoding='latin1', **self.stream_args).__enter__()
+                                 encoding=ISO_8859_ENCODING, **self.stream_args).__enter__()
         return self.stream
 
     def __exit__(self, *args):
         return self.stream.__exit__(*args)
 
 
+def detect_encoding(file_path):
+    detector = UniversalDetector()
+    with open(file_path, 'rb') as file:
+        for line in file:
+            detector.feed(line)
+            if detector.done:
+                break
+    detector.close()
+    return detector.result  # e.g. {'encoding': 'EUC-JP', 'confidence': 0.99}
+
+
 def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
     '''Loads a CSV into DataStore. Does not create the indexes.'''
 
+    decoding_result = detect_encoding(csv_filepath)
+    logger.info("load_csv: Decoded encoding: %s", decoding_result)
     # Determine the header row
     try:
         file_format = os.path.splitext(csv_filepath)[1].strip('.')
-        with UnknownEncodingStream(csv_filepath, file_format) as stream:
+        with UnknownEncodingStream(csv_filepath, file_format, decoding_result) as stream:
             header_offset, headers = headers_guess(stream.sample)
     except TabulatorException:
         try:
             file_format = mimetype.lower().split('/')[-1]
-            with UnknownEncodingStream(csv_filepath, file_format) as stream:
+            with UnknownEncodingStream(csv_filepath, file_format, decoding_result) as stream:
                 header_offset, headers = headers_guess(stream.sample)
         except TabulatorException as e:
             raise LoaderError('Tabulator error: {}'.format(e))
@@ -100,11 +122,16 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
     logger.info('Ensuring character coding is UTF8')
     f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False)
     try:
-        with UnknownEncodingStream(csv_filepath, file_format,
-                                   skip_rows=skip_rows) as stream:
-            stream.save(target=f_write.name, format='csv', encoding='utf-8',
-                        delimiter=delimiter)
-        csv_filepath = f_write.name
+        save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
+        try:
+            with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
+                                       skip_rows=skip_rows) as stream:
+                stream.save(**save_args)
+        except (EncodingError, UnicodeDecodeError):
+            with Stream(csv_filepath, format=file_format, encoding=ISO_8859_ENCODING,
+                        skip_rows=skip_rows) as stream:
+                stream.save(**save_args)
+        csv_filepath = f_write.name
 
         # datastore db connection
         engine = get_write_engine()
@@ -263,15 +290,17 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
 
     # Determine the header row
     logger.info('Determining column names and types')
+    decoding_result = detect_encoding(table_filepath)
+    logger.info("load_table: Decoded encoding: %s", decoding_result)
     try:
         file_format = os.path.splitext(table_filepath)[1].strip('.')
-        with UnknownEncodingStream(table_filepath, file_format,
+        with UnknownEncodingStream(table_filepath, file_format, decoding_result,
                                    post_parse=[TypeConverter().convert_types]) as stream:
             header_offset, headers = headers_guess(stream.sample)
     except TabulatorException:
         try:
             file_format = mimetype.lower().split('/')[-1]
-            with UnknownEncodingStream(table_filepath, file_format,
+            with UnknownEncodingStream(table_filepath, file_format, decoding_result,
                                        post_parse=[TypeConverter().convert_types]) as stream:
                 header_offset, headers = headers_guess(stream.sample)
         except TabulatorException as e:
@@ -309,7 +338,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
     headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()]
     type_converter = TypeConverter(types=types)
 
-    with UnknownEncodingStream(table_filepath, file_format,
+    with UnknownEncodingStream(table_filepath, file_format, decoding_result,
                                skip_rows=skip_rows,
                                post_parse=[type_converter.convert_types]) as stream:
         def row_iterator():
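
A hypothetical use of the two pieces added above (detect_encoding and the extended UnknownEncodingStream); the file path is made up and the printed result only shows the shape of chardet's output.

    from ckanext.xloader.loader import UnknownEncodingStream, detect_encoding

    result = detect_encoding('/tmp/sample.csv')   # hypothetical path
    print(result)   # e.g. {'encoding': 'Windows-1252', 'confidence': 0.73, ...}

    # A guess above 0.7 confidence is passed straight to Tabulator; anything weaker
    # falls back to Tabulator's own detection, and latin1 is the last resort on
    # EncodingError/UnicodeDecodeError.
    with UnknownEncodingStream('/tmp/sample.csv', 'csv', result) as stream:
        print(stream.sample[:2])   # first few parsed rows
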
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,3 +4,4 @@ six>=1.12.0
 tabulator==1.53.5
 Unidecode==1.0.22
 python-dateutil>=2.8.2
+chardet==5.2.0
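
For comparison, the newly pinned chardet also ships a one-shot chardet.detect() helper. The loader uses the incremental UniversalDetector instead so it can stop reading as soon as the guess settles, which avoids scanning very large uploads end to end. A rough one-shot equivalent, with an illustrative path and sample size:

    import chardet

    with open('/tmp/sample.csv', 'rb') as f:   # hypothetical path
        raw = f.read(64 * 1024)                # a bounded sample is usually enough
    print(chardet.detect(raw))                 # e.g. {'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}
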
