Skip to content

Commit

Permalink
[Importer] Handle malformed data to improve delimiter guessing
Browse files Browse the repository at this point in the history
  • Loading branch information
agl29 committed Aug 2, 2023
1 parent 0c47195 commit 5c7b44e
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 0 deletions.
1 change: 1 addition & 0 deletions desktop/libs/indexer/src/indexer/file_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@ def _valid_character(self, char):
@classmethod
def _guess_dialect(cls, sample):
    """Guess the CSV dialect of ``sample`` and whether it has a header row.

    Args:
        sample: A chunk of the file to inspect, either text or UTF-8
            encoded bytes.

    Returns:
        A ``(dialect, has_header)`` tuple: the dialect detected by
        ``csv.Sniffer.sniff`` and the result of ``cls._hasHeader``.

    Raises:
        csv.Error: Propagated from ``csv.Sniffer.sniff`` when no dialect
            can be determined.
    """
    sniffer = csv.Sniffer()
    # Decode *before* normalising line endings: calling
    # ``bytes.replace`` with ``str`` arguments raises TypeError, so the
    # bytes branch has to be turned into text first.
    text = sample if isinstance(sample, str) else sample.decode('utf-8')
    # Collapse CRLF to LF so stray carriage returns do not skew the
    # sniffer's delimiter guessing.
    text = text.replace('\r\n', '\n')
    dialect = sniffer.sniff(text)
    # Header detection sees the same normalised text the dialect was
    # sniffed from.
    has_header = cls._hasHeader(sniffer, text, dialect)
    return dialect, has_header
Expand Down
16 changes: 16 additions & 0 deletions desktop/libs/indexer/src/indexer/indexers/morphline_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,18 @@ class TestIndexer(object):
'hasHeader': True,
'quoteChar': '"'
}
# Headerless airline-code sample using CRLF record separators and quoted
# fields that contain commas. The parentheses are required: they make the
# adjacent literals concatenate into one string instead of leaving the
# later lines as standalone, discarded expression statements.
simpleCSVString1 = (
    '''02Q,Titan Airways\r\n04Q,Tradewind Aviation\r\n05Q,"Comlux Aviation, AG"\r\n06Q,Master Top Linhas Aereas Ltd.\r\n'''
    '''07Q,Flair Airlines Ltd.\r\n09Q,"Swift Air, LLC"\r\n0BQ,DCA\r\n0CQ,ACM AIR CHARTER GmbH\r\n0FQ,"Maine Aviation Aircraft Charter, LLC"'''
    '''\r\n0GQ,"Inter Island Airways, d/b/a Inter Island Air"\r\n0HQ,Polar Airlines de Mexico d/b/a Nova Air\r\n0J,JetClub AG'''
    '''\r\n0JQ,Vision Airlines\r\n0KQ,"Mokulele Flight Services, Inc."\r\n0LQ,"Metropix UK, LLP."'''
    '''\r\n0MQ,"Multi-Aero, Inc. d/b/a Air Choice One"\r\n0Q,Flying Service N.V.\r\n16,PSA Airlines Inc.\r\n17,Piedmont Airlines\r\n'''
)
# Format the indexer is expected to guess for simpleCSVString1:
# comma-separated, LF record separator, double-quote quoting, no header.
simpleCSVFormat1 = dict(
    type='csv',
    fieldSeparator=',',
    recordSeparator='\n',
    hasHeader=False,
    quoteChar='"',
)

def setUp(self):
self.c = make_logged_in_client(is_superuser=False)
Expand All @@ -117,15 +129,19 @@ def tearDown(self):

def test_guess_csv_format(self):
stream = string_io(TestIndexer.simpleCSVString)
stream1 = string_io(TestIndexer.simpleCSVString1)
indexer = MorphlineIndexer(self.user, solr_client=self.solr_client)

guessed_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}})
guessed_format1 = indexer.guess_format({'file': {"stream": stream1, "name": "test.csv"}})

fields = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": guessed_format})['columns']
# test format
expected_format = self.simpleCSVFormat
expected_format1 = self.simpleCSVFormat1

assert_equal(expected_format, guessed_format)
assert_equal(expected_format1, guessed_format1)

# test fields
expected_fields = self.simpleCSVFields
Expand Down

0 comments on commit 5c7b44e

Please sign in to comment.