From cbb6bdefeaf61535aeeaff000a453ee91e6d54d4 Mon Sep 17 00:00:00 2001 From: "ayush.goyal" Date: Wed, 2 Aug 2023 12:00:11 +0530 Subject: [PATCH] [Importer] Handle malformed data to improve delimiter guessing --- desktop/libs/indexer/src/indexer/file_format.py | 1 + .../src/indexer/indexers/morphline_tests.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/desktop/libs/indexer/src/indexer/file_format.py b/desktop/libs/indexer/src/indexer/file_format.py index d6979114d37..f985eb9ffd8 100644 --- a/desktop/libs/indexer/src/indexer/file_format.py +++ b/desktop/libs/indexer/src/indexer/file_format.py @@ -350,6 +350,7 @@ def _valid_character(self, char): @classmethod def _guess_dialect(cls, sample): sniffer = csv.Sniffer() + sample = sample.replace('\r\n', '\n') if isinstance(sample, str) else sample.replace(b'\r\n', b'\n') dialect = sniffer.sniff(sample if isinstance(sample, str) else sample.decode('utf-8')) has_header = cls._hasHeader(sniffer, sample, dialect) return dialect, has_header diff --git a/desktop/libs/indexer/src/indexer/indexers/morphline_tests.py b/desktop/libs/indexer/src/indexer/indexers/morphline_tests.py index 337f6fb6bb9..97bdedc2e7a 100644 --- a/desktop/libs/indexer/src/indexer/indexers/morphline_tests.py +++ b/desktop/libs/indexer/src/indexer/indexers/morphline_tests.py @@ -102,6 +102,18 @@ class TestIndexer(object): 'hasHeader': True, 'quoteChar': '"' } + maldformedCSV = '''02Q,Titan Airways\r\n04Q,Tradewind Aviation\r\n05Q,"Comlux Aviation, AG"\r\n06Q,Master Top Linhas Aereas Ltd.\r\n''' + '''07Q,Flair Airlines Ltd.\r\n09Q,"Swift Air, LLC"\r\n0BQ,DCA\r\n0CQ,ACM AIR CHARTER GmbH\r\n0FQ,"Maine Aviation Aircraft Charter, LLC"''' + '''\r\n0GQ,"Inter Island Airways, d/b/a Inter Island Air"\r\n0HQ,Polar Airlines de Mexico d/b/a Nova Air\r\n0J,JetClub AG''' + '''\r\n0JQ,Vision Airlines\r\n0KQ,"Mokulele Flight Services, Inc."\r\n0LQ,"Metropix UK, LLP."''' + '''\r\n0MQ,"Multi-Aero, Inc. 
d/b/a Air Choice One"\r\n0Q,Flying Service N.V.\r\n16,PSA Airlines Inc.\r\n17,Piedmont Airlines\r\n''' + maldformedCSVFormat = { + 'type': 'csv', + 'fieldSeparator': ',', + 'recordSeparator': '\n', + 'hasHeader': False, + 'quoteChar': '"' + } def setUp(self): self.c = make_logged_in_client(is_superuser=False) @@ -117,15 +129,19 @@ def tearDown(self): def test_guess_csv_format(self): stream = string_io(TestIndexer.simpleCSVString) + malformedstream = string_io(TestIndexer.maldformedCSV) indexer = MorphlineIndexer(self.user, solr_client=self.solr_client) guessed_format = indexer.guess_format({'file': {"stream": stream, "name": "test.csv"}}) + processed_guess_format = indexer.guess_format({'file': {"stream": malformedstream, "name": "test.csv"}}) fields = indexer.guess_field_types({"file": {"stream": stream, "name": "test.csv"}, "format": guessed_format})['columns'] # test format expected_format = self.simpleCSVFormat + expected_format_for_malformedCSV = self.maldformedCSVFormat assert_equal(expected_format, guessed_format) + assert_equal(expected_format_for_malformedCSV, processed_guess_format) # test fields expected_fields = self.simpleCSVFields