From f396f0d41754fd15d9fcf349c3bd39def6ee50d0 Mon Sep 17 00:00:00 2001
From: matt garber
Date: Mon, 24 Jun 2024 15:23:46 -0400
Subject: [PATCH] Switch csv generation to QUOTE_MINIMAL (#117)

---
 src/handlers/site_upload/powerset_merge.py | 4 +---
 tests/site_upload/test_powerset_merge.py   | 4 ++--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/handlers/site_upload/powerset_merge.py b/src/handlers/site_upload/powerset_merge.py
index 4fcc283..d687b5a 100644
--- a/src/handlers/site_upload/powerset_merge.py
+++ b/src/handlers/site_upload/powerset_merge.py
@@ -228,8 +228,6 @@ def generate_csv_from_parquet(bucket_name: str, bucket_root: str, subbucket_path
     last_valid_df = last_valid_df.apply(
         lambda x: x.strip() if isinstance(x, str) else x
     ).replace('""', numpy.nan)
-    # Here we are removing internal commas from fields so we get a valid unquoted CSV
-    last_valid_df = last_valid_df.replace(to_replace=",", value="", regex=True)
     awswrangler.s3.to_csv(
         last_valid_df,
         (
@@ -238,7 +236,7 @@ def generate_csv_from_parquet(bucket_name: str, bucket_root: str, subbucket_path
                 )
             )
         ),
         index=False,
-        quoting=csv.QUOTE_NONE,
+        quoting=csv.QUOTE_MINIMAL,
     )

diff --git a/tests/site_upload/test_powerset_merge.py b/tests/site_upload/test_powerset_merge.py
index 083a8e6..694bbca 100644
--- a/tests/site_upload/test_powerset_merge.py
+++ b/tests/site_upload/test_powerset_merge.py
@@ -377,7 +377,7 @@ def test_parquet_to_csv(mock_bucket):
     )
     assert list(df["race"].dropna().unique()) == [
         "White",
-        "Black or African American",
+        "Black, or African American",
         "Asian",
-        "American Indian or Alaska Native",
+        "American Indian, or Alaska Native",
     ]