diff --git a/docs/changelog/next_release/161.feature.rst b/docs/changelog/next_release/161.feature.rst new file mode 100644 index 00000000..a1fde077 --- /dev/null +++ b/docs/changelog/next_release/161.feature.rst @@ -0,0 +1 @@ +Add compression options to XML file format \ No newline at end of file diff --git a/syncmaster/dto/transfers.py b/syncmaster/dto/transfers.py index 68012179..a8a2469a 100644 --- a/syncmaster/dto/transfers.py +++ b/syncmaster/dto/transfers.py @@ -36,17 +36,24 @@ class FileTransferDTO(TransferDTO): def __post_init__(self): if isinstance(self.file_format, dict): - self.file_format = self._get_format(self.file_format.copy()) + self.file_format = self._get_file_format(self.file_format.copy()) if isinstance(self.df_schema, str): self.df_schema = json.loads(self.df_schema) - def _get_format(self, file_format: dict): - file_type = file_format.pop("type", None) + def _get_file_format(self, file_format: dict) -> CSV | JSONLine | JSON | Excel | XML | ORC | Parquet: + file_type = self._prepare_file_format(file_format) parser_class = self._format_parsers.get(file_type) if parser_class is not None: return parser_class.parse_obj(file_format) raise ValueError(f"Unknown file type: {file_type}") + @staticmethod + def _prepare_file_format(file_format: dict) -> str | None: + file_type = file_format.pop("type", None) + if file_type == "xml" and file_format.get("compression") == "none": + file_format.pop("compression") + return file_type + @dataclass class PostgresTransferDTO(DBTransferDTO): diff --git a/syncmaster/schemas/v1/transfers/file_format.py b/syncmaster/schemas/v1/transfers/file_format.py index 07ecffd7..c8090b78 100644 --- a/syncmaster/schemas/v1/transfers/file_format.py +++ b/syncmaster/schemas/v1/transfers/file_format.py @@ -49,6 +49,14 @@ class CSVCompression(str, Enum): DEFLATE = "deflate" +class XMLCompression(str, Enum): + NONE = "none" + BZIP2 = "bzip2" + GZIP = "gzip" + LZ4 = "lz4" + SNAPPY = "snappy" + + class CSV(BaseModel): type: CSV_FORMAT delimiter: str = "," @@ -84,6 +92,7 @@ class XML(BaseModel): type: XML_FORMAT root_tag: str row_tag: str + compression: XMLCompression = XMLCompression.GZIP class ORC(BaseModel): diff --git a/tests/test_integration/test_run_transfer/test_hdfs.py b/tests/test_integration/test_run_transfer/test_hdfs.py index 9b3b78b3..3ca511eb 100644 --- a/tests/test_integration/test_run_transfer/test_hdfs.py +++ b/tests/test_integration/test_run_transfer/test_hdfs.py @@ -229,8 +229,8 @@ async def test_run_transfer_hdfs_to_postgres( id="parquet", ), pytest.param( - ("xml", {}), - "without_compression", + ("xml", {"compression": "snappy"}), + "with_compression", id="xml", ), ], diff --git a/tests/test_integration/test_run_transfer/test_s3.py b/tests/test_integration/test_run_transfer/test_s3.py index 2a017ffd..c0c1a28d 100644 --- a/tests/test_integration/test_run_transfer/test_s3.py +++ b/tests/test_integration/test_run_transfer/test_s3.py @@ -230,8 +230,8 @@ async def test_run_transfer_s3_to_postgres( id="parquet", ), pytest.param( - ("xml", {}), - "without_compression", + ("xml", {"compression": "none"}), + "with_compression", id="xml", ), ], diff --git a/tests/test_unit/test_transfers/test_file_transfers/test_create_transfer.py b/tests/test_unit/test_transfers/test_file_transfers/test_create_transfer.py index 107ac763..ac52b021 100644 --- a/tests/test_unit/test_transfers/test_file_transfers/test_create_transfer.py +++ b/tests/test_unit/test_transfers/test_file_transfers/test_create_transfer.py @@ -59,6 +59,7 @@ "type": "xml", "root_tag": "data", "row_tag": "record", + "compression": "lz4", }, "options": { "some": "option", @@ -166,6 +167,7 @@ async def test_developer_plus_can_create_s3_transfer( "type": "xml", "root_tag": "data", "row_tag": "record", + "compression": "lz4", }, "orc": { "type": "orc", @@ -221,6 +223,7 @@ async def test_developer_plus_can_create_s3_transfer( "type": "xml", "root_tag": "data", "row_tag": "record", + "compression": "bzip2", }, }, { @@ -320,6 +323,7 @@ async def test_developer_plus_can_create_hdfs_transfer( "type": "xml", "root_tag": "data", "row_tag": "record", + "compression": "bzip2", }, "orc": { "type": "orc", diff --git a/tests/test_unit/test_transfers/test_file_transfers/test_read_transfer.py b/tests/test_unit/test_transfers/test_file_transfers/test_read_transfer.py index c5df6229..f1909082 100644 --- a/tests/test_unit/test_transfers/test_file_transfers/test_read_transfer.py +++ b/tests/test_unit/test_transfers/test_file_transfers/test_read_transfer.py @@ -41,6 +41,7 @@ "type": "xml", "root_tag": "data", "row_tag": "record", + "compression": "bzip2", }, "options": {}, }, diff --git a/tests/test_unit/test_transfers/test_file_transfers/test_update_transfer.py b/tests/test_unit/test_transfers/test_file_transfers/test_update_transfer.py index bfbb32fe..a2467807 100644 --- a/tests/test_unit/test_transfers/test_file_transfers/test_update_transfer.py +++ b/tests/test_unit/test_transfers/test_file_transfers/test_update_transfer.py @@ -41,6 +41,7 @@ "type": "xml", "root_tag": "data", "row_tag": "record", + "compression": "bzip2", }, "options": {}, },