From 348c1d81753cf1f86d9d2c24bdb51b12113e36ad Mon Sep 17 00:00:00 2001
From: Julius Schlensok
Date: Tue, 6 Aug 2024 14:34:57 +0000
Subject: [PATCH 1/2] chore(tests): switch Parquet tests from pytest to unittest

---
 tests/unit_tests/test_parquet.py | 102 +++++++++++++++----------------
 1 file changed, 49 insertions(+), 53 deletions(-)

diff --git a/tests/unit_tests/test_parquet.py b/tests/unit_tests/test_parquet.py
index 61e9cf9..2b6fb82 100644
--- a/tests/unit_tests/test_parquet.py
+++ b/tests/unit_tests/test_parquet.py
@@ -1,12 +1,13 @@
 import shutil
 import sys
+import tempfile
+import unittest
 from contextlib import nullcontext
 from pathlib import Path
 
 import pandas as pd
 import pyarrow as pa
 import pyarrow.parquet as pq
-import pytest
 import scipy
 
 if "typeguard" in sys.modules:
@@ -15,87 +16,82 @@
 from spectrum_io.file import parquet
 
 
-class TestParquet:
+class TestParquet(unittest.TestCase):
     """Test class to check Parquet file I/O."""
 
-    def test_read_file(self, raw_data, tmpdir):
+    def setUp(self):  # noqa: D102
+        # Simple toy MS data containing float, list[float], str, int, and list[int]
+        self.raw_data = {
+            "scan_number": [1, 234, 5678],
+            "intensities": [
+                [4e-5, 0.0, -1.0, 0.0, 0.0, -1.0, 0.03, 0.0, -1.0, 0.4],
+                [0.3, 0.0, -1.0, 1.0, 0.0, -1.0, 0.4, 0.0, -1.0, 0.05],
+                [0.04, 0.0, 0.0, 0.0, 0.0, 0.0, 2e-3, 0.0, 0.0, 0.13],
+            ],
+            "sequence": ["SVFLTFLR", "KTSQIFLAK", "SPVGRVTPKEWR"],
+            "precursor_charge_onehot": [
+                [0, 1, 0, 0, 0, 0],
+                [0, 1, 0, 0, 0, 0],
+                [0, 0, 1, 0, 0, 0],
+            ],
+            "collision_energy_normed": [0.250827308624, 0.288798207462, 0.2887064038764],
+        }
+        self.temp_dir = Path(tempfile.mkdtemp())
+
+    def tearDown(self):  # noqa: D102
+        shutil.rmtree(self.temp_dir)
+
+    def test_read_file(self):
         """Test read operation for a single dataset."""
-        output_path = Path(tmpdir / "table.parquet")
-        pq.write_table(pa.Table.from_pydict(raw_data), output_path)
+        output_path = self.temp_dir / "table.parquet"
+        pq.write_table(pa.Table.from_pydict(self.raw_data), output_path)
         df = parquet.read_file(output_path)
-        output_path.unlink()
-        pd.testing.assert_frame_equal(df, pd.DataFrame(raw_data))
+        pd.testing.assert_frame_equal(df, pd.DataFrame(self.raw_data))
 
-    def test_write_file(self, raw_data, tmpdir):
+    def test_write_file(self):
        """Check write operation for a single dataset."""
-        output_path = Path(tmpdir / "table.parquet")
-        df = pd.DataFrame(raw_data)
+        output_path = self.temp_dir / "table.parquet"
+        df = pd.DataFrame(self.raw_data)
         parquet.write_file(df, output_path)
         pd.testing.assert_frame_equal(df, pd.read_parquet(output_path))
-        output_path.unlink()
 
-    def test_read_write_partition(self, raw_data, tmpdir):
+    def test_read_write_partition(self):
         """Check whether data is unmodified after being written to and then read from a partitioned dataset."""
-        output_path = Path(tmpdir / "partition")
-        df = pd.DataFrame(raw_data)
+        output_path = self.temp_dir / "partition"
+        df = pd.DataFrame(self.raw_data)
         parquet.write_partition([df, df], output_path, ["dataset_1", "dataset_2"])
         read_df = parquet.read_partition(output_path, "dataset_1")
-        shutil.rmtree(output_path)
         pd.testing.assert_frame_equal(read_df, df)
 
-    def test_read_write_partition_integer_key(self, raw_data, tmpdir):
+    def test_read_write_partition_integer_key(self):
         """Check whether Parquet's under-the-hood conversion of string to integer keys is handled seamlessly."""
-        output_path = Path(tmpdir / "partition")
-        df = pd.DataFrame(raw_data)
+        output_path = self.temp_dir / "partition"
+        df = pd.DataFrame(self.raw_data)
         parquet.write_partition([df, df], output_path, ["1", "2"])
         read_df = parquet.read_partition(output_path, "1")
-        shutil.rmtree(output_path)
         pd.testing.assert_frame_equal(read_df, df)
 
-    def test_modify_partition(self, raw_data, tmpdir):
+    def test_modify_partition(self):
         """Check whether file content stays the same when writing new data to the same partitioned directory."""
-        output_path = Path(tmpdir / "partition")
-        df = pd.DataFrame(raw_data)
+        output_path = self.temp_dir / "partition"
+        df = pd.DataFrame(self.raw_data)
         parquet.write_partition([df, df], output_path, ["1", "2"])
         parquet.write_partition([df, df, df], output_path, ["1", "2", "3"])
         read_df = parquet.read_partition(output_path, "2")
-        shutil.rmtree(output_path)
         pd.testing.assert_frame_equal(read_df, df)
 
-    def test_write_not_implemented(self, raw_data, tmpdir):
+    def test_write_not_implemented(self):
         """Check whether write_file() raises a NotImplementedError if provided with an unsupported object."""
-        with pytest.raises(NotImplementedError):
+        with self.assertRaises(NotImplementedError):
             with suppress_type_checks() if "typeguard" in sys.modules else nullcontext():
-                output_path = Path(tmpdir / "table.parquet")
-                df = pd.DataFrame(raw_data).to_numpy()
+                output_path = self.temp_dir / "table.parquet"
+                df = pd.DataFrame(self.raw_data).to_numpy()
                 parquet.write_file(df, output_path)
-                output_path.unlink()
 
-    def test_read_write_partition_not_implemented(self, raw_data, tmpdir):
+    def test_read_write_partition_not_implemented(self):
         """Check whether write_partition() raises a NotImplementedError if provided with an unsupported object."""
-        with pytest.raises(NotImplementedError):
+        with self.assertRaises(NotImplementedError):
             with suppress_type_checks() if "typeguard" in sys.modules else nullcontext():
-                output_path = Path(tmpdir / "partition")
-                df = pd.DataFrame(raw_data).to_numpy()
+                output_path = self.temp_dir / "partition"
+                df = pd.DataFrame(self.raw_data).to_numpy()
                 parquet.write_partition([df, df], output_path, ["dataset_1", "dataset_2"])
-                output_path.unlink()
-
-
-@pytest.fixture
-def raw_data():
-    """Simple toy MS data containing float, list[float], str, int, and list[int]."""
-    return {
-        "scan_number": [1, 234, 5678],
-        "intensities": [
-            [4e-5, 0.0, -1.0, 0.0, 0.0, -1.0, 0.03, 0.0, -1.0, 0.4],
-            [0.3, 0.0, -1.0, 1.0, 0.0, -1.0, 0.4, 0.0, -1.0, 0.05],
-            [0.04, 0.0, 0.0, 0.0, 0.0, 0.0, 2e-3, 0.0, 0.0, 0.13],
-        ],
-        "sequence": ["SVFLTFLR", "KTSQIFLAK", "SPVGRVTPKEWR"],
-        "precursor_charge_onehot": [
-            [0, 1, 0, 0, 0, 0],
-            [0, 1, 0, 0, 0, 0],
-            [0, 0, 1, 0, 0, 0],
-        ],
-        "collision_energy_normed": [0.250827308624, 0.288798207462, 0.2887064038764],
-    }

From 1b9194d2590f852e0506a59a64f536ab49bf121a Mon Sep 17 00:00:00 2001
From: Julius Schlensok
Date: Tue, 6 Aug 2024 14:35:12 +0000
Subject: [PATCH 2/2] refactor(parquet IO): drop unnecessary file format support

Remove TODOs for sparse matrices, anndata matrices and HuggingFace datasets
as their conversion from/to columnar format for serialization as Parquet
files is context-dependent.
Correspondingly remove tests for NotImplementedErrors as they are now
redundant.
---
 spectrum_io/file/parquet.py      | 27 ++++++---------------------
 tests/unit_tests/test_parquet.py | 16 ----------------
 2 files changed, 6 insertions(+), 37 deletions(-)

diff --git a/spectrum_io/file/parquet.py b/spectrum_io/file/parquet.py
index 8fc164f..b3837de 100644
--- a/spectrum_io/file/parquet.py
+++ b/spectrum_io/file/parquet.py
@@ -7,12 +7,7 @@
 import pyarrow.parquet as pq
 import scipy
 
-# TODO add sparse matrix / anndata support
-# TODO add speed benchmarks
-# TODO add support for HuggingFace datasets API
-
 Pathlike = Union[Path, str]
-Dataset = Union[pd.DataFrame, scipy.sparse.spmatrix]
 
 logger = logging.getLogger(__name__)
 
@@ -53,36 +48,26 @@ def read_partition(path: Pathlike, dataset_name: str) -> pd.DataFrame:
     return df
 
 
-def write_file(data: Dataset, path: Pathlike) -> None:
+def write_file(data: pd.DataFrame, path: Pathlike) -> None:
     """Writes a single DataFrame or matrix to a Parquet file.
 
     :param data: Data to store
     :param path: Path to write the Parquet file to
-
-    :raises NotImplementedError: if anything else but a Pandas DataFrame is used as the dataset
     """
-    if isinstance(data, pd.DataFrame):
-        data.to_parquet(path)
-    else:
-        raise NotImplementedError
+    data.to_parquet(path)
 
 
-def write_partition(datasets: List[Dataset], path: Pathlike, dataset_names: List[str]) -> None:
+def write_partition(datasets: List[pd.DataFrame], path: Pathlike, dataset_names: List[str]) -> None:
     """
-    Write several datasets to a Parquet dataset as a directory containing subdirectories partinioned by dataset name.
+    Write several datasets to a Parquet dataset as a directory containing subdirectories partitioned by dataset name.
 
     :param datasets: Datasets to write
     :param path: Root path to write the partitioned dataset to
     :param dataset_names: Names to assign to the datasets for retrieval. Careful: If all of these are strings of
         ints, Parquet will convert them to raw integers!
-
-    :raises NotImplementedError: if anything else but a Pandas DataFrame is used as the dataset
     """
-    if all(isinstance(x, pd.DataFrame) for x in datasets):
-        df = pd.concat([dataset.assign(dataset=name) for dataset, name in zip(datasets, dataset_names)])
-        table = pa.Table.from_pandas(df)
-    else:
-        raise NotImplementedError
+    df = pd.concat([dataset.assign(dataset=name) for dataset, name in zip(datasets, dataset_names)])
+    table = pa.Table.from_pandas(df)
 
     if isinstance(path, str):
         path = Path(path)
diff --git a/tests/unit_tests/test_parquet.py b/tests/unit_tests/test_parquet.py
index 2b6fb82..b89f7c5 100644
--- a/tests/unit_tests/test_parquet.py
+++ b/tests/unit_tests/test_parquet.py
@@ -79,19 +79,3 @@ def test_modify_partition(self):
         parquet.write_partition([df, df, df], output_path, ["1", "2", "3"])
         read_df = parquet.read_partition(output_path, "2")
         pd.testing.assert_frame_equal(read_df, df)
-
-    def test_write_not_implemented(self):
-        """Check whether write_file() raises a NotImplementedError if provided with an unsupported object."""
-        with self.assertRaises(NotImplementedError):
-            with suppress_type_checks() if "typeguard" in sys.modules else nullcontext():
-                output_path = self.temp_dir / "table.parquet"
-                df = pd.DataFrame(self.raw_data).to_numpy()
-                parquet.write_file(df, output_path)
-
-    def test_read_write_partition_not_implemented(self):
-        """Check whether write_partition() raises a NotImplementedError if provided with an unsupported object."""
-        with self.assertRaises(NotImplementedError):
-            with suppress_type_checks() if "typeguard" in sys.modules else nullcontext():
-                output_path = self.temp_dir / "partition"
-                df = pd.DataFrame(self.raw_data).to_numpy()
-                parquet.write_partition([df, df], output_path, ["dataset_1", "dataset_2"])
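
After this series, write_file() and write_partition() accept plain pandas
DataFrames only, so any other container (sparse matrix, anndata object, numpy
array) has to be made columnar by the caller before writing. A minimal usage
sketch against the module as it stands after PATCH 2/2 follows; the scratch
directory and column labels are illustrative, not taken from the repository.

import tempfile
from pathlib import Path

import pandas as pd
import scipy.sparse

from spectrum_io.file import parquet

# Illustrative scratch directory; any writable path works.
out_dir = Path(tempfile.mkdtemp())

# Plain DataFrame: written directly, exactly as the tests above do.
df = pd.DataFrame({"scan_number": [1, 234], "sequence": ["SVFLTFLR", "KTSQIFLAK"]})
parquet.write_file(df, out_dir / "table.parquet")

# Sparse matrix: the conversion now happens at the call site, e.g. via pandas'
# sparse accessor, densified before serialization.
mat = scipy.sparse.random(3, 4, density=0.5, format="csr")
mat_df = pd.DataFrame.sparse.from_spmatrix(mat, columns=[f"col_{i}" for i in range(4)])
parquet.write_file(mat_df.sparse.to_dense(), out_dir / "matrix.parquet")

# Partitioned dataset keyed by name, mirroring test_read_write_partition above.
parquet.write_partition([df, df], out_dir / "partition", ["dataset_1", "dataset_2"])
round_trip = parquet.read_partition(out_dir / "partition", "dataset_1")
pd.testing.assert_frame_equal(round_trip, df)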