From 348c1d81753cf1f86d9d2c24bdb51b12113e36ad Mon Sep 17 00:00:00 2001
From: Julius Schlensok
Date: Tue, 6 Aug 2024 14:34:57 +0000
Subject: [PATCH 1/2] chore(tests): switch Parquet tests from pytest to unittest

---
 tests/unit_tests/test_parquet.py | 102 +++++++++++++++----------------
 1 file changed, 49 insertions(+), 53 deletions(-)

diff --git a/tests/unit_tests/test_parquet.py b/tests/unit_tests/test_parquet.py
index 61e9cf9..2b6fb82 100644
--- a/tests/unit_tests/test_parquet.py
+++ b/tests/unit_tests/test_parquet.py
@@ -1,12 +1,13 @@
 import shutil
 import sys
+import tempfile
+import unittest
 from contextlib import nullcontext
 from pathlib import Path
 
 import pandas as pd
 import pyarrow as pa
 import pyarrow.parquet as pq
-import pytest
 import scipy
 
 if "typeguard" in sys.modules:
@@ -15,87 +16,82 @@
 from spectrum_io.file import parquet
 
 
-class TestParquet:
+class TestParquet(unittest.TestCase):
     """Test class to check Parquet file I/O."""
 
-    def test_read_file(self, raw_data, tmpdir):
+    def setUp(self):  # noqa: D102
+        # Simple toy MS data containing float, list[float], str, int, and list[int]
+        self.raw_data = {
+            "scan_number": [1, 234, 5678],
+            "intensities": [
+                [4e-5, 0.0, -1.0, 0.0, 0.0, -1.0, 0.03, 0.0, -1.0, 0.4],
+                [0.3, 0.0, -1.0, 1.0, 0.0, -1.0, 0.4, 0.0, -1.0, 0.05],
+                [0.04, 0.0, 0.0, 0.0, 0.0, 0.0, 2e-3, 0.0, 0.0, 0.13],
+            ],
+            "sequence": ["SVFLTFLR", "KTSQIFLAK", "SPVGRVTPKEWR"],
+            "precursor_charge_onehot": [
+                [0, 1, 0, 0, 0, 0],
+                [0, 1, 0, 0, 0, 0],
+                [0, 0, 1, 0, 0, 0],
+            ],
+            "collision_energy_normed": [0.250827308624, 0.288798207462, 0.2887064038764],
+        }
+        self.temp_dir = Path(tempfile.mkdtemp())
+
+    def tearDown(self):  # noqa: D102
+        shutil.rmtree(self.temp_dir)
+
+    def test_read_file(self):
         """Test read operation for a single dataset."""
-        output_path = Path(tmpdir / "table.parquet")
-        pq.write_table(pa.Table.from_pydict(raw_data), output_path)
+        output_path = self.temp_dir / "table.parquet"
+        pq.write_table(pa.Table.from_pydict(self.raw_data), output_path)
         df = parquet.read_file(output_path)
-        output_path.unlink()
-        pd.testing.assert_frame_equal(df, pd.DataFrame(raw_data))
+        pd.testing.assert_frame_equal(df, pd.DataFrame(self.raw_data))
 
-    def test_write_file(self, raw_data, tmpdir):
+    def test_write_file(self):
        """Check write operation for a single dataset."""
-        output_path = Path(tmpdir / "table.parquet")
-        df = pd.DataFrame(raw_data)
+        output_path = self.temp_dir / "table.parquet"
+        df = pd.DataFrame(self.raw_data)
         parquet.write_file(df, output_path)
         pd.testing.assert_frame_equal(df, pd.read_parquet(output_path))
-        output_path.unlink()
 
-    def test_read_write_partition(self, raw_data, tmpdir):
+    def test_read_write_partition(self):
         """Check whether data is unmodified after being written to and then read from a partitioned dataset."""
-        output_path = Path(tmpdir / "partition")
-        df = pd.DataFrame(raw_data)
+        output_path = self.temp_dir / "partition"
+        df = pd.DataFrame(self.raw_data)
         parquet.write_partition([df, df], output_path, ["dataset_1", "dataset_2"])
         read_df = parquet.read_partition(output_path, "dataset_1")
-        shutil.rmtree(output_path)
         pd.testing.assert_frame_equal(read_df, df)
 
-    def test_read_write_partition_integer_key(self, raw_data, tmpdir):
+    def test_read_write_partition_integer_key(self):
         """Check whether Parquet's under-the-hood conversion of string to integer keys is handled seamlessly."""
-        output_path = Path(tmpdir / "partition")
-        df = pd.DataFrame(raw_data)
+        output_path = self.temp_dir / "partition"
+        df = pd.DataFrame(self.raw_data)
         parquet.write_partition([df, df], output_path, ["1", "2"])
         read_df = parquet.read_partition(output_path, "1")
-        shutil.rmtree(output_path)
         pd.testing.assert_frame_equal(read_df, df)
 
-    def test_modify_partition(self, raw_data, tmpdir):
+    def test_modify_partition(self):
         """Check whether file content stays the same when writing new data to the same partitioned directory."""
-        output_path = Path(tmpdir / "partition")
-        df = pd.DataFrame(raw_data)
+        output_path = self.temp_dir / "partition"
+        df = pd.DataFrame(self.raw_data)
         parquet.write_partition([df, df], output_path, ["1", "2"])
         parquet.write_partition([df, df, df], output_path, ["1", "2", "3"])
         read_df = parquet.read_partition(output_path, "2")
-        shutil.rmtree(output_path)
         pd.testing.assert_frame_equal(read_df, df)
 
-    def test_write_not_implemented(self, raw_data, tmpdir):
+    def test_write_not_implemented(self):
         """Check whether write_file() raises a NotImplementedError if provided with an unsupported object."""
-        with pytest.raises(NotImplementedError):
+        with self.assertRaises(NotImplementedError):
             with suppress_type_checks() if "typeguard" in sys.modules else nullcontext():
-                output_path = Path(tmpdir / "table.parquet")
-                df = pd.DataFrame(raw_data).to_numpy()
+                output_path = self.temp_dir / "table.parquet"
+                df = pd.DataFrame(self.raw_data).to_numpy()
                 parquet.write_file(df, output_path)
-                output_path.unlink()
 
-    def test_read_write_partition_not_implemented(self, raw_data, tmpdir):
+    def test_read_write_partition_not_implemented(self):
         """Check whether write_partition() raises a NotImplementedError if provided with an unsupported object."""
-        with pytest.raises(NotImplementedError):
+        with self.assertRaises(NotImplementedError):
             with suppress_type_checks() if "typeguard" in sys.modules else nullcontext():
-                output_path = Path(tmpdir / "partition")
-                df = pd.DataFrame(raw_data).to_numpy()
+                output_path = self.temp_dir / "partition"
+                df = pd.DataFrame(self.raw_data).to_numpy()
                 parquet.write_partition([df, df], output_path, ["dataset_1", "dataset_2"])
-                output_path.unlink()
-
-
-@pytest.fixture
-def raw_data():
-    """Simple toy MS data containing float, list[float], str, int, and list[int]."""
-    return {
-        "scan_number": [1, 234, 5678],
-        "intensities": [
-            [4e-5, 0.0, -1.0, 0.0, 0.0, -1.0, 0.03, 0.0, -1.0, 0.4],
-            [0.3, 0.0, -1.0, 1.0, 0.0, -1.0, 0.4, 0.0, -1.0, 0.05],
-            [0.04, 0.0, 0.0, 0.0, 0.0, 0.0, 2e-3, 0.0, 0.0, 0.13],
-        ],
-        "sequence": ["SVFLTFLR", "KTSQIFLAK", "SPVGRVTPKEWR"],
-        "precursor_charge_onehot": [
-            [0, 1, 0, 0, 0, 0],
-            [0, 1, 0, 0, 0, 0],
-            [0, 0, 1, 0, 0, 0],
-        ],
-        "collision_energy_normed": [0.250827308624, 0.288798207462, 0.2887064038764],
-    }

From 1b9194d2590f852e0506a59a64f536ab49bf121a Mon Sep 17 00:00:00 2001
From: Julius Schlensok
Date: Tue, 6 Aug 2024 14:35:12 +0000
Subject: [PATCH 2/2] refactor(parquet IO): drop unnecessary file format support

Remove TODOs for sparse matrices, anndata matrices and HuggingFace datasets
as their conversion from/to columnar format for serialization as Parquet
files is context-dependent.
Correspondingly remove tests for NotImplementedErrors as they are now
redundant.
---
 spectrum_io/file/parquet.py      | 27 ++++++---------------------
 tests/unit_tests/test_parquet.py | 16 ----------------
 2 files changed, 6 insertions(+), 37 deletions(-)

diff --git a/spectrum_io/file/parquet.py b/spectrum_io/file/parquet.py
index 8fc164f..b3837de 100644
--- a/spectrum_io/file/parquet.py
+++ b/spectrum_io/file/parquet.py
@@ -7,12 +7,7 @@
 import pyarrow.parquet as pq
 import scipy
 
-# TODO add sparse matrix / anndata support
-# TODO add speed benchmarks
-# TODO add support for HuggingFace datasets API
-
 Pathlike = Union[Path, str]
-Dataset = Union[pd.DataFrame, scipy.sparse.spmatrix]
 
 logger = logging.getLogger(__name__)
 
@@ -53,36 +48,26 @@ def read_partition(path: Pathlike, dataset_name: str) -> pd.DataFrame:
     return df
 
 
-def write_file(data: Dataset, path: Pathlike) -> None:
+def write_file(data: pd.DataFrame, path: Pathlike) -> None:
     """Writes a single DataFrame or matrix to a Parquet file.
 
     :param data: Data to store
     :param path: Path to write the Parquet file to
-
-    :raises NotImplementedError: if anything else but a Pandas DataFrame is used as the dataset
     """
-    if isinstance(data, pd.DataFrame):
-        data.to_parquet(path)
-    else:
-        raise NotImplementedError
+    data.to_parquet(path)
 
 
-def write_partition(datasets: List[Dataset], path: Pathlike, dataset_names: List[str]) -> None:
+def write_partition(datasets: List[pd.DataFrame], path: Pathlike, dataset_names: List[str]) -> None:
     """
-    Write several datasets to a Parquet dataset as a directory containing subdirectories partinioned by dataset name.
+    Write several datasets to a Parquet dataset as a directory containing subdirectories partitioned by dataset name.
 
     :param datasets: Datasets to write
     :param path: Root path to write the partitioned dataset to
     :param dataset_names: Names to assign to the datasets for retrieval. Careful: If all of these are strings of
         ints, Parquet will convert them to raw integers!
-
-    :raises NotImplementedError: if anything else but a Pandas DataFrame is used as the dataset
     """
-    if all(isinstance(x, pd.DataFrame) for x in datasets):
-        df = pd.concat([dataset.assign(dataset=name) for dataset, name in zip(datasets, dataset_names)])
-        table = pa.Table.from_pandas(df)
-    else:
-        raise NotImplementedError
+    df = pd.concat([dataset.assign(dataset=name) for dataset, name in zip(datasets, dataset_names)])
+    table = pa.Table.from_pandas(df)
 
     if isinstance(path, str):
         path = Path(path)
diff --git a/tests/unit_tests/test_parquet.py b/tests/unit_tests/test_parquet.py
index 2b6fb82..b89f7c5 100644
--- a/tests/unit_tests/test_parquet.py
+++ b/tests/unit_tests/test_parquet.py
@@ -79,19 +79,3 @@ def test_modify_partition(self):
         parquet.write_partition([df, df, df], output_path, ["1", "2", "3"])
         read_df = parquet.read_partition(output_path, "2")
         pd.testing.assert_frame_equal(read_df, df)
-
-    def test_write_not_implemented(self):
-        """Check whether write_file() raises a NotImplementedError if provided with an unsupported object."""
-        with self.assertRaises(NotImplementedError):
-            with suppress_type_checks() if "typeguard" in sys.modules else nullcontext():
-                output_path = self.temp_dir / "table.parquet"
-                df = pd.DataFrame(self.raw_data).to_numpy()
-                parquet.write_file(df, output_path)
-
-    def test_read_write_partition_not_implemented(self):
-        """Check whether write_partition() raises a NotImplementedError if provided with an unsupported object."""
-        with self.assertRaises(NotImplementedError):
-            with suppress_type_checks() if "typeguard" in sys.modules else nullcontext():
-                output_path = self.temp_dir / "partition"
-                df = pd.DataFrame(self.raw_data).to_numpy()
-                parquet.write_partition([df, df], output_path, ["dataset_1", "dataset_2"])
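
After this series, write_file() and write_partition() accept plain pandas
DataFrames only, so any other container (sparse matrix, anndata object, numpy
array) has to be made columnar by the caller before writing. A minimal usage
sketch against the module as it stands after PATCH 2/2 follows; the scratch
directory and column labels are illustrative, not taken from the repository.

import tempfile
from pathlib import Path

import pandas as pd
import scipy.sparse

from spectrum_io.file import parquet

# Illustrative scratch directory; any writable path works.
out_dir = Path(tempfile.mkdtemp())

# Plain DataFrame: written directly, exactly as the tests above do.
df = pd.DataFrame({"scan_number": [1, 234], "sequence": ["SVFLTFLR", "KTSQIFLAK"]})
parquet.write_file(df, out_dir / "table.parquet")

# Sparse matrix: the conversion now happens at the call site, e.g. via pandas'
# sparse accessor, densified before serialization.
mat = scipy.sparse.random(3, 4, density=0.5, format="csr")
mat_df = pd.DataFrame.sparse.from_spmatrix(mat, columns=[f"col_{i}" for i in range(4)])
parquet.write_file(mat_df.sparse.to_dense(), out_dir / "matrix.parquet")

# Partitioned dataset keyed by name, mirroring test_read_write_partition above.
parquet.write_partition([df, df], out_dir / "partition", ["dataset_1", "dataset_2"])
round_trip = parquet.read_partition(out_dir / "partition", "dataset_1")
pd.testing.assert_frame_equal(round_trip, df)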