From 61e5f84cc495a999e8607ea976aebadb17ae715b Mon Sep 17 00:00:00 2001 From: Henning Janssen Date: Tue, 13 Sep 2022 10:44:26 +0200 Subject: [PATCH] Fix for storing dataframes with no data and only columns (#3) Fixes #2 The pytables library does not store anything for an empty dataframe, and read_hdf raises an exception in this case. To fix this, we check beforehand whether the HDF5 file is empty and, if so, create the dataframe from the columns attribute. --- aiida_dataframe/data/dataframe.py | 12 +++++++++--- tests/test_data.py | 22 ++++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/aiida_dataframe/data/dataframe.py b/aiida_dataframe/data/dataframe.py index d5ef4c1..9e6a83b 100644 --- a/aiida_dataframe/data/dataframe.py +++ b/aiida_dataframe/data/dataframe.py @@ -62,8 +62,8 @@ def _update_dataframe(self, df: pd.DataFrame, filename: str | None = None) -> No self.set_file(file, filename=filename) self.set_attribute("_pandas_data_hash", self._hash_dataframe(df)) - self.set_attribute("index", list(self.df.index)) - self.set_attribute("columns", list(self.df.columns.to_flat_index())) + self.set_attribute("index", list(df.index)) + self.set_attribute("columns", list(df.columns.to_flat_index())) @staticmethod def _hash_dataframe(df): @@ -82,7 +82,13 @@ def _get_dataframe_from_repo(self) -> pd.DataFrame: # Copy the content of source to target in chunks shutil.copyfileobj(file, temp_handle) # type: ignore[arg-type] - return pd.read_hdf(Path(td) / self.filename) + # Workaround for empty dataframe + with pd.HDFStore( + Path(td) / self.filename, mode="r", errors="strict" + ) as store: + if len(store.groups()) == 0: + return pd.DataFrame([], columns=self.get_attribute("columns")) + return pd.read_hdf(store) def _get_dataframe(self) -> pd.DataFrame: """ diff --git a/tests/test_data.py b/tests/test_data.py index a87d3b4..d9d03e7 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -408,3 +408,25 @@ def test_setitem_modification(entry_point): loaded = 
load_node(node.pk) assert loaded is not node assert_frame_equal(loaded.df, df_changed) + + +@pytest.mark.parametrize( + "entry_point", + ("dataframe.frame",), +) +def test_empty_dataframe(entry_point): + """ + Test that storing an empty dataframe works + """ + + PandasFrameData = DataFactory(entry_point) + + # Example from pandas Docs + df = pd.DataFrame([], columns=["A", "B"]) + + node = PandasFrameData(df) + node.store() + + loaded = load_node(node.pk) + assert loaded is not node + assert_frame_equal(loaded.df, df)