Skip to content

Commit

Permalink
Fix for storing dataframes with no data and only columns (#3)
Browse files Browse the repository at this point in the history
Fixes #2 

The pytables library does not store anything for an empty dataframe,
and read_hdf raises an exception in this case. To fix this, we check beforehand
whether the HDF5 file is empty and, if so, create the dataframe
from the columns attribute.
  • Loading branch information
janssenhenning authored Sep 13, 2022
1 parent cf20084 commit 61e5f84
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 3 deletions.
12 changes: 9 additions & 3 deletions aiida_dataframe/data/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ def _update_dataframe(self, df: pd.DataFrame, filename: str | None = None) -> No
self.set_file(file, filename=filename)

self.set_attribute("_pandas_data_hash", self._hash_dataframe(df))
self.set_attribute("index", list(self.df.index))
self.set_attribute("columns", list(self.df.columns.to_flat_index()))
self.set_attribute("index", list(df.index))
self.set_attribute("columns", list(df.columns.to_flat_index()))

@staticmethod
def _hash_dataframe(df):
Expand All @@ -82,7 +82,13 @@ def _get_dataframe_from_repo(self) -> pd.DataFrame:
# Copy the content of source to target in chunks
shutil.copyfileobj(file, temp_handle) # type: ignore[arg-type]

return pd.read_hdf(Path(td) / self.filename)
# Workaround for empty dataframe
with pd.HDFStore(
Path(td) / self.filename, mode="r", errors="strict"
) as store:
if len(store.groups()) == 0:
return pd.DataFrame([], columns=self.get_attribute("columns"))
return pd.read_hdf(store)

def _get_dataframe(self) -> pd.DataFrame:
"""
Expand Down
22 changes: 22 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,3 +408,25 @@ def test_setitem_modification(entry_point):
loaded = load_node(node.pk)
assert loaded is not node
assert_frame_equal(loaded.df, df_changed)


@pytest.mark.parametrize(
    "entry_point",
    ("dataframe.frame",),
)
def test_empty_dataframe(entry_point):
    """
    Check that a dataframe with columns but no rows survives a store/load round trip
    """
    node_cls = DataFactory(entry_point)

    # Frame with two named columns and zero rows
    empty_frame = pd.DataFrame([], columns=["A", "B"])

    stored = node_cls(empty_frame)
    stored.store()

    # Reload by primary key: must be a distinct node object whose dataframe
    # compares equal to the one that was stored.
    reloaded = load_node(stored.pk)
    assert reloaded is not stored
    assert_frame_equal(reloaded.df, empty_frame)

0 comments on commit 61e5f84

Please sign in to comment.