Skip to content

Commit 50032ad

Browse files
Fix: Revert index type to str and handle empty DFs (#32)
1 parent 0229246 commit 50032ad

File tree

4 files changed

+35
-8
lines changed

4 files changed

+35
-8
lines changed

data_validation_framework/task.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -295,14 +295,10 @@ def kwargs(self):
295295
def read_dataset(self):
296296
"""Import the dataset to a :class:`pandas.DataFrame`.
297297
298-
Note that the index column is loaded as a string.
299-
300298
This method can be overridden to load custom data (e.g. GeoDataFrame, etc.).
301299
The dataset should always be loaded from the path given by `self.dataset_df`.
302300
"""
303-
return pd.read_csv(
304-
self.dataset_df, index_col=self.input_index_col, dtype={self.input_index_col: str}
305-
)
301+
return pd.read_csv(self.dataset_df, index_col=self.input_index_col)
306302

307303
def pre_process(self, df, args, kwargs):
308304
"""Method executed before applying the external function."""

data_validation_framework/util.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ def message_worker(progress_bar, message_queue):
9292

9393
def apply_to_df(df, func, *args, nb_processes=None, redirect_stdout=None, **kwargs):
9494
"""Apply a function to df rows using tqdm."""
95+
if df.empty:
96+
return df
9597
nb_jobs = len(df)
9698
if redirect_stdout is None:
9799
redirect_stdout = True

tests/test_task.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1288,9 +1288,7 @@ def check_exception(failed_task, exception): # pylint: disable=unused-variable
12881288
assert not luigi.build([failing_task], local_scheduler=True)
12891289

12901290
assert failed_tasks == [str(failing_task)]
1291-
assert exceptions == [
1292-
str(IndexError("The following index values are duplicated: ['0', '1']"))
1293-
]
1291+
assert exceptions == [str(IndexError("The following index values are duplicated: [0, 1]"))]
12941292

12951293
def test_change_index(self, tmpdir, TestTask):
12961294
"""Test that the process fails if the index is changed by the validation function."""
@@ -1743,6 +1741,27 @@ def inputs(self):
17431741
== redirect_stdout
17441742
)
17451743

1744+
@pytest.fixture
1745+
def empty_dataset_df_path(self, tmpdir):
1746+
"""Create an empty CSV dataset and return its path."""
1747+
dataset_df_path = tmpdir / "dataset.csv"
1748+
base_dataset_df = pd.DataFrame({"a": [], "b": []})
1749+
base_dataset_df.to_csv(dataset_df_path)
1750+
1751+
return str(dataset_df_path)
1752+
1753+
def test_defaults_with_empty(self, TestTask, empty_dataset_df_path, tmpdir):
1754+
"""Test that the empty dataset is properly processed."""
1755+
assert luigi.build(
1756+
[TestTask(dataset_df=empty_dataset_df_path, result_path=str(tmpdir / "out"))],
1757+
local_scheduler=True,
1758+
)
1759+
result = pd.read_csv(tmpdir / "out" / "TestTask" / "report.csv")
1760+
assert result["is_valid"].tolist() == []
1761+
assert result["ret_code"].tolist() == []
1762+
assert result["comment"].tolist() == []
1763+
assert result["exception"].tolist() == []
1764+
17461765

17471766
class TestValidationWorkflow:
17481767
"""Test the data_validation_framework.task.ValidationWorkflow class."""

tests/test_util.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,3 +133,13 @@ def test_apply_to_df(nb_processes, redirect_stdout):
133133
re.match(r" File \"(\/.*?\.[\w:]+)\", line \d+, in _tested_func", exception_lines[3])
134134
is not None
135135
)
136+
137+
# Test with an empty DF
138+
emty_res = util.apply_to_df(
139+
df.loc[[]],
140+
_tested_func,
141+
"val1",
142+
"val2",
143+
)
144+
145+
pd.testing.assert_frame_equal(emty_res, df.loc[[]])

0 commit comments

Comments
 (0)