41 changes: 38 additions & 3 deletions featuretools/computational_backends/feature_set_calculator.py
@@ -506,7 +506,21 @@ def _calculate_transform_features(
# apply the function to the relevant dataframe slice and add the
# feature row to the results dataframe.
if f.primitive.uses_calc_time:
values = feature_func(*column_data, time=self.time_last)
# If time-based, set the computed values to NaN for rows with a missing time index
time_index = self.entityset[self.feature_set.target_df_name].ww.time_index
if time_index is not None and time_index in frame.columns:
missing_time_mask = frame[time_index].isna()
values = feature_func(*column_data, time=self.time_last)
# Set to NaN where time index is missing
if isinstance(values, pd.Series):
values = values.copy()
values[missing_time_mask] = np.nan
else:
# If not a Series, convert to Series for masking
values = pd.Series(values, index=frame.index)
values[missing_time_mask] = np.nan
else:
values = feature_func(*column_data, time=self.time_last)
else:
values = feature_func(*column_data)

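To make the masking step above concrete, here is a small standalone sketch (not the calculator's real code; the column names and data are invented) of overwriting a transform primitive's output with NaN wherever the dataframe's time index is NaT:

```python
import numpy as np
import pandas as pd

# Hypothetical slice of a target dataframe whose time index has a missing entry.
frame = pd.DataFrame(
    {
        "value": [10.0, 20.0, 30.0],
        "time": [pd.Timestamp("2020-01-01"), pd.NaT, pd.Timestamp("2020-01-03")],
    }
)

# Stand-in for feature_func(*column_data, time=...): any per-row output
# aligned to frame.index is masked the same way.
values = frame["value"] * 2

# Rows with a missing time index get NaN, mirroring the branch above.
missing_time_mask = frame["time"].isna()
values = values.copy()
values[missing_time_mask] = np.nan

print(values.tolist())  # [20.0, nan, 60.0]
```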
@@ -734,15 +748,15 @@ def last_n(df):
# column twice, wrap it in a partial to avoid
# duplicate functions
funcname = str(id(func))
if "{}-{}".format(column_id, funcname) in agg_rename:
if f"{column_id}-{funcname}" in agg_rename:
func = partial(func)
funcname = str(id(func))

func.__name__ = funcname

to_agg[column_id].append(func)
# this is used below to rename columns that pandas names for us
agg_rename["{}-{}".format(column_id, funcname)] = f.get_name()
agg_rename[f"{column_id}-{funcname}"] = f.get_name()
continue

to_apply.add(f)
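For context on the dedup logic shown above: pandas keys aggregation outputs by the callable's `__name__`, so registering the same primitive twice for one column would collide in both the agg spec and the `agg_rename` map; wrapping the function in `functools.partial` creates a new object with a new `id()`, and partial objects accept attribute assignment, so each copy can carry its own name. A rough standalone sketch of the idea, with made-up data and feature names:

```python
from functools import partial

import pandas as pd

def total(s):
    return s.sum()

df = pd.DataFrame({"group": ["a", "a", "b"], "value": [1.0, 2.0, 3.0]})

# Two distinct callables built from the same function, each named by its id(),
# so both aggregations survive and can be renamed afterwards.
wrapped = partial(total)
total.__name__ = str(id(total))
wrapped.__name__ = str(id(wrapped))

agg_rename = {
    f"value-{total.__name__}": "SUM(value)",
    f"value-{wrapped.__name__}": "SUM(value) [duplicate]",
}

result = df.groupby("group").agg({"value": [total, wrapped]})
result.columns = [agg_rename[f"{col}-{name}"] for col, name in result.columns]
print(result)
```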
@@ -761,6 +775,16 @@ def last_n(df):
sort=False,
group_keys=False,
).apply(wrap)
# Set to NaN for time-based features where time index is missing
for f in to_apply:
if f.primitive.uses_calc_time:
time_index = self.entityset[self.feature_set.target_df_name].ww.time_index
if time_index is not None and time_index in frame.columns:
missing_time_mask = frame[time_index].isna()
for name in f.get_feature_names():
if name in to_merge.columns:
to_merge[name] = to_merge[name].copy()
to_merge.loc[missing_time_mask, name] = np.nan
frame = pd.merge(
left=frame,
right=to_merge,
@@ -798,6 +822,17 @@ def last_n(df):
)
to_merge.index = to_merge.index.astype(object).astype(categories)

# Set to NaN for time-based features where time index is missing
for f in features:
if f.primitive.uses_calc_time:
time_index = self.entityset[self.feature_set.target_df_name].ww.time_index
if time_index is not None and time_index in frame.columns:
missing_time_mask = frame[time_index].isna()
for name in f.get_feature_names():
if name in to_merge.columns:
to_merge[name] = to_merge[name].copy()
to_merge.loc[missing_time_mask, name] = np.nan

frame = pd.merge(
left=frame,
right=to_merge,
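Both aggregation paths above follow the same pattern: compute the aggregated values into a separate `to_merge` frame, null out time-based outputs for rows whose time index is missing, then merge the result back onto `frame`. A minimal sketch of that pattern (the column names and values are invented, and the real merge keys differ):

```python
import numpy as np
import pandas as pd

frame = pd.DataFrame(
    {
        "time": [pd.Timestamp("2020-01-01"), pd.NaT, pd.Timestamp("2020-01-03")],
        "value": [1.0, 2.0, 3.0],
    }
)

# Stand-in for the aggregated feature values produced by groupby/apply.
to_merge = pd.DataFrame({"time_feature": [5.0, 6.0, 7.0]}, index=frame.index)

# Null out the time-based output wherever the time index is missing.
missing_time_mask = frame["time"].isna()
to_merge.loc[missing_time_mask, "time_feature"] = np.nan

frame = pd.merge(
    left=frame, right=to_merge, left_index=True, right_index=True, how="left"
)
print(frame)
```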
23 changes: 19 additions & 4 deletions featuretools/entityset/entityset.py
@@ -1416,16 +1416,30 @@ def _handle_time(
Filter a dataframe for all instances before time_last.
If the dataframe does not have a time index, return the original
dataframe.
Rows with a missing time index (NaN/NaT) are retained so that non-time-based features can still be computed for them.
"""

schema = self[dataframe_name].ww.schema
if schema.time_index:
df_empty = df.empty
if time_last is not None and not df_empty:
# Identify rows with missing time index
missing_time_mask = df[schema.time_index].isna()
if missing_time_mask.any():
warnings.warn(
f"DataFrame '{dataframe_name}' contains rows with a missing time index. "
"Time-based features will be NaN for these rows.",
UserWarning,
)
# Only filter rows with valid time index
valid_time_mask = ~missing_time_mask
if include_cutoff_time:
df = df[df[schema.time_index] <= time_last]
time_mask = df[schema.time_index] <= time_last
else:
df = df[df[schema.time_index] < time_last]
time_mask = df[schema.time_index] < time_last
# Combine: keep rows with missing time index, and rows with valid time index that pass the filter
combined_mask = missing_time_mask | (valid_time_mask & time_mask)
df = df[combined_mask]
if training_window is not None:
training_window = _check_timedelta(training_window)
if include_cutoff_time:
@@ -1442,9 +1456,10 @@ def _handle_time(
else:
warnings.warn(
"Using training_window but last_time_index is "
"not set for dataframe %s" % (dataframe_name),
f"not set for dataframe {dataframe_name}",
)

# Again, keep rows with missing time index
mask = mask | df[schema.time_index].isna()
df = df[mask]

secondary_time_indexes = schema.metadata.get("secondary_time_index") or {}
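The behavioural point of the `_handle_time` changes is that NaT never satisfies a `<`/`<=` comparison, so the previous single-mask filter silently dropped rows with a missing time index; the combined mask keeps them while still applying the cutoff to rows that do have a time. A small self-contained illustration with made-up data:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "time": pd.to_datetime(["2020-01-01", None, "2020-01-03", "2020-01-05"]),
    }
)
time_last = pd.Timestamp("2020-01-04")

missing_time_mask = df["time"].isna()
time_mask = df["time"] <= time_last  # NaT compares as False here

# Old behaviour: df[time_mask] drops both the NaT row and the too-late row.
# New behaviour: keep the NaT row so non-time-based features can still be computed.
combined_mask = missing_time_mask | (~missing_time_mask & time_mask)
print(df[combined_mask]["id"].tolist())  # [1, 2, 3]
```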
@@ -38,6 +38,7 @@
Sum,
TimeSinceLast,
Trend,
TimeSincePrevious,
)
from featuretools.primitives.base import AggregationPrimitive
from featuretools.primitives.standard.aggregation.num_unique import NumUnique
@@ -1202,3 +1203,37 @@ def test_nunique_nested_with_agg_bug(es):
df = calculator.run(np.array([0]))

assert df.iloc[0, 0].round(4) == 1.6667


def test_missing_time_index_rows():
# Create a simple DataFrame whose time index contains one missing (NaT) value
df = pd.DataFrame({
'id': [1, 2, 3, 4],
'value': [10, 20, 30, 40],
'time': [datetime(2020, 1, 1), pd.NaT, datetime(2020, 1, 3), datetime(2020, 1, 4)]
})
es = EntitySet()
es.add_dataframe(
dataframe_name='test',
dataframe=df,
index='id',
time_index='time',
)

# Non-time-based feature
f_value = Feature(es['test'].ww['value'])
# Time-based feature (transform primitive, base is time column)
f_time_since = Feature(
es['test'].ww['time'],
primitive=TimeSincePrevious
)

feature_matrix = calculate_feature_matrix([f_value, f_time_since], es)
feature_matrix_sorted = feature_matrix.sort_index()

# Check that non-time-based feature is computed for all rows
assert feature_matrix_sorted['value'].tolist() == [10, 20, 30, 40]
# The time-based feature should be NaN for the first row (no previous row)
# and for the row with the missing time index (id=2, NaT in time)
is_nan = feature_matrix_sorted[f_time_since.get_name()].isna().tolist()
assert is_nan == [True, True, False, False]
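For completeness, a rough sketch of how the new behaviour would look from the user side when a cutoff time is involved; it mirrors the test above but adds a `cutoff_time`, and the comments describe expectations based on this PR rather than anything the test asserts:

```python
from datetime import datetime

import pandas as pd

import featuretools as ft
from featuretools.primitives import TimeSincePrevious

df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "value": [10, 20, 30, 40],
        "time": [datetime(2020, 1, 1), pd.NaT, datetime(2020, 1, 3), datetime(2020, 1, 4)],
    }
)

es = ft.EntitySet()
es.add_dataframe(dataframe_name="test", dataframe=df, index="id", time_index="time")

f_value = ft.Feature(es["test"].ww["value"])
f_time_since = ft.Feature(es["test"].ww["time"], primitive=TimeSincePrevious)

# With the modified _handle_time, the NaT row (id=2) is no longer dropped by the
# cutoff filter: its non-time-based feature is still computed, while its
# time-based feature comes back as NaN.
fm = ft.calculate_feature_matrix(
    [f_value, f_time_since],
    es,
    cutoff_time=pd.Timestamp("2020-01-03"),
)
print(fm)
```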