41 changes: 38 additions & 3 deletions featuretools/computational_backends/feature_set_calculator.py
@@ -506,7 +506,21 @@ def _calculate_transform_features(
# apply the function to the relevant dataframe slice and add the
# feature row to the results dataframe.
if f.primitive.uses_calc_time:
values = feature_func(*column_data, time=self.time_last)
# If time-based, set the computed values to NaN for rows with a missing time index
time_index = self.entityset[self.feature_set.target_df_name].ww.time_index
if time_index is not None and time_index in frame.columns:
missing_time_mask = frame[time_index].isna()
values = feature_func(*column_data, time=self.time_last)
# Set to NaN where time index is missing
if isinstance(values, pd.Series):
values = values.copy()
values[missing_time_mask] = np.nan
else:
# If not a Series, convert to Series for masking
values = pd.Series(values, index=frame.index)
values[missing_time_mask] = np.nan
else:
values = feature_func(*column_data, time=self.time_last)
else:
values = feature_func(*column_data)

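To make the masking step above concrete, here is a small standalone sketch (not the calculator's real code; the column names and data are invented) of overwriting a transform primitive's output with NaN wherever the dataframe's time index is NaT:

```python
import numpy as np
import pandas as pd

# Hypothetical slice of a target dataframe whose time index has a missing entry.
frame = pd.DataFrame(
    {
        "value": [10.0, 20.0, 30.0],
        "time": [pd.Timestamp("2020-01-01"), pd.NaT, pd.Timestamp("2020-01-03")],
    }
)

# Stand-in for feature_func(*column_data, time=...): any per-row output
# aligned to frame.index is masked the same way.
values = frame["value"] * 2

# Rows with a missing time index get NaN, mirroring the branch above.
missing_time_mask = frame["time"].isna()
values = values.copy()
values[missing_time_mask] = np.nan

print(values.tolist())  # [20.0, nan, 60.0]
```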
@@ -734,15 +748,15 @@ def last_n(df):
# column twice, wrap it in a partial to avoid
# duplicate functions
funcname = str(id(func))
if "{}-{}".format(column_id, funcname) in agg_rename:
if f"{column_id}-{funcname}" in agg_rename:
func = partial(func)
funcname = str(id(func))

func.__name__ = funcname

to_agg[column_id].append(func)
# this is used below to rename columns that pandas names for us
agg_rename["{}-{}".format(column_id, funcname)] = f.get_name()
agg_rename[f"{column_id}-{funcname}"] = f.get_name()
continue

to_apply.add(f)
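For context on the dedup logic shown above: pandas keys aggregation outputs by the callable's `__name__`, so registering the same primitive twice for one column would collide in both the agg spec and the `agg_rename` map; wrapping the function in `functools.partial` creates a new object with a new `id()`, and partial objects accept attribute assignment, so each copy can carry its own name. A rough standalone sketch of the idea, with made-up data and feature names:

```python
from functools import partial

import pandas as pd

def total(s):
    return s.sum()

df = pd.DataFrame({"group": ["a", "a", "b"], "value": [1.0, 2.0, 3.0]})

# Two distinct callables built from the same function, each named by its id(),
# so both aggregations survive and can be renamed afterwards.
wrapped = partial(total)
total.__name__ = str(id(total))
wrapped.__name__ = str(id(wrapped))

agg_rename = {
    f"value-{total.__name__}": "SUM(value)",
    f"value-{wrapped.__name__}": "SUM(value) [duplicate]",
}

result = df.groupby("group").agg({"value": [total, wrapped]})
result.columns = [agg_rename[f"{col}-{name}"] for col, name in result.columns]
print(result)
```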
@@ -761,6 +775,16 @@ def last_n(df):
sort=False,
group_keys=False,
).apply(wrap)
# Set to NaN for time-based features where time index is missing
for f in to_apply:
if f.primitive.uses_calc_time:
time_index = self.entityset[self.feature_set.target_df_name].ww.time_index
if time_index is not None and time_index in frame.columns:
missing_time_mask = frame[time_index].isna()
for name in f.get_feature_names():
if name in to_merge.columns:
to_merge[name] = to_merge[name].copy()
to_merge.loc[missing_time_mask, name] = np.nan
frame = pd.merge(
left=frame,
right=to_merge,
@@ -798,6 +822,17 @@ def last_n(df):
)
to_merge.index = to_merge.index.astype(object).astype(categories)

# Set to NaN for time-based features where time index is missing
for f in features:
if f.primitive.uses_calc_time:
time_index = self.entityset[self.feature_set.target_df_name].ww.time_index
if time_index is not None and time_index in frame.columns:
missing_time_mask = frame[time_index].isna()
for name in f.get_feature_names():
if name in to_merge.columns:
to_merge[name] = to_merge[name].copy()
to_merge.loc[missing_time_mask, name] = np.nan

frame = pd.merge(
left=frame,
right=to_merge,
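Both aggregation paths above follow the same pattern: compute the aggregated values into a separate `to_merge` frame, null out time-based outputs for rows whose time index is missing, then merge the result back onto `frame`. A minimal sketch of that pattern (the column names and values are invented, and the real merge keys differ):

```python
import numpy as np
import pandas as pd

frame = pd.DataFrame(
    {
        "time": [pd.Timestamp("2020-01-01"), pd.NaT, pd.Timestamp("2020-01-03")],
        "value": [1.0, 2.0, 3.0],
    }
)

# Stand-in for the aggregated feature values produced by groupby/apply.
to_merge = pd.DataFrame({"time_feature": [5.0, 6.0, 7.0]}, index=frame.index)

# Null out the time-based output wherever the time index is missing.
missing_time_mask = frame["time"].isna()
to_merge.loc[missing_time_mask, "time_feature"] = np.nan

frame = pd.merge(
    left=frame, right=to_merge, left_index=True, right_index=True, how="left"
)
print(frame)
```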
23 changes: 19 additions & 4 deletions featuretools/entityset/entityset.py
@@ -1416,16 +1416,30 @@ def _handle_time(
Filter a dataframe for all instances before time_last.
If the dataframe does not have a time index, return the original
dataframe.
Rows with a missing time index (NaN/NaT) are retained so that non-time-based features can still be computed for them.
"""

schema = self[dataframe_name].ww.schema
if schema.time_index:
df_empty = df.empty
if time_last is not None and not df_empty:
# Identify rows with missing time index
missing_time_mask = df[schema.time_index].isna()
if missing_time_mask.any():
warnings.warn(
f"DataFrame '{dataframe_name}' contains rows with a missing time index. "
"Time-based features will be NaN for these rows.",
UserWarning,
)
# Only filter rows with valid time index
valid_time_mask = ~missing_time_mask
if include_cutoff_time:
df = df[df[schema.time_index] <= time_last]
time_mask = df[schema.time_index] <= time_last
else:
df = df[df[schema.time_index] < time_last]
time_mask = df[schema.time_index] < time_last
# Combine: keep rows with missing time index, and rows with valid time index that pass the filter
combined_mask = missing_time_mask | (valid_time_mask & time_mask)
df = df[combined_mask]
if training_window is not None:
training_window = _check_timedelta(training_window)
if include_cutoff_time:
@@ -1442,9 +1456,10 @@ def _handle_time(
else:
warnings.warn(
"Using training_window but last_time_index is "
"not set for dataframe %s" % (dataframe_name),
f"not set for dataframe {dataframe_name}",
)

# Again, keep rows with missing time index
mask = mask | df[schema.time_index].isna()
df = df[mask]

secondary_time_indexes = schema.metadata.get("secondary_time_index") or {}
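The behavioural point of the `_handle_time` changes is that NaT never satisfies a `<`/`<=` comparison, so the previous single-mask filter silently dropped rows with a missing time index; the combined mask keeps them while still applying the cutoff to rows that do have a time. A small self-contained illustration with made-up data:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "time": pd.to_datetime(["2020-01-01", None, "2020-01-03", "2020-01-05"]),
    }
)
time_last = pd.Timestamp("2020-01-04")

missing_time_mask = df["time"].isna()
time_mask = df["time"] <= time_last  # NaT compares as False here

# Old behaviour: df[time_mask] drops both the NaT row and the too-late row.
# New behaviour: keep the NaT row so non-time-based features can still be computed.
combined_mask = missing_time_mask | (~missing_time_mask & time_mask)
print(df[combined_mask]["id"].tolist())  # [1, 2, 3]
```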
@@ -38,6 +38,7 @@
Sum,
TimeSinceLast,
Trend,
TimeSincePrevious,
)
from featuretools.primitives.base import AggregationPrimitive
from featuretools.primitives.standard.aggregation.num_unique import NumUnique
@@ -1202,3 +1203,37 @@ def test_nunique_nested_with_agg_bug(es):
df = calculator.run(np.array([0]))

assert df.iloc[0, 0].round(4) == 1.6667


def test_missing_time_index_rows():
# Create a simple DataFrame whose time index contains one missing (NaT) value
df = pd.DataFrame({
'id': [1, 2, 3, 4],
'value': [10, 20, 30, 40],
'time': [datetime(2020, 1, 1), pd.NaT, datetime(2020, 1, 3), datetime(2020, 1, 4)]
})
es = EntitySet()
es.add_dataframe(
dataframe_name='test',
dataframe=df,
index='id',
time_index='time',
)

# Non-time-based feature
f_value = Feature(es['test'].ww['value'])
# Time-based feature (transform primitive, base is time column)
f_time_since = Feature(
es['test'].ww['time'],
primitive=TimeSincePrevious
)

feature_matrix = calculate_feature_matrix([f_value, f_time_since], es)
feature_matrix_sorted = feature_matrix.sort_index()

# Check that non-time-based feature is computed for all rows
assert feature_matrix_sorted['value'].tolist() == [10, 20, 30, 40]
# The time-based feature should be NaN for the first row (no previous row)
# and for the row with the missing time index (id=2, NaT in time)
is_nan = feature_matrix_sorted[f_time_since.get_name()].isna().tolist()
assert is_nan == [True, True, False, False]
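For completeness, a rough sketch of how the new behaviour would look from the user side when a cutoff time is involved; it mirrors the test above but adds a `cutoff_time`, and the comments describe expectations based on this PR rather than anything the test asserts:

```python
from datetime import datetime

import pandas as pd

import featuretools as ft
from featuretools.primitives import TimeSincePrevious

df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "value": [10, 20, 30, 40],
        "time": [datetime(2020, 1, 1), pd.NaT, datetime(2020, 1, 3), datetime(2020, 1, 4)],
    }
)

es = ft.EntitySet()
es.add_dataframe(dataframe_name="test", dataframe=df, index="id", time_index="time")

f_value = ft.Feature(es["test"].ww["value"])
f_time_since = ft.Feature(es["test"].ww["time"], primitive=TimeSincePrevious)

# With the modified _handle_time, the NaT row (id=2) is no longer dropped by the
# cutoff filter: its non-time-based feature is still computed, while its
# time-based feature comes back as NaN.
fm = ft.calculate_feature_matrix(
    [f_value, f_time_since],
    es,
    cutoff_time=pd.Timestamp("2020-01-03"),
)
print(fm)
```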