SNOW-1727330, SNOW-1727332, SNOW-1727334, SNOW-1727335: Add support for DataFrameGroupBy/SeriesGroupBy.bfill/ffill
snowflakedb · Oct 14, 2024 · 81a1d88 · 81a1d88
1 parent 7c22750
commit 81a1d88
Show file tree

Hide file tree

Showing 6 changed files with 449 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -60,6 +60,7 @@
 - Added support for applying Snowpark Python functions (e.g., `sin`) in `Series.map`, `Series.apply`, `DataFrame.apply` and `DataFrame.applymap`.
 - Added support for `np.subtract`, `np.multiply`, `np.divide`, and `np.true_divide`.
 - Added support for tracking usages of `__array_ufunc__`.
+- Added support for `DataFrameGroupBy.bfill`, `SeriesGroupBy.bfill`, `DataFrameGroupBy.ffill`, and `SeriesGroupBy.ffill`.
 
 #### Improvements
 

diff --git a/docs/source/modin/supported/groupby_supported.rst b/docs/source/modin/supported/groupby_supported.rst
@@ -80,7 +80,8 @@ Computations/descriptive stats
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``any``                     | P                               | ``N`` for non-integer/boolean types                |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``bfill``                   | N                               |                                                    |
+| ``bfill``                   | P                               | When GroupBy axis is 1, ``N``;                     |
+|                             |                                 | GroupBy axis = 0 is fully supported.               |
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``corr``                    | N                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
@@ -104,7 +105,8 @@ Computations/descriptive stats
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``diff``                    | N                               |                                                    |
 +-----------------------------+---------------------------------+----------------------------------------------------+
-| ``ffill``                   | N                               |                                                    |
+| ``ffill``                   | P                               | When GroupBy axis is 1, ``N``;                     |
+|                             |                                 | GroupBy axis = 0 is fully supported.               |
 +-----------------------------+---------------------------------+----------------------------------------------------+
 | ``fillna``                  | P                               | GroupBy axis = 0 is supported.                     |
 |                             |                                 | Does not support ``downcast`` parameter            |

diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py
@@ -197,7 +197,97 @@ def skew():
         pass
 
     def ffill():
-        pass
+        """
+        Forward fill the values.
+
+        Parameters
+        ----------
+        limit : int, optional
+            Limit of how many values to fill.
+
+        Returns
+        -------
+        Series or DataFrame
+            Object with missing values filled.
+
+        See also
+        --------
+        Series.ffill
+            Forward fill the missing values in the dataset.
+        DataFrame.ffill
+            Forward fill the missing values in the dataset.
+        Series.fillna
+            Fill NaN values of a Series.
+        DataFrame.fillna
+            Fill NaN values of a DataFrame.
+
+        Examples
+        --------
+        For SeriesGroupBy:
+
+        >>> key = [0, 0, 1, 1]
+        >>> ser = pd.Series([np.nan, 2, 3, np.nan], index=key)
+        >>> ser
+        0    NaN
+        0    2.0
+        1    3.0
+        1    NaN
+        dtype: float64
+        >>> ser.groupby(level=0).ffill()
+        0    NaN
+        0    2.0
+        1    3.0
+        1    3.0
+        dtype: float64
+
+        For DataFrameGroupBy:
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "key": [0, 0, 1, 1, 1],
+        ...         "A": [np.nan, 2, np.nan, 3, np.nan],
+        ...         "B": [2, 3, np.nan, np.nan, np.nan],
+        ...         "C": [np.nan, np.nan, 2, np.nan, np.nan],
+        ...     }
+        ... )
+        >>> df
+           key    A    B    C
+        0    0  NaN  2.0  NaN
+        1    0  2.0  3.0  NaN
+        2    1  NaN  NaN  2.0
+        3    1  3.0  NaN  NaN
+        4    1  NaN  NaN  NaN
+
+        Propagate non-null values forward within each group along columns.
+
+        >>> df.groupby("key").ffill()
+             A    B    C
+        0  NaN  2.0  NaN
+        1  2.0  3.0  NaN
+        2  NaN  NaN  2.0
+        3  3.0  NaN  2.0
+        4  3.0  NaN  2.0
+
+        Propagate non-null values forward within each group along rows.
+
+        >>> df.T.groupby(np.array([0, 0, 1, 1])).ffill().T
+           key    A    B    C
+        0  0.0  0.0  2.0  2.0
+        1  0.0  2.0  3.0  3.0
+        2  1.0  1.0  NaN  2.0
+        3  1.0  3.0  NaN  NaN
+        4  1.0  1.0  NaN  NaN
+
+        Fill at most one consecutive NaN element after each valid value within a group along columns.
+
+        >>> df.groupby("key").ffill(limit=1)
+             A    B    C
+        0  NaN  2.0  NaN
+        1  2.0  3.0  NaN
+        2  NaN  NaN  2.0
+        3  3.0  NaN  2.0
+        4  3.0  NaN  NaN
+        """
 
     def sem():
         pass
@@ -1086,7 +1176,84 @@ def cummin():
         """
 
     def bfill():
-        pass
+        """
+        Backward fill the values.
+
+        Parameters
+        ----------
+        limit : int, optional
+            Limit of how many values to fill.
+
+        Returns
+        -------
+        Series or DataFrame
+            Object with missing values filled.
+
+        See also
+        --------
+        Series.bfill
+            Backward fill the missing values in the dataset.
+        DataFrame.bfill
+            Backward fill the missing values in the dataset.
+        Series.fillna
+            Fill NaN values of a Series.
+        DataFrame.fillna
+            Fill NaN values of a DataFrame.
+
+        Examples
+        --------
+        With Series:
+
+        >>> index = ['Falcon', 'Falcon', 'Parrot', 'Parrot', 'Parrot']
+        >>> s = pd.Series([None, 1, None, None, 3], index=index)
+        >>> s
+        Falcon    NaN
+        Falcon    1.0
+        Parrot    NaN
+        Parrot    NaN
+        Parrot    3.0
+        dtype: float64
+        >>> s.groupby(level=0).bfill()
+        Falcon    1.0
+        Falcon    1.0
+        Parrot    3.0
+        Parrot    3.0
+        Parrot    3.0
+        dtype: float64
+        >>> s.groupby(level=0).bfill(limit=1)
+        Falcon    1.0
+        Falcon    1.0
+        Parrot    NaN
+        Parrot    3.0
+        Parrot    3.0
+        dtype: float64
+
+        With DataFrame:
+
+        >>> df = pd.DataFrame({'A': [1, None, None, None, 4],
+        ...                    'B': [None, None, 5, None, 7]}, index=index)
+        >>> df
+                  A    B
+        Falcon  1.0  NaN
+        Falcon  NaN  NaN
+        Parrot  NaN  5.0
+        Parrot  NaN  NaN
+        Parrot  4.0  7.0
+        >>> df.groupby(level=0).bfill()
+                  A    B
+        Falcon  1.0  NaN
+        Falcon  NaN  NaN
+        Parrot  4.0  5.0
+        Parrot  4.0  7.0
+        Parrot  4.0  7.0
+        >>> df.groupby(level=0).bfill(limit=1)
+                  A    B
+        Falcon  1.0  NaN
+        Falcon  NaN  NaN
+        Parrot  NaN  5.0
+        Parrot  4.0  7.0
+        Parrot  4.0  7.0
+        """
 
     def prod():
         pass

diff --git a/src/snowflake/snowpark/modin/plugin/extensions/groupby_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/groupby_overrides.py
@@ -434,7 +434,25 @@ def any(self, skipna: bool = True):
         )
 
     def bfill(self, limit=None):
-        ErrorMessage.method_not_implemented_error(name="bfill", class_="GroupBy")
+        is_series_groupby = self.ndim == 1
+
+        # TODO: SNOW-1063349: Modin upgrade - modin.pandas.groupby.DataFrameGroupBy functions
+        query_compiler = self._query_compiler.groupby_fillna(
+            self._by,
+            self._axis,
+            self._kwargs,
+            value=None,
+            method="bfill",
+            fill_axis=None,
+            inplace=False,
+            limit=limit,
+            downcast=None,
+        )
+        return (
+            pd.Series(query_compiler=query_compiler)
+            if is_series_groupby
+            else pd.DataFrame(query_compiler=query_compiler)
+        )
 
     def corr(self, **kwargs):
         # TODO: SNOW-1063349: Modin upgrade - modin.pandas.groupby.DataFrameGroupBy functions
@@ -507,7 +525,25 @@ def diff(self):
         ErrorMessage.method_not_implemented_error(name="diff", class_="GroupBy")
 
     def ffill(self, limit=None):
-        ErrorMessage.method_not_implemented_error(name="ffill", class_="GroupBy")
+        is_series_groupby = self.ndim == 1
+
+        # TODO: SNOW-1063349: Modin upgrade - modin.pandas.groupby.DataFrameGroupBy functions
+        query_compiler = self._query_compiler.groupby_fillna(
+            self._by,
+            self._axis,
+            self._kwargs,
+            value=None,
+            method="ffill",
+            fill_axis=None,
+            inplace=False,
+            limit=limit,
+            downcast=None,
+        )
+        return (
+            pd.Series(query_compiler=query_compiler)
+            if is_series_groupby
+            else pd.DataFrame(query_compiler=query_compiler)
+        )
 
     def fillna(
         self,