
Update modin dependency to 0.30.1 #1965

Open: wants to merge 26 commits into base: main
62 changes: 62 additions & 0 deletions .github/workflows/daily_modin_precommit.yml
@@ -173,6 +173,68 @@ jobs:
.tox/.coverage
.tox/coverage.xml

  test-pandas-patch-versions:
    name: Test Snowpark pandas with pandas ${{ matrix.pandas-version }}
    needs: build
    runs-on: ${{ matrix.os.image_name }}
    strategy:
      fail-fast: false
      matrix:
        os:
          - image_name: ubuntu-latest-64-cores
            download_name: linux
        pandas-version: ["2.2.1", "2.2.2"]
        python-version: ["3.9"]
        cloud-provider: [aws]
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Display Python version
        run: python -c "import sys; print(sys.version)"
      - name: Decrypt parameters.py
        shell: bash
        run: .github/scripts/decrypt_parameters.sh
        env:
          PARAMETER_PASSWORD: ${{ secrets.PARAMETER_PASSWORD }}
          CLOUD_PROVIDER: ${{ matrix.cloud-provider }}
      - name: Download wheel(s)
        uses: actions/download-artifact@v4
        with:
          name: wheel
          path: dist
      - name: Show wheels downloaded
        run: ls -lh dist
        shell: bash
      - name: Upgrade setuptools, pip and wheel
        run: python -m pip install -U setuptools pip wheel
      - name: Install tox
        run: python -m pip install tox
      - if: ${{ contains('macos', matrix.os.download_name) }}
        name: Run Snowpark pandas API doctests
        run: python -m tox -e "modin_pandas_version-py${PYTHON_VERSION/\./}-doctest-snowparkpandasdoctest-modin-ci"
        env:
          MODIN_PANDAS_PATCH_VERSION: ${{ matrix.pandas-version }}
          PYTHON_VERSION: ${{ matrix.python-version }}
          cloud_provider: ${{ matrix.cloud-provider }}
          PYTEST_ADDOPTS: --color=yes --tb=short
          TOX_PARALLEL_NO_SPINNER: 1
        # Specify SNOWFLAKE_IS_PYTHON_RUNTIME_TEST: 1 when adding >= python3.11 with no server-side support
        # For example, see https://github.com/snowflakedb/snowpark-python/pull/681
        shell: bash
      - name: Run Snowpark pandas API tests (excluding doctests)
        run: python -m tox -e "modin_pandas_version-py${PYTHON_VERSION/\./}-snowparkpandasdailynotdoctest-modin-ci"
        env:
          MODIN_PANDAS_PATCH_VERSION: ${{ matrix.pandas-version }}
          PYTHON_VERSION: ${{ matrix.python-version }}
          cloud_provider: ${{ matrix.cloud-provider }}
          PYTEST_ADDOPTS: --color=yes --tb=short
          TOX_PARALLEL_NO_SPINNER: 1
        shell: bash

  test-disable-sql-simplifier: # Will be removed after the SQL simplifier is stable and there is no option to opt out.
    name: Test Disable SQL Simplifier modin-${{ matrix.os.download_name }}-${{ matrix.python-version }}-${{ matrix.cloud-provider }}
    needs: build
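The tox invocations above build the environment name with bash pattern substitution, `${PYTHON_VERSION/\./}`, which deletes the first `.` so that `3.9` becomes `39`. A minimal illustration (bash-specific syntax, not POSIX sh):

```shell
#!/usr/bin/env bash
# ${VAR/pattern/replacement} substitutes the first match of pattern;
# an empty replacement simply deletes the match.
PYTHON_VERSION="3.9"
TOX_ENV="py${PYTHON_VERSION/\./}"
echo "$TOX_ENV"   # prints py39
```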
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -27,6 +27,11 @@

### Snowpark pandas API Updates

### Dependency Updates

- Updated `modin` from 0.28.1 to 0.30.1.
- Added support for all `pandas` 2.2.x versions.

#### New Features

- Added support for `np.subtract`, `np.multiply`, `np.divide`, and `np.true_divide`.
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -35,7 +35,7 @@ cd snowpark-python

- Create a new Python virtual environment with any Python version that we support.
- The Snowpark Python API supports **Python 3.8, Python 3.9, Python 3.10, and Python 3.11**.
- The Snowpark pandas API supports **Python 3.9, Python 3.10, and Python 3.11**. Additionally, Snowpark pandas requires **Modin 0.28.1** and **pandas 2.2.1**.
- The Snowpark pandas API supports **Python 3.9, Python 3.10, and Python 3.11**. Additionally, Snowpark pandas requires **Modin 0.30.1** and **pandas 2.2.x**.

```bash
conda create --name snowpark-dev python=3.9
```
4 changes: 1 addition & 3 deletions setup.py
@@ -10,9 +10,7 @@
THIS_DIR = os.path.dirname(os.path.realpath(__file__))
SRC_DIR = os.path.join(THIS_DIR, "src")
SNOWPARK_SRC_DIR = os.path.join(SRC_DIR, "snowflake", "snowpark")
MODIN_DEPENDENCY_VERSION = (
"==0.28.1" # Snowpark pandas requires modin 0.28.1, which depends on pandas 2.2.1
)
MODIN_DEPENDENCY_VERSION = "==0.30.1" # Snowpark pandas requires modin 0.30.1, which is compatible with pandas 2.2.x
CONNECTOR_DEPENDENCY_VERSION = ">=3.10.0, <4.0.0"
INSTALL_REQ_LIST = [
"setuptools>=40.6.0",
38 changes: 32 additions & 6 deletions src/snowflake/snowpark/modin/plugin/__init__.py
@@ -21,11 +21,17 @@
# since modin may raise its own warnings/errors on the wrong pandas version
import pandas # isort: skip # noqa: E402

supported_pandas_version = "2.2.1"
if pandas.__version__ != supported_pandas_version:
# TODO SNOW-1758773: perform version check in modin instead
supported_pandas_major_version = 2
supported_pandas_minor_version = 2
actual_pandas_version = version.parse(pandas.__version__)
if (
    actual_pandas_version.major != supported_pandas_major_version
    or actual_pandas_version.minor != supported_pandas_minor_version
):
raise RuntimeError(
f"The pandas version installed ({pandas.__version__}) does not match the supported pandas version in"
+ f" Snowpark pandas ({supported_pandas_version}). "
+ f" Snowpark pandas ({supported_pandas_major_version}.{supported_pandas_minor_version}.x). "
+ install_msg
) # pragma: no cover
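The check above accepts any pandas 2.2.x patch release rather than pinning a single patch version. The major/minor comparison can be sketched in isolation (stdlib-only here, whereas the module uses `packaging.version`; the helper name is illustrative):

```python
SUPPORTED_MAJOR, SUPPORTED_MINOR = 2, 2

def pandas_version_supported(version_string: str) -> bool:
    """Return True for any release in the supported 2.2.x line."""
    major, minor = (int(part) for part in version_string.split(".")[:2])
    # Accept only when both components match; equivalently, reject
    # when either component differs from the supported line.
    return major == SUPPORTED_MAJOR and minor == SUPPORTED_MINOR

print(pandas_version_supported("2.2.3"))  # True: any 2.2.x patch is fine
print(pandas_version_supported("2.1.4"))  # False
```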

@@ -36,7 +42,7 @@
"Modin is not installed. " + install_msg
) # pragma: no cover

supported_modin_version = "0.28.1"
supported_modin_version = "0.30.1"
if version.parse(modin.__version__) != version.parse(supported_modin_version):
raise ImportError(
f"The Modin version installed ({modin.__version__}) does not match the supported Modin version in"
@@ -136,25 +142,45 @@
register_pd_accessor,
register_series_accessor,
)
from modin.pandas.accessor import ModinAPI # isort: skip # noqa: E402,F401

from snowflake.snowpark.modin.plugin._internal.telemetry import ( # isort: skip # noqa: E402,F401
TELEMETRY_PRIVATE_METHODS,
snowpark_pandas_telemetry_standalone_function_decorator,
try_add_telemetry_to_attribute,
)

# Add telemetry on the ModinAPI accessor object.
# modin 0.30.1 introduces the pd.DataFrame.modin accessor object for non-pandas methods,
# such as pd.DataFrame.modin.to_pandas and pd.DataFrame.modin.to_ray. We will automatically
# raise NotImplementedError for all methods on this accessor object except to_pandas, but
# we still want to record telemetry.
for attr_name in dir(ModinAPI):
[Review thread]
Collaborator: let's add some more detailed comment there, like what you have mentioned in the previous comment about the ModinAPI.
Collaborator: and we have both pd.DataFrame.modin and pd.Series.modin, right? otherwise, i don't think we will need to skip this for dir(Series).
Author: Yes, pd.DataFrame.modin and pd.Series.modin are both valid; the `for attr_name in dir(ModinAPI)` loop will add telemetry to chained method accesses like pd.DataFrame().modin.to_pandas(), but we skip it in dir(Series) because our telemetry code has trouble handling the object returned by pd.DataFrame().modin (without a method call at the end).
if not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS:
setattr(
ModinAPI,
attr_name,
try_add_telemetry_to_attribute(attr_name, getattr(ModinAPI, attr_name)),
)

for attr_name in dir(Series):
# Since Series is defined in upstream Modin, all of its members were either defined upstream
# or overridden by extension.
if not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS:
# Skip the `modin` accessor object, since we apply telemetry to all its fields.
if attr_name != "modin" and (
not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS
):
register_series_accessor(attr_name)(
try_add_telemetry_to_attribute(attr_name, getattr(Series, attr_name))
)

for attr_name in dir(DataFrame):
# Since DataFrame is defined in upstream Modin, all of its members were either defined upstream
# or overridden by extension.
if not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS:
# Skip the `modin` accessor object, since we apply telemetry to all its fields.
if attr_name != "modin" and (
not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS
):
register_dataframe_accessor(attr_name)(
try_add_telemetry_to_attribute(attr_name, getattr(DataFrame, attr_name))
)
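The three loops above follow the same pattern: iterate over `dir()` of a class, skip private names except whitelisted ones, and re-register each public attribute wrapped in a telemetry decorator. A self-contained sketch of the pattern (the class and recording sink are made up for illustration; the real helpers are `try_add_telemetry_to_attribute` and the accessor registration functions):

```python
import functools

CALLS = []  # stand-in for a telemetry sink

def add_telemetry(name, attr):
    """Wrap callables so each invocation records its name; pass non-callables through."""
    if not callable(attr):
        return attr

    @functools.wraps(attr)
    def wrapper(*args, **kwargs):
        CALLS.append(name)
        return attr(*args, **kwargs)

    return wrapper

class Accessor:  # hypothetical stand-in for ModinAPI
    def to_pandas(self):
        return "native frame"

# Re-register every public attribute wrapped with telemetry.
for attr_name in dir(Accessor):
    if not attr_name.startswith("_"):
        setattr(Accessor, attr_name, add_telemetry(attr_name, getattr(Accessor, attr_name)))

Accessor().to_pandas()
print(CALLS)  # ['to_pandas']
```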
10 changes: 6 additions & 4 deletions src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
@@ -713,6 +713,7 @@ def aggregate():
agg = aggregate

def apply():
# TODO SNOW-1739034 unskip UDF tests when pandas 2.2.3 is available in anaconda
"""
Apply a function along an axis of the DataFrame.

@@ -821,15 +822,15 @@

Using a reducing function on ``axis=1``:

>>> df.apply(np.sum, axis=1)
>>> df.apply(np.sum, axis=1) # doctest: +SKIP
0 2
1 10
2 13
dtype: int64

Returning a list-like object will result in a Series:

>>> df.apply(lambda x: [1, 2], axis=1)
>>> df.apply(lambda x: [1, 2], axis=1) # doctest: +SKIP
0 [1, 2]
1 [1, 2]
2 [1, 2]
@@ -1022,6 +1023,7 @@ def keys():
"""

def transform():
# TODO SNOW-1739034 unskip UDF tests when pandas 2.2.3 is available in anaconda
"""
Call ``func`` on self producing a Snowpark pandas DataFrame with the same axis shape as self.

@@ -1055,15 +1057,15 @@ def transform():
0 1 3
1 2 4
2 3 5
>>> df.transform(lambda x: x + 1, axis=1)
>>> df.transform(lambda x: x + 1, axis=1) # doctest: +SKIP
col1 col2
0 2 4
1 3 5
2 4 6

Apply a numpy ufunc to every value in the DataFrame.

>>> df.transform(np.square, axis=1)
>>> df.transform(np.square, axis=1) # doctest: +SKIP
col1 col2
0 1 9
1 4 16
10 changes: 6 additions & 4 deletions src/snowflake/snowpark/modin/plugin/docstrings/groupby.py
@@ -989,6 +989,7 @@ def cummax():
"""

def apply():
# TODO SNOW-1739034 unskip UDF tests when pandas 2.2.3 is available in anaconda
"""
Apply function ``func`` group-wise and combine the results together.

@@ -1050,7 +1051,7 @@ def apply():
its argument and returns a DataFrame. `apply` combines the result for
each group together into a new DataFrame:

>>> g1[['B', 'C']].apply(lambda x: x.select_dtypes('number') / x.select_dtypes('number').sum()) # doctest: +NORMALIZE_WHITESPACE
>>> g1[['B', 'C']].apply(lambda x: x.select_dtypes('number') / x.select_dtypes('number').sum()) # doctest: +SKIP
B C
0.0 0.333333 0.4
1.0 0.666667 0.6
@@ -1059,7 +1060,7 @@
In the above, the groups are not part of the index. We can have them included
by using ``g2`` where ``group_keys=True``:

>>> g2[['B', 'C']].apply(lambda x: x.select_dtypes('number') / x.select_dtypes('number').sum()) # doctest: +NORMALIZE_WHITESPACE
>>> g2[['B', 'C']].apply(lambda x: x.select_dtypes('number') / x.select_dtypes('number').sum()) # doctest: +SKIP
B C
A
a 0.0 0.333333 0.4
@@ -1942,6 +1943,7 @@ def cov():
pass

def transform():
# TODO SNOW-1739034 unskip UDF tests when pandas 2.2.3 is available in anaconda
"""
Call function producing a same-indexed DataFrame on each group.

@@ -2011,7 +2013,7 @@ def transform():
i X 9 90 -9
j Y 10 10 -10

>>> df.groupby("col1", dropna=True).transform(lambda df, n: df.head(n), n=2)
>>> df.groupby("col1", dropna=True).transform(lambda df, n: df.head(n), n=2) # doctest: +SKIP
col2 col3 col4
a 1.0 40.0 -1.0
b NaN NaN NaN
@@ -2024,7 +2026,7 @@
i NaN NaN NaN
j 10.0 10.0 -10.0

>>> df.groupby("col1", dropna=False).transform("mean")
>>> df.groupby("col1", dropna=False).transform("mean") # doctest: +SKIP
col2 col3 col4
a 2.50 25.0 -2.50
b 5.00 65.0 -5.00
@@ -1552,6 +1552,37 @@ def __getitem__(self, key):
return self.loc[:, key]


# Modin uses the unique() query compiler method instead of aliasing the duplicated frontend method as of 0.30.1.
# TODO SNOW-1758721: use the more efficient implementation
@register_base_override("drop_duplicates")
def drop_duplicates(
self, keep="first", inplace=False, **kwargs
): # noqa: PR01, RT01, D200
"""
Return `BasePandasDataset` with duplicate rows removed.
"""
inplace = validate_bool_kwarg(inplace, "inplace")
ignore_index = kwargs.get("ignore_index", False)
subset = kwargs.get("subset", None)
if subset is not None:
if is_list_like(subset):
if not isinstance(subset, list):
subset = list(subset) # pragma: no cover
else:
subset = [subset]
df = self[subset]
else:
df = self
duplicated = df.duplicated(keep=keep)
result = self[~duplicated]
if ignore_index:
result.index = pandas.RangeIndex(stop=len(result))
if inplace:
self._update_inplace(result._query_compiler) # pragma: no cover
else:
return result
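The override reduces `drop_duplicates` to `duplicated(keep=...)` plus boolean masking. The masking semantics can be sketched in plain Python, independent of any query compiler (the helper below is illustrative, not part of the codebase):

```python
from collections import Counter

def duplicated_mask(values, keep="first"):
    """True where a value is a duplicate to drop, mirroring pandas' keep semantics."""
    if keep == "first":  # mark all but the first occurrence
        seen = set()
        mask = []
        for v in values:
            mask.append(v in seen)
            seen.add(v)
        return mask
    if keep == "last":  # mark all but the last occurrence
        return list(reversed(duplicated_mask(list(reversed(values)), keep="first")))
    if keep is False:  # mark every member of any duplicated group
        counts = Counter(values)
        return [counts[v] > 1 for v in values]
    raise ValueError(f"invalid keep: {keep!r}")

values = [1, 2, 1, 3, 2]
kept = [v for v, dup in zip(values, duplicated_mask(values)) if not dup]
print(kept)  # [1, 2, 3]
```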


# Snowpark pandas does extra argument validation, which may need to be upstreamed.
@register_base_override("sort_values")
def sort_values(
@@ -1481,46 +1481,6 @@ def mask(
)


# Snowpark pandas has a fix for a pandas behavior change. It is available in Modin 0.30.1 (SNOW-1552497).
@register_dataframe_accessor("melt")
def melt(
self,
id_vars=None,
value_vars=None,
var_name=None,
value_name="value",
col_level=None,
ignore_index=True,
): # noqa: PR01, RT01, D200
"""
Unpivot a ``DataFrame`` from wide to long format, optionally leaving identifiers set.
"""
# TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions
if id_vars is None:
id_vars = []
if not is_list_like(id_vars):
id_vars = [id_vars]
if value_vars is None:
# Behavior of Index.difference changed in 2.2.x
# https://github.com/pandas-dev/pandas/pull/55113
# This change needs upstream to Modin:
# https://github.com/modin-project/modin/issues/7206
value_vars = self.columns.drop(id_vars)
if var_name is None:
columns_name = self._query_compiler.get_index_name(axis=1)
var_name = columns_name if columns_name is not None else "variable"
return self.__constructor__(
query_compiler=self._query_compiler.melt(
id_vars=id_vars,
value_vars=value_vars,
var_name=var_name,
value_name=value_name,
col_level=col_level,
ignore_index=ignore_index,
)
)


# Snowpark pandas does more thorough error checking.
@register_dataframe_accessor("merge")
def merge(
@@ -24,6 +24,7 @@
from pandas._typing import (
AggFuncType,
AnyArrayLike,
ArrayLike,
Axis,
FillnaOptions,
IgnoreRaise,
@@ -1596,6 +1597,19 @@ def rename(
return self_cp


# In some cases, modin after 0.30.1 returns a DatetimeArray instead of a numpy array. This
# still differs from the expected pandas behavior, which would return DatetimeIndex
# (see SNOW-1019312).
@register_series_accessor("unique")
def unique(self) -> ArrayLike: # noqa: RT01, D200
"""
Return unique values of Series object.
"""
    # `values` can't be used here because it performs an unnecessary conversion,
    # after which the result type does not match the pandas result
return self.__constructor__(query_compiler=self._query_compiler.unique()).to_numpy()


# Modin defaults to pandas for some arguments for unstack
@register_series_accessor("unstack")
def unstack(