From 31e5320adbda4d4c01fcde72364157a86cd4ad06 Mon Sep 17 00:00:00 2001
From: Jonathan Shi
Date: Tue, 23 Jul 2024 11:20:27 -0700
Subject: [PATCH 01/26] bump modin

---
 CONTRIBUTING.md                                    |  2 +-
 setup.py                                           |  2 +-
 src/snowflake/snowpark/modin/plugin/__init__.py    | 13 +++++++++----
 .../modin/plugin/extensions/dataframe_overrides.py |  4 ----
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 66def5ea34..6e54296e9d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -35,7 +35,7 @@ cd snowpark-python
 
 - Create a new Python virtual environment with any Python version that we support.
   - The Snowpark Python API supports **Python 3.8, Python 3.9, Python 3.10, and Python 3.11**.
-  - The Snowpark pandas API supports **Python 3.9, Python 3.10, and Python 3.11**. Additionally, Snowpark pandas requires **Modin 0.28.1** and **pandas 2.2.1**.
+  - The Snowpark pandas API supports **Python 3.9, Python 3.10, and Python 3.11**. Additionally, Snowpark pandas requires **Modin 0.30.1** and **pandas 2.2.x**.
 
 ```bash
 conda create --name snowpark-dev python=3.9
diff --git a/setup.py b/setup.py
index 8539d125d4..5f3272f03a 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
 SRC_DIR = os.path.join(THIS_DIR, "src")
 SNOWPARK_SRC_DIR = os.path.join(SRC_DIR, "snowflake", "snowpark")
 MODIN_DEPENDENCY_VERSION = (
-    "==0.28.1"  # Snowpark pandas requires modin 0.28.1, which depends on pandas 2.2.1
+    "==0.30.1"  # Snowpark pandas requires modin 0.28.1, which depends on pandas 2.2.1
 )
 CONNECTOR_DEPENDENCY_VERSION = ">=3.10.0, <4.0.0"
 INSTALL_REQ_LIST = [
diff --git a/src/snowflake/snowpark/modin/plugin/__init__.py b/src/snowflake/snowpark/modin/plugin/__init__.py
index 2a277bde92..381cc8b2ab 100644
--- a/src/snowflake/snowpark/modin/plugin/__init__.py
+++ b/src/snowflake/snowpark/modin/plugin/__init__.py
@@ -21,11 +21,16 @@
 # since modin may raise its own warnings/errors on the wrong pandas version
 import pandas  # isort: skip  # noqa: E402
 
-supported_pandas_version = "2.2.1"
-if pandas.__version__ != supported_pandas_version:
+supported_pandas_major_version = 2
+supported_pandas_minor_version = 2
+actual_pandas_version = version.parse(pandas.__version__)
+if (
+    actual_pandas_version.major != supported_pandas_major_version
+    or actual_pandas_version.minor != supported_pandas_minor_version
+):
     raise RuntimeError(
         f"The pandas version installed ({pandas.__version__}) does not match the supported pandas version in"
-        + f" Snowpark pandas ({supported_pandas_version}). "
+        + f" Snowpark pandas ({supported_pandas_major_version}.{supported_pandas_minor_version}.x). "
         + install_msg
     )  # pragma: no cover
 
@@ -36,7 +41,7 @@
         "Modin is not installed. 
" + install_msg ) # pragma: no cover -supported_modin_version = "0.28.1" +supported_modin_version = "0.30.1" if version.parse(modin.__version__) != version.parse(supported_modin_version): raise ImportError( f"The Modin version installed ({modin.__version__}) does not match the supported Modin version in" diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py index 792217c3b3..3d8aecbad2 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py @@ -1501,10 +1501,6 @@ def melt( if not is_list_like(id_vars): id_vars = [id_vars] if value_vars is None: - # Behavior of Index.difference changed in 2.2.x - # https://github.com/pandas-dev/pandas/pull/55113 - # This change needs upstream to Modin: - # https://github.com/modin-project/modin/issues/7206 value_vars = self.columns.drop(id_vars) if var_name is None: columns_name = self._query_compiler.get_index_name(axis=1) From 845e7b446cabff5ae4e5c0f785b60c451839a5c8 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Tue, 23 Jul 2024 11:24:53 -0700 Subject: [PATCH 02/26] changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d890ef532..8076dc8232 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -91,6 +91,11 @@ ### Snowpark pandas API Updates +### Dependency Updates + +- Updated `modin` from 0.28.1 to 0.30.1. +- Added support for `pandas` 2.2.2. + #### New Features - Added support for `TimedeltaIndex.mean` method. From 8995a8fdb803967a67dbc5dc728c472a93a2264a Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Wed, 31 Jul 2024 11:58:41 -0500 Subject: [PATCH 03/26] Fix issue Signed-off-by: Devin Petersohn --- .../compiler/snowflake_query_compiler.py | 110 ++++++++++-------- 1 file changed, 63 insertions(+), 47 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 8483a41458..a6442fa353 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -1097,9 +1097,11 @@ def find_snowflake_quoted_identifier(pandas_columns: list[str]) -> list[str]: ordered_dataframe = ordered_dataframe.select( [row_position_snowflake_quoted_identifier] + [ - old_identifier - if old_identifier == new_identifier - else col(old_identifier).as_(new_identifier) + ( + old_identifier + if old_identifier == new_identifier + else col(old_identifier).as_(new_identifier) + ) for old_identifier, new_identifier in zip( snowflake_quoted_identifiers_to_be_selected, snowflake_quoted_identifiers_to_be_renamed, @@ -6050,7 +6052,6 @@ def get_dummies( dtype: Optional[npt.DTypeLike] = None, is_series: bool = False, ) -> "SnowflakeQueryCompiler": - """ Implement one-hot encoding. 
Args: @@ -6121,7 +6122,7 @@ def get_dummies( prefix_sep = "_" query_compiler = self - for (pandas_column_name, column_prefix) in zip(columns, prefix): + for pandas_column_name, column_prefix in zip(columns, prefix): query_compiler = query_compiler._get_dummies_helper( pandas_column_name, column_prefix, @@ -8212,9 +8213,11 @@ def _apply_with_udtf_and_dynamic_pivot_along_axis_1( row_position_snowflake_quoted_identifier, *[ # casting if return type is specified - col(old_quoted_identifier).cast(return_type).as_(quoted_identifier) - if not return_variant - else col(old_quoted_identifier).as_(quoted_identifier) + ( + col(old_quoted_identifier).cast(return_type).as_(quoted_identifier) + if not return_variant + else col(old_quoted_identifier).as_(quoted_identifier) + ) for old_quoted_identifier, quoted_identifier in zip( data_column_snowflake_quoted_identifiers, renamed_data_column_snowflake_quoted_identifiers, @@ -9655,9 +9658,11 @@ def take_2d_labels( get_frame_by_col_label( get_frame_by_row_label( internal_frame=self._modin_frame, - key=index._modin_frame - if isinstance(index, SnowflakeQueryCompiler) - else index, + key=( + index._modin_frame + if isinstance(index, SnowflakeQueryCompiler) + else index + ), ), columns, ) @@ -10139,13 +10144,15 @@ def set_2d_labels( result_frame = set_frame_2d_labels( internal_frame=self._modin_frame, - index=index._modin_frame - if isinstance(index, SnowflakeQueryCompiler) - else index, + index=( + index._modin_frame + if isinstance(index, SnowflakeQueryCompiler) + else index + ), columns=columns, - item=item._modin_frame - if isinstance(item, SnowflakeQueryCompiler) - else item, + item=( + item._modin_frame if isinstance(item, SnowflakeQueryCompiler) else item + ), matching_item_columns_by_label=matching_item_columns_by_label, matching_item_rows_by_label=matching_item_rows_by_label, index_is_bool_indexer=index_is_bool_indexer, @@ -13253,9 +13260,11 @@ def _quantiles_single_col( col_identifier, index_identifier, new_identifiers, - col_mapper=dict(zip(new_identifiers, index)) - if index is not None - else dict(zip(new_identifiers, new_labels)), + col_mapper=( + dict(zip(new_identifiers, index)) + if index is not None + else dict(zip(new_identifiers, new_labels)) + ), ) col_after_null_replace_identifier = ( ordered_dataframe.generate_snowflake_quoted_identifiers( @@ -18196,20 +18205,24 @@ def pct_change( { quoted_identifier: # If periods=0, we don't need to do any window computation - iff( - is_null(col(quoted_identifier)), - pandas_lit(None, FloatType()), - pandas_lit(0), - ) - if periods == 0 - else ( - col(quoted_identifier) - / lag(quoted_identifier, offset=periods).over( - Window.orderBy( - col(frame.row_position_snowflake_quoted_identifier) + ( + iff( + is_null(col(quoted_identifier)), + pandas_lit(None, FloatType()), + pandas_lit(0), + ) + if periods == 0 + else ( + col(quoted_identifier) + / lag(quoted_identifier, offset=periods).over( + Window.orderBy( + col( + frame.row_position_snowflake_quoted_identifier + ) + ) ) + - 1 ) - - 1 ) for quoted_identifier in frame.data_column_snowflake_quoted_identifiers } @@ -18222,21 +18235,24 @@ def pct_change( { quoted_identifier: # If periods=0, we don't need to do any computation - iff( - is_null(col(quoted_identifier)), - pandas_lit(None, FloatType()), - pandas_lit(0), - ) - if periods == 0 - else ( - # If periods>0, the first few columns will be NULL - # If periods<0, the last few columns will be NULL - pandas_lit(None, FloatType()) - if i - periods < 0 or i - periods >= len(quoted_identifiers) - # For the 
remaining columns, if periods=n, we compare column i to column i+n - else col(quoted_identifier) - / col(quoted_identifiers[i - periods]) - - 1 + ( + iff( + is_null(col(quoted_identifier)), + pandas_lit(None, FloatType()), + pandas_lit(0), + ) + if periods == 0 + else ( + # If periods>0, the first few columns will be NULL + # If periods<0, the last few columns will be NULL + pandas_lit(None, FloatType()) + if i - periods < 0 + or i - periods >= len(quoted_identifiers) + # For the remaining columns, if periods=n, we compare column i to column i+n + else col(quoted_identifier) + / col(quoted_identifiers[i - periods]) + - 1 + ) ) for i, quoted_identifier in enumerate(quoted_identifiers) } From 3bcad42c5dc1f4632f0e6a363851ff212ab8d56d Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Wed, 31 Jul 2024 13:09:14 -0500 Subject: [PATCH 04/26] Fix issue Signed-off-by: Devin Petersohn --- tests/integ/modin/series/test_loc.py | 38 ++++++++++++++++++---------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/tests/integ/modin/series/test_loc.py b/tests/integ/modin/series/test_loc.py index 3826282e0d..5461b3faa6 100644 --- a/tests/integ/modin/series/test_loc.py +++ b/tests/integ/modin/series/test_loc.py @@ -142,9 +142,11 @@ def test_series_loc_get_negative_snowpark_pandas_input( eval_snowpark_pandas_result( str_index_snowpark_pandas_series, str_index_native_series, - lambda df: df.loc[negative_loc_snowpark_pandas_input_map[key][0]] - if isinstance(df, Series) - else df.loc[negative_loc_snowpark_pandas_input_map[key][1]], + lambda df: ( + df.loc[negative_loc_snowpark_pandas_input_map[key][0]] + if isinstance(df, Series) + else df.loc[negative_loc_snowpark_pandas_input_map[key][1]] + ), expect_exception=True, ) @@ -236,9 +238,11 @@ def test_series_loc_get_key_bool_series_with_aligned_indices(key, use_default_in eval_snowpark_pandas_result( snow_series, native_series, - lambda s: s.loc[pd.Series(key, index=index, dtype="bool")] - if isinstance(s, pd.Series) - else s.loc[native_pd.Series(key, index=index, dtype="bool")], + lambda s: ( + s.loc[pd.Series(key, index=index, dtype="bool")] + if isinstance(s, pd.Series) + else s.loc[native_pd.Series(key, index=index, dtype="bool")] + ), ) @@ -272,9 +276,11 @@ def test_series_loc_get_key_bool_series_with_unaligned_and_distinct_indices( eval_snowpark_pandas_result( snow_series, native_series, - lambda s: s.loc[pd.Series(key, index=key_index, dtype="bool")] - if isinstance(s, pd.Series) - else s.loc[native_pd.Series(key, index=native_key_index, dtype="bool")], + lambda s: ( + s.loc[pd.Series(key, index=key_index, dtype="bool")] + if isinstance(s, pd.Series) + else s.loc[native_pd.Series(key, index=native_key_index, dtype="bool")] + ), ) @@ -416,9 +422,11 @@ def test_series_loc_get_key_bool_series_with_mismatch_index_len(key, use_default eval_snowpark_pandas_result( snow_series, native_series, - lambda df: df.loc[series_key] - if isinstance(df, pd.DataFrame) - else df.loc[native_series_key], + lambda df: ( + df.loc[series_key] + if isinstance(df, pd.DataFrame) + else df.loc[native_series_key] + ), ) @@ -1437,7 +1445,7 @@ def test_series_loc_set_boolean_key(key, index): assert_series_equal(snow_ser, native_ser, check_dtype=False) -@sql_count_checker(query_count=0, join_count=0) +@sql_count_checker(query_count=2, join_count=0) @pytest.mark.parametrize("item", [1.2, None, ["a", "b", "c"]]) def test_series_loc_set_df_key_negative(item, default_index_native_series): # series.loc[df key] = any item @@ -1454,7 +1462,9 @@ def 
test_series_loc_set_df_key_negative(item, default_index_native_series): native_ser.loc[df_key] = item # Snowpark pandas error verification. - err_msg = "Data cannot be a DataFrame" + err_msg = re.escape( + "Data must be 1-dimensional, got ndarray of shape (1, 2) instead" + ) with pytest.raises(ValueError, match=err_msg): snowpark_ser.loc[pd.DataFrame(df_key)] = item assert_series_equal(snowpark_ser, native_ser) From bd497812d5e758d7a0379dbd50a34012ba10a2eb Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Fri, 16 Aug 2024 12:54:20 -0700 Subject: [PATCH 05/26] add pandas version test to daily job --- .github/workflows/daily_modin_precommit.yml | 62 +++++++++++++++++++++ setup.py | 4 +- tox.ini | 2 + 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/.github/workflows/daily_modin_precommit.yml b/.github/workflows/daily_modin_precommit.yml index 8871a0daec..9fa83d2234 100644 --- a/.github/workflows/daily_modin_precommit.yml +++ b/.github/workflows/daily_modin_precommit.yml @@ -173,6 +173,68 @@ jobs: .tox/.coverage .tox/coverage.xml + test-pandas-patch-versions: + name: Test Snowpark pandas with pandas ${{ matrix.python-version }} + needs: build + runs-on: ${{ matrix.os.image_name }} + strategy: + fail-fast: false + matrix: + os: + - image_name: ubuntu-latest-64-cores + download_name: linux + pandas-version: ["2.2.1", "2.2.2"] + python-version: ["3.9"] + cloud-provider: [aws] + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Display Python version + run: python -c "import sys; print(sys.version)" + - name: Decrypt parameters.py + shell: bash + run: .github/scripts/decrypt_parameters.sh + env: + PARAMETER_PASSWORD: ${{ secrets.PARAMETER_PASSWORD }} + CLOUD_PROVIDER: ${{ matrix.cloud-provider }} + - name: Download wheel(s) + uses: actions/download-artifact@v4 + with: + name: wheel + path: dist + - name: Show wheels downloaded + run: ls -lh dist + shell: bash + - name: Upgrade setuptools, pip and wheel + run: python -m pip install -U setuptools pip wheel + - name: Install tox + run: python -m pip install tox + - if: ${{ contains('macos', matrix.os.download_name) }} + name: Run Snowpark pandas API doctests + run: python -m tox -e "modin_pandas_version-py${PYTHON_VERSION}-doctest-snowparkpandasdoctest-modin-ci" + env: + MODIN_PANDAS_PATCH_VERSION: ${{ matrix.pandas-version }} + PYTHON_VERSION: ${{ matrix.python-version }} + cloud_provider: ${{ matrix.cloud-provider }} + PYTEST_ADDOPTS: --color=yes --tb=short + TOX_PARALLEL_NO_SPINNER: 1 + # Specify SNOWFLAKE_IS_PYTHON_RUNTIME_TEST: 1 when adding >= python3.11 with no server-side support + # For example, see https://github.com/snowflakedb/snowpark-python/pull/681 + shell: bash + - name: Run Snowpark pandas API tests (excluding doctests) + run: python -m tox -e "modin_pandas_version-py${PYTHON_VERSION/\./}-snowparkpandasdailynotdoctest-modin-ci" + env: + MODIN_PANDAS_PATCH_VERSION: ${{ matrix.pandas-version }} + PYTHON_VERSION: ${{ matrix.python-version }} + cloud_provider: ${{ matrix.cloud-provider }} + PYTEST_ADDOPTS: --color=yes --tb=short + TOX_PARALLEL_NO_SPINNER: 1 + shell: bash + test-disable-sql-simplifier: # Will be removed after sql simplifier is stable and no option to opt out. 
name: Test Disable SQL Simplifier modin-${{ matrix.os.download_name }}-${{ matrix.python-version }}-${{ matrix.cloud-provider }} needs: build diff --git a/setup.py b/setup.py index 5f3272f03a..81f937c007 100644 --- a/setup.py +++ b/setup.py @@ -10,9 +10,7 @@ THIS_DIR = os.path.dirname(os.path.realpath(__file__)) SRC_DIR = os.path.join(THIS_DIR, "src") SNOWPARK_SRC_DIR = os.path.join(SRC_DIR, "snowflake", "snowpark") -MODIN_DEPENDENCY_VERSION = ( - "==0.30.1" # Snowpark pandas requires modin 0.28.1, which depends on pandas 2.2.1 -) +MODIN_DEPENDENCY_VERSION = "==0.30.1" # Snowpark pandas requires modin 0.30.1, which is compatible with pandas 2.2.x CONNECTOR_DEPENDENCY_VERSION = ">=3.10.0, <4.0.0" INSTALL_REQ_LIST = [ "setuptools>=40.6.0", diff --git a/tox.ini b/tox.ini index 111e344ac7..d98533795e 100644 --- a/tox.ini +++ b/tox.ini @@ -42,6 +42,7 @@ deps = .[development] .[opentelemetry] {env:SNOWFLAKE_PYTEST_MODIN_DEPS} + {env:SNOWFLAKE_PYTEST_PANDAS_DEPS} install_command = bash ./scripts/tox_install_cmd.sh {opts} {packages} setenv = COVERAGE_FILE = {env:COVERAGE_FILE:{toxworkdir}/.coverage.{envname}} @@ -66,6 +67,7 @@ setenv = # This configures the extra dependency required by modin test modin: SNOWFLAKE_PYTEST_MODIN_DEPS = [modin-development] SNOW_1314507_WORKAROUND_RERUN_FLAGS = --reruns 5 --reruns-delay 1 --only-rerun "Insufficient resource during interleaved execution." + modin_pandas_version: SNOWFLAKE_PYTEST_PANDAS_DEPS = pandas=={env:MODIN_PANDAS_PATCH_VERSION} MODIN_PYTEST_CMD = pytest {env:SNOWFLAKE_PYTEST_VERBOSITY:} {env:SNOWFLAKE_PYTEST_PARALLELISM:} {env:SNOWFLAKE_PYTEST_COV_CMD} --ignore=tests/resources MODIN_PYTEST_DAILY_CMD = pytest {env:SNOWFLAKE_PYTEST_VERBOSITY:} {env:SNOWFLAKE_PYTEST_DAILY_PARALLELISM:} {env:SNOWFLAKE_PYTEST_COV_CMD} --ignore=tests/resources MODIN_PYTEST_NO_COV_CMD = pytest {env:SNOWFLAKE_PYTEST_VERBOSITY:} {env:SNOWFLAKE_PYTEST_PARALLELISM:} --ignore=tests/resources From 900d4fd851de06d0772e0e7ffd003c028ef388a4 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Mon, 19 Aug 2024 11:37:02 -0700 Subject: [PATCH 06/26] try to add updated sproc dependencies --- tests/integ/modin/test_modin_stored_procedures.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integ/modin/test_modin_stored_procedures.py b/tests/integ/modin/test_modin_stored_procedures.py index ec6d6c2e58..4f2ae12b24 100644 --- a/tests/integ/modin/test_modin_stored_procedures.py +++ b/tests/integ/modin/test_modin_stored_procedures.py @@ -8,14 +8,15 @@ from snowflake.snowpark import Session from snowflake.snowpark.functions import sproc from snowflake.snowpark.modin.plugin import ( + actual_pandas_version, supported_modin_version, - supported_pandas_version, ) from tests.integ.utils.sql_counter import sql_count_checker from tests.utils import multithreaded_run PACKAGE_LIST = [ - f"pandas=={supported_pandas_version}", + # modin 0.30.1 supports any pandas 2.2.x, so just pick whichever one is installed in the client + f"pandas=={actual_pandas_version}", f"modin=={supported_modin_version}", "snowflake-snowpark-python", "numpy", From 4ab25c922f5f531948670b74f8da0119eb7bbc33 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Tue, 1 Oct 2024 14:48:44 -0700 Subject: [PATCH 07/26] add 2.2.3 to matrix --- .github/workflows/daily_modin_precommit.yml | 2 +- CHANGELOG.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily_modin_precommit.yml b/.github/workflows/daily_modin_precommit.yml index 9fa83d2234..3059ea0d3a 100644 --- 
a/.github/workflows/daily_modin_precommit.yml +++ b/.github/workflows/daily_modin_precommit.yml @@ -183,7 +183,7 @@ jobs: os: - image_name: ubuntu-latest-64-cores download_name: linux - pandas-version: ["2.2.1", "2.2.2"] + pandas-version: ["2.2.1", "2.2.2", "2.2.3"] python-version: ["3.9"] cloud-provider: [aws] steps: diff --git a/CHANGELOG.md b/CHANGELOG.md index 8076dc8232..d794cbc810 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -94,7 +94,7 @@ ### Dependency Updates - Updated `modin` from 0.28.1 to 0.30.1. -- Added support for `pandas` 2.2.2. +- Added support for `pandas` 2.2.2 and 2.2.3. #### New Features From 00de3afc04b9796208a705dfdb45c89de83b843e Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Tue, 1 Oct 2024 15:38:41 -0700 Subject: [PATCH 08/26] revert series/test_loc and query compiler --- .../compiler/snowflake_query_compiler.py | 110 ++++++++---------- tests/integ/modin/series/test_loc.py | 38 +++--- 2 files changed, 61 insertions(+), 87 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index a6442fa353..8483a41458 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -1097,11 +1097,9 @@ def find_snowflake_quoted_identifier(pandas_columns: list[str]) -> list[str]: ordered_dataframe = ordered_dataframe.select( [row_position_snowflake_quoted_identifier] + [ - ( - old_identifier - if old_identifier == new_identifier - else col(old_identifier).as_(new_identifier) - ) + old_identifier + if old_identifier == new_identifier + else col(old_identifier).as_(new_identifier) for old_identifier, new_identifier in zip( snowflake_quoted_identifiers_to_be_selected, snowflake_quoted_identifiers_to_be_renamed, @@ -6052,6 +6050,7 @@ def get_dummies( dtype: Optional[npt.DTypeLike] = None, is_series: bool = False, ) -> "SnowflakeQueryCompiler": + """ Implement one-hot encoding. 
Args: @@ -6122,7 +6121,7 @@ def get_dummies( prefix_sep = "_" query_compiler = self - for pandas_column_name, column_prefix in zip(columns, prefix): + for (pandas_column_name, column_prefix) in zip(columns, prefix): query_compiler = query_compiler._get_dummies_helper( pandas_column_name, column_prefix, @@ -8213,11 +8212,9 @@ def _apply_with_udtf_and_dynamic_pivot_along_axis_1( row_position_snowflake_quoted_identifier, *[ # casting if return type is specified - ( - col(old_quoted_identifier).cast(return_type).as_(quoted_identifier) - if not return_variant - else col(old_quoted_identifier).as_(quoted_identifier) - ) + col(old_quoted_identifier).cast(return_type).as_(quoted_identifier) + if not return_variant + else col(old_quoted_identifier).as_(quoted_identifier) for old_quoted_identifier, quoted_identifier in zip( data_column_snowflake_quoted_identifiers, renamed_data_column_snowflake_quoted_identifiers, @@ -9658,11 +9655,9 @@ def take_2d_labels( get_frame_by_col_label( get_frame_by_row_label( internal_frame=self._modin_frame, - key=( - index._modin_frame - if isinstance(index, SnowflakeQueryCompiler) - else index - ), + key=index._modin_frame + if isinstance(index, SnowflakeQueryCompiler) + else index, ), columns, ) @@ -10144,15 +10139,13 @@ def set_2d_labels( result_frame = set_frame_2d_labels( internal_frame=self._modin_frame, - index=( - index._modin_frame - if isinstance(index, SnowflakeQueryCompiler) - else index - ), + index=index._modin_frame + if isinstance(index, SnowflakeQueryCompiler) + else index, columns=columns, - item=( - item._modin_frame if isinstance(item, SnowflakeQueryCompiler) else item - ), + item=item._modin_frame + if isinstance(item, SnowflakeQueryCompiler) + else item, matching_item_columns_by_label=matching_item_columns_by_label, matching_item_rows_by_label=matching_item_rows_by_label, index_is_bool_indexer=index_is_bool_indexer, @@ -13260,11 +13253,9 @@ def _quantiles_single_col( col_identifier, index_identifier, new_identifiers, - col_mapper=( - dict(zip(new_identifiers, index)) - if index is not None - else dict(zip(new_identifiers, new_labels)) - ), + col_mapper=dict(zip(new_identifiers, index)) + if index is not None + else dict(zip(new_identifiers, new_labels)), ) col_after_null_replace_identifier = ( ordered_dataframe.generate_snowflake_quoted_identifiers( @@ -18205,24 +18196,20 @@ def pct_change( { quoted_identifier: # If periods=0, we don't need to do any window computation - ( - iff( - is_null(col(quoted_identifier)), - pandas_lit(None, FloatType()), - pandas_lit(0), - ) - if periods == 0 - else ( - col(quoted_identifier) - / lag(quoted_identifier, offset=periods).over( - Window.orderBy( - col( - frame.row_position_snowflake_quoted_identifier - ) - ) + iff( + is_null(col(quoted_identifier)), + pandas_lit(None, FloatType()), + pandas_lit(0), + ) + if periods == 0 + else ( + col(quoted_identifier) + / lag(quoted_identifier, offset=periods).over( + Window.orderBy( + col(frame.row_position_snowflake_quoted_identifier) ) - - 1 ) + - 1 ) for quoted_identifier in frame.data_column_snowflake_quoted_identifiers } @@ -18235,24 +18222,21 @@ def pct_change( { quoted_identifier: # If periods=0, we don't need to do any computation - ( - iff( - is_null(col(quoted_identifier)), - pandas_lit(None, FloatType()), - pandas_lit(0), - ) - if periods == 0 - else ( - # If periods>0, the first few columns will be NULL - # If periods<0, the last few columns will be NULL - pandas_lit(None, FloatType()) - if i - periods < 0 - or i - periods >= len(quoted_identifiers) - # For the 
remaining columns, if periods=n, we compare column i to column i+n - else col(quoted_identifier) - / col(quoted_identifiers[i - periods]) - - 1 - ) + iff( + is_null(col(quoted_identifier)), + pandas_lit(None, FloatType()), + pandas_lit(0), + ) + if periods == 0 + else ( + # If periods>0, the first few columns will be NULL + # If periods<0, the last few columns will be NULL + pandas_lit(None, FloatType()) + if i - periods < 0 or i - periods >= len(quoted_identifiers) + # For the remaining columns, if periods=n, we compare column i to column i+n + else col(quoted_identifier) + / col(quoted_identifiers[i - periods]) + - 1 ) for i, quoted_identifier in enumerate(quoted_identifiers) } diff --git a/tests/integ/modin/series/test_loc.py b/tests/integ/modin/series/test_loc.py index 5461b3faa6..3826282e0d 100644 --- a/tests/integ/modin/series/test_loc.py +++ b/tests/integ/modin/series/test_loc.py @@ -142,11 +142,9 @@ def test_series_loc_get_negative_snowpark_pandas_input( eval_snowpark_pandas_result( str_index_snowpark_pandas_series, str_index_native_series, - lambda df: ( - df.loc[negative_loc_snowpark_pandas_input_map[key][0]] - if isinstance(df, Series) - else df.loc[negative_loc_snowpark_pandas_input_map[key][1]] - ), + lambda df: df.loc[negative_loc_snowpark_pandas_input_map[key][0]] + if isinstance(df, Series) + else df.loc[negative_loc_snowpark_pandas_input_map[key][1]], expect_exception=True, ) @@ -238,11 +236,9 @@ def test_series_loc_get_key_bool_series_with_aligned_indices(key, use_default_in eval_snowpark_pandas_result( snow_series, native_series, - lambda s: ( - s.loc[pd.Series(key, index=index, dtype="bool")] - if isinstance(s, pd.Series) - else s.loc[native_pd.Series(key, index=index, dtype="bool")] - ), + lambda s: s.loc[pd.Series(key, index=index, dtype="bool")] + if isinstance(s, pd.Series) + else s.loc[native_pd.Series(key, index=index, dtype="bool")], ) @@ -276,11 +272,9 @@ def test_series_loc_get_key_bool_series_with_unaligned_and_distinct_indices( eval_snowpark_pandas_result( snow_series, native_series, - lambda s: ( - s.loc[pd.Series(key, index=key_index, dtype="bool")] - if isinstance(s, pd.Series) - else s.loc[native_pd.Series(key, index=native_key_index, dtype="bool")] - ), + lambda s: s.loc[pd.Series(key, index=key_index, dtype="bool")] + if isinstance(s, pd.Series) + else s.loc[native_pd.Series(key, index=native_key_index, dtype="bool")], ) @@ -422,11 +416,9 @@ def test_series_loc_get_key_bool_series_with_mismatch_index_len(key, use_default eval_snowpark_pandas_result( snow_series, native_series, - lambda df: ( - df.loc[series_key] - if isinstance(df, pd.DataFrame) - else df.loc[native_series_key] - ), + lambda df: df.loc[series_key] + if isinstance(df, pd.DataFrame) + else df.loc[native_series_key], ) @@ -1445,7 +1437,7 @@ def test_series_loc_set_boolean_key(key, index): assert_series_equal(snow_ser, native_ser, check_dtype=False) -@sql_count_checker(query_count=2, join_count=0) +@sql_count_checker(query_count=0, join_count=0) @pytest.mark.parametrize("item", [1.2, None, ["a", "b", "c"]]) def test_series_loc_set_df_key_negative(item, default_index_native_series): # series.loc[df key] = any item @@ -1462,9 +1454,7 @@ def test_series_loc_set_df_key_negative(item, default_index_native_series): native_ser.loc[df_key] = item # Snowpark pandas error verification. 
- err_msg = re.escape( - "Data must be 1-dimensional, got ndarray of shape (1, 2) instead" - ) + err_msg = "Data cannot be a DataFrame" with pytest.raises(ValueError, match=err_msg): snowpark_ser.loc[pd.DataFrame(df_key)] = item assert_series_equal(snowpark_ser, native_ser) From d5cff740c975cac493fd4a5ea781e888a7c56826 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Wed, 2 Oct 2024 16:49:40 -0700 Subject: [PATCH 09/26] fix drop_duplicates and unique --- .../snowpark/modin/plugin/__init__.py | 20 +++++++++++-- .../modin/plugin/extensions/base_overrides.py | 30 +++++++++++++++++++ tests/integ/test_df_to_snowpark_pandas.py | 28 ++++++++--------- 3 files changed, 61 insertions(+), 17 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/__init__.py b/src/snowflake/snowpark/modin/plugin/__init__.py index 381cc8b2ab..c5ea012a25 100644 --- a/src/snowflake/snowpark/modin/plugin/__init__.py +++ b/src/snowflake/snowpark/modin/plugin/__init__.py @@ -141,6 +141,7 @@ register_pd_accessor, register_series_accessor, ) +from modin.pandas.accessor import ModinAPI # isort: skip # noqa: E402,F401 from snowflake.snowpark.modin.plugin._internal.telemetry import ( # isort: skip # noqa: E402,F401 TELEMETRY_PRIVATE_METHODS, @@ -148,10 +149,22 @@ try_add_telemetry_to_attribute, ) +# Add telemetry on the ModinAPI accessor object +for attr_name in dir(ModinAPI): + if not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS: + setattr( + ModinAPI, + attr_name, + try_add_telemetry_to_attribute(attr_name, getattr(ModinAPI, attr_name)), + ) + for attr_name in dir(Series): # Since Series is defined in upstream Modin, all of its members were either defined upstream # or overridden by extension. - if not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS: + # Skip the `modin` accessor object, since we apply telemetry to all its fields. + if attr_name != "modin" and ( + not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS + ): register_series_accessor(attr_name)( try_add_telemetry_to_attribute(attr_name, getattr(Series, attr_name)) ) @@ -159,7 +172,10 @@ for attr_name in dir(DataFrame): # Since DataFrame is defined in upstream Modin, all of its members were either defined upstream # or overridden by extension. - if not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS: + # Skip the `modin` accessor object, since we apply telemetry to all its fields. + if attr_name != "modin" and ( + not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS + ): register_dataframe_accessor(attr_name)( try_add_telemetry_to_attribute(attr_name, getattr(DataFrame, attr_name)) ) diff --git a/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py index e2633f7eeb..e26986270e 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py @@ -1552,6 +1552,36 @@ def __getitem__(self, key): return self.loc[:, key] +# Modin uses the unique() query compiler method as of 0.30.1. +@register_base_override("drop_duplicates") +def drop_duplicates( + self, keep="first", inplace=False, **kwargs +): # noqa: PR01, RT01, D200 + """ + Return `BasePandasDataset` with duplicate rows removed. 
+ """ + inplace = validate_bool_kwarg(inplace, "inplace") + ignore_index = kwargs.get("ignore_index", False) + subset = kwargs.get("subset", None) + if subset is not None: + if is_list_like(subset): + if not isinstance(subset, list): + subset = list(subset) + else: + subset = [subset] + df = self[subset] + else: + df = self + duplicated = df.duplicated(keep=keep) + result = self[~duplicated] + if ignore_index: + result.index = pandas.RangeIndex(stop=len(result)) + if inplace: + self._update_inplace(result._query_compiler) + else: + return result + + # Snowpark pandas does extra argument validation, which may need to be upstreamed. @register_base_override("sort_values") def sort_values( diff --git a/tests/integ/test_df_to_snowpark_pandas.py b/tests/integ/test_df_to_snowpark_pandas.py index ede9b10e85..9e5ad8a7f0 100644 --- a/tests/integ/test_df_to_snowpark_pandas.py +++ b/tests/integ/test_df_to_snowpark_pandas.py @@ -43,21 +43,19 @@ def test_to_snowpark_pandas_no_modin(session, tmp_table_basic): try: import modin # noqa: F401 except ModuleNotFoundError: - # Current Snowpark Python installs pandas==2.2.2, but Snowpark pandas depends on modin - # 0.28.1, which needs pandas==2.2.1. The pandas version check is currently performed - # before Snowpark pandas checks whether modin is installed. - # TODO: SNOW-1552497: after upgrading to modin 0.30.1, Snowpark pandas will support - # all pandas 2.2.x, and this function call will raise a ModuleNotFoundError since - # modin is not installed. - match = ( - "Snowpark pandas does not support Python 3.8. Please update to Python 3.9 or later" - if sys.version_info.major == 3 and sys.version_info.minor == 8 - else "does not match the supported pandas version in Snowpark pandas" - ) - with pytest.raises( - RuntimeError, - match=match, - ): + if sys.version_info.major == 3 and sys.version_info.minor == 8: + # Snowpark pandas does not support Python 3.8 + ctx = pytest.raises( + RuntimeError, + match="Snowpark pandas does not support Python 3.8. 
Please update to Python 3.9 or later", + ) + else: + # This function call will raise a ModuleNotFoundError since modin is not installed + ctx = pytest.raises( + ModuleNotFoundError, + match="Modin is not installed.", + ) + with ctx: snowpark_df.to_snowpark_pandas() else: snowpark_df.to_snowpark_pandas() # should have no errors From 7cf2d3424725b999aa037cd619468a3c13646292 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 3 Oct 2024 13:38:14 -0700 Subject: [PATCH 10/26] pin pandas version --- .github/workflows/daily_modin_precommit.yml | 4 ++-- .github/workflows/precommit.yml | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/daily_modin_precommit.yml b/.github/workflows/daily_modin_precommit.yml index 3059ea0d3a..df9af37d89 100644 --- a/.github/workflows/daily_modin_precommit.yml +++ b/.github/workflows/daily_modin_precommit.yml @@ -174,7 +174,7 @@ jobs: .tox/coverage.xml test-pandas-patch-versions: - name: Test Snowpark pandas with pandas ${{ matrix.python-version }} + name: Test Snowpark pandas with pandas ${{ matrix.pandas-version }} needs: build runs-on: ${{ matrix.os.image_name }} strategy: @@ -183,7 +183,7 @@ jobs: os: - image_name: ubuntu-latest-64-cores download_name: linux - pandas-version: ["2.2.1", "2.2.2", "2.2.3"] + pandas-version: ["2.2.1", "2.2.2"] # TODO include 2.2.3 once it's available in snowflake anaconda python-version: ["3.9"] cloud-provider: [aws] steps: diff --git a/.github/workflows/precommit.yml b/.github/workflows/precommit.yml index 40776bf984..8e5c0a406a 100644 --- a/.github/workflows/precommit.yml +++ b/.github/workflows/precommit.yml @@ -382,8 +382,9 @@ jobs: # only run doctest for macos on aws - if: ${{ matrix.os == 'macos-latest' && matrix.cloud-provider == 'aws' }} name: Run Snowpark pandas API doctests - run: python -m tox -e "py${PYTHON_VERSION}-doctest-snowparkpandasdoctest-modin-ci" + run: python -m tox -e "modin_pandas_version-py${PYTHON_VERSION}-doctest-snowparkpandasdoctest-modin-ci" env: + MODIN_PANDAS_PATCH_VERSION: 2.2.2 # TODO unpin when 2.2.3 is available in snowflake anaconda PYTHON_VERSION: ${{ matrix.python-version }} cloud_provider: ${{ matrix.cloud-provider }} PYTEST_ADDOPTS: --color=yes --tb=short @@ -394,8 +395,9 @@ jobs: # do not run other tests for macos on aws - if: ${{ !(matrix.os == 'macos-latest' && matrix.cloud-provider == 'aws') }} name: Run Snowpark pandas API tests (excluding doctests) - run: python -m tox -e "py${PYTHON_VERSION/\./}-snowparkpandasnotdoctest-modin-ci" + run: python -m tox -e "modin_pandas_version-py${PYTHON_VERSION/\./}-snowparkpandasnotdoctest-modin-ci" env: + MODIN_PANDAS_PATCH_VERSION: 2.2.2 # TODO unpin when 2.2.3 is available in snowflake anaconda PYTHON_VERSION: ${{ matrix.python-version }} cloud_provider: ${{ matrix.cloud-provider }} PYTEST_ADDOPTS: --color=yes --tb=short From f64e5f94837bbc43cfdd2935c3bcd1d15f151df2 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Fri, 4 Oct 2024 13:22:10 -0700 Subject: [PATCH 11/26] override series.unique --- .../modin/plugin/extensions/series_overrides.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py index 0c531cc4f5..ce9266d0e1 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/series_overrides.py @@ -24,6 +24,7 @@ from pandas._typing import ( AggFuncType, AnyArrayLike, + ArrayLike, Axis, 
FillnaOptions, IgnoreRaise, @@ -1596,6 +1597,19 @@ def rename( return self_cp +# In some cases, modin after 0.30.1 returns a DatetimeArray instead of a numpy array. This +# still differs from the expected pandas behavior, which would return DatetimeIndex +# (see SNOW-1019312). +@register_series_accessor("unique") +def unique(self) -> ArrayLike: # noqa: RT01, D200 + """ + Return unique values of Series object. + """ + # `values` can't be used here because it performs unnecessary conversion, + # after which the result type does not match the pandas + return self.__constructor__(query_compiler=self._query_compiler.unique()).to_numpy() + + # Modin defaults to pandas for some arguments for unstack @register_series_accessor("unstack") def unstack( From b45d1f9a30e1f0803a5f96633454f2452dec90f1 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Mon, 7 Oct 2024 16:12:32 -0700 Subject: [PATCH 12/26] branch error messages on pandas version --- tests/integ/modin/frame/test_apply_axis_0.py | 12 +++++++++--- .../index/test_df_series_creation_with_index.py | 14 +++++++++----- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/tests/integ/modin/frame/test_apply_axis_0.py b/tests/integ/modin/frame/test_apply_axis_0.py index 2edafc6b83..33db3c2220 100644 --- a/tests/integ/modin/frame/test_apply_axis_0.py +++ b/tests/integ/modin/frame/test_apply_axis_0.py @@ -3,11 +3,13 @@ # import datetime +import re import modin.pandas as pd import numpy as np import pandas as native_pd import pytest +from packaging.version import Version from pytest import param import snowflake.snowpark.modin.plugin # noqa: F401 @@ -220,9 +222,13 @@ def test_axis_0_return_dataframe_not_supported(): # Note that pands returns failure "ValueError: If using all scalar values, you must pass an index" which # doesn't explain this isn't supported. We go with the default returned by pandas in this case. - with pytest.raises( - SnowparkSQLException, match="The truth value of a DataFrame is ambiguous." - ): + if Version(native_pd.__version__) > Version("2.2.1"): + expected_message = re.escape( + "Data must be 1-dimensional, got ndarray of shape (2, 1) instead" + ) + else: + expected_message = "The truth value of a DataFrame is ambiguous." + with pytest.raises(SnowparkSQLException, match=expected_message): # return value snow_df.apply(lambda x: native_pd.DataFrame([1, 2]), axis=0).to_pandas() diff --git a/tests/integ/modin/index/test_df_series_creation_with_index.py b/tests/integ/modin/index/test_df_series_creation_with_index.py index 105d6d1552..5ff3d8b3d3 100644 --- a/tests/integ/modin/index/test_df_series_creation_with_index.py +++ b/tests/integ/modin/index/test_df_series_creation_with_index.py @@ -7,6 +7,7 @@ import numpy as np import pandas as native_pd import pytest +from packaging.version import Version import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.utils import assert_frame_equal, assert_series_equal @@ -1337,13 +1338,16 @@ def test_create_series_with_df_index_negative(): @sql_count_checker(query_count=0) def test_create_series_with_df_data_negative(): - with pytest.raises( - ValueError, - match=re.escape( + if Version(native_pd.__version__) > Version("2.2.1"): + expected_message = re.escape( + "Data must be 1-dimensional, got ndarray of shape (3, 2) instead" + ) + else: + expected_message = re.escape( "The truth value of a DataFrame is ambiguous. Use a.empty, a.bool()" ", a.item(), a.any() or a.all()." 
- ), - ): + ) + with pytest.raises(ValueError, match=expected_message): native_pd.Series(native_pd.DataFrame([[1, 2], [3, 4], [5, 6]])) with pytest.raises(ValueError, match="Data cannot be a DataFrame"): pd.Series(pd.DataFrame([[1, 2], [3, 4], [5, 6]])) From 2da165be3247ff831af941827b0ec809c207a10c Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Wed, 9 Oct 2024 15:13:34 -0700 Subject: [PATCH 13/26] try zip import --- .../modin/test_modin_stored_procedures.py | 58 +++++++++++++------ 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/tests/integ/modin/test_modin_stored_procedures.py b/tests/integ/modin/test_modin_stored_procedures.py index 4f2ae12b24..18cbe888da 100644 --- a/tests/integ/modin/test_modin_stored_procedures.py +++ b/tests/integ/modin/test_modin_stored_procedures.py @@ -3,6 +3,8 @@ # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. # +import os + import modin.pandas as pd from snowflake.snowpark import Session @@ -22,10 +24,30 @@ "numpy", ] +# Snowpark pandas strictly pins the modin dependency version, so while testing a dependency upgrade, +# we need to upload snowflake-snowpark-python as a zip file. Otherwise, the conda package solver +# will resolve snowflake-snowpark-python==1.16.0, the newest version which does not pin a modin +# version. +# We still specify snowflake-snowpark-python in the package list to prevent the sproc registration +# code from failing in the solver step; the import here will override whatever version is chosen. +IMPORT_LIST = [ + # The current path of this file is `tests/modin/integ/test_modin_stored_procedures.py`, so we need + # to go back to the repository root to reach `src/snowflake/snowpark/`. + ( + os.path.join( + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + ), + "src/snowflake/snowpark", + ), + "snowflake.snowpark", + ), +] + @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_head(session): - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def run(session_: Session) -> str: df = pd.DataFrame( [["a", 2.1, 1], ["b", 4.2, 2], ["c", 6.3, None]], @@ -42,7 +64,7 @@ def run(session_: Session) -> str: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_dropna(session): - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def run(session_: Session) -> int: default_index_snowpark_pandas_df = pd.DataFrame( [["a", 2.1, 1], ["b", None, 2], ["c", 6.3, None]], @@ -57,7 +79,7 @@ def run(session_: Session) -> int: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_idx(session): - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def run(session_: Session) -> str: df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) df_result = df["a"] @@ -68,7 +90,7 @@ def run(session_: Session) -> str: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_loc(session): - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def run(session_: Session) -> str: df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) df_result = df.loc[df["a"] > 2] @@ -79,7 +101,7 @@ def run(session_: Session) -> str: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_iloc(session): - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def run(session_: Session) -> str: df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) df_result = df.iloc[0, 1] @@ -90,7 +112,7 @@ def run(session_: Session) -> 
str: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_missing_val(session): - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def run(session_: Session) -> int: import numpy as np @@ -111,7 +133,7 @@ def run(session_: Session) -> int: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_type_conv(session): - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def run(session_: Session) -> str: df = pd.DataFrame({"int": [1, 2, 3], "str": ["4", "5", "6"]}) df_result = df.astype(float)["int"].iloc[0] @@ -122,14 +144,14 @@ def run(session_: Session) -> str: @sql_count_checker(query_count=8, sproc_count=2) def test_sproc_binary_ops(session): - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def add(session_: Session) -> str: df_1 = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) df_2 = pd.DataFrame([[6, 7, 8]]) df_result = df_1.add(df_2) return str(df_result) - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def plus(session_: Session) -> str: s1 = pd.Series([1, 2, 3]) s2 = pd.Series([2, 2, 2]) @@ -143,7 +165,7 @@ def plus(session_: Session) -> str: @multithreaded_run() @sql_count_checker(query_count=8, sproc_count=2) def test_sproc_agg(session): - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def run_agg(session_: Session) -> str: import numpy as np @@ -154,7 +176,7 @@ def run_agg(session_: Session) -> str: df_result = df.agg(["sum", "min"]) return str(df_result) - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def run_median(session_: Session) -> str: import numpy as np @@ -174,7 +196,7 @@ def run_median(session_: Session) -> str: @sql_count_checker(query_count=8, sproc_count=2) def test_sproc_merge(session): - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def run_merge(session_: Session) -> str: df1 = pd.DataFrame( {"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]} @@ -185,7 +207,7 @@ def run_merge(session_: Session) -> str: df_result = df1.merge(df2, left_on="lkey", right_on="rkey") return str(df_result["value_x"]) - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def run_join(session_: Session) -> str: df = pd.DataFrame( { @@ -209,7 +231,7 @@ def run_join(session_: Session) -> str: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_groupby(session): - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def run(session_: Session) -> str: df = pd.DataFrame( { @@ -228,7 +250,7 @@ def run(session_: Session) -> str: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_pivot(session): - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def run(session_: Session) -> str: df = pd.DataFrame( { @@ -262,7 +284,7 @@ def run(session_: Session) -> str: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_apply(session): - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def run(session_: Session) -> str: import numpy as np @@ -275,7 +297,7 @@ def run(session_: Session) -> str: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_applymap(session): - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def run(session_: Session) -> str: df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) df_result = df.applymap(lambda x: len(str(x))) @@ 
-286,7 +308,7 @@ def run(session_: Session) -> str: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_devguide_example(session): - @sproc(packages=PACKAGE_LIST) + @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) def run(session_: Session) -> int: # Create a Snowpark Pandas DataFrame with sample data. df = pd.DataFrame( From fd71a7419ec9b39ee49e7e277d6c25580f734340 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Wed, 9 Oct 2024 16:51:11 -0700 Subject: [PATCH 14/26] temporarily xfail map/applymap --- tests/integ/modin/test_modin_stored_procedures.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/integ/modin/test_modin_stored_procedures.py b/tests/integ/modin/test_modin_stored_procedures.py index 18cbe888da..b0e65dd57b 100644 --- a/tests/integ/modin/test_modin_stored_procedures.py +++ b/tests/integ/modin/test_modin_stored_procedures.py @@ -6,6 +6,7 @@ import os import modin.pandas as pd +import pytest from snowflake.snowpark import Session from snowflake.snowpark.functions import sproc @@ -282,6 +283,8 @@ def run(session_: Session) -> str: ) +# TODO add ticket to debug +@pytest.mark.xfail(strict=True) @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_apply(session): @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) @@ -295,6 +298,8 @@ def run(session_: Session) -> str: assert run() == "0 2\n1 10\n2 13\ndtype: int64" +# TODO add ticket to debug +@pytest.mark.xfail(strict=True) @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_applymap(session): @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) From 86a996dd9cb65eff42e6f6819bf5084057fab81b Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 10 Oct 2024 16:18:06 -0700 Subject: [PATCH 15/26] pragma no cover --- .../snowpark/modin/plugin/extensions/base_overrides.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py index e26986270e..2e95c1b571 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py @@ -1552,7 +1552,7 @@ def __getitem__(self, key): return self.loc[:, key] -# Modin uses the unique() query compiler method as of 0.30.1. +# Modin uses the unique() query compiler method instead of aliasing the duplicated frontend method as of 0.30.1. @register_base_override("drop_duplicates") def drop_duplicates( self, keep="first", inplace=False, **kwargs @@ -1566,7 +1566,7 @@ def drop_duplicates( if subset is not None: if is_list_like(subset): if not isinstance(subset, list): - subset = list(subset) + subset = list(subset) # pragma: no cover else: subset = [subset] df = self[subset] @@ -1577,7 +1577,7 @@ def drop_duplicates( if ignore_index: result.index = pandas.RangeIndex(stop=len(result)) if inplace: - self._update_inplace(result._query_compiler) + self._update_inplace(result._query_compiler) # pragma: no cover else: return result From 7ff0c6c5e4abd66bde70b076aafe7c414b0b4f25 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Thu, 10 Oct 2024 17:02:06 -0700 Subject: [PATCH 16/26] move changelog entry --- CHANGELOG.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d794cbc810..d9151a1a2d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,11 @@ ### Snowpark pandas API Updates +### Dependency Updates + +- Updated `modin` from 0.28.1 to 0.30.1. 
+- Added support for `pandas` 2.2.2 and 2.2.3. + #### New Features - Added support for `np.subtract`, `np.multiply`, `np.divide`, and `np.true_divide`. @@ -91,11 +96,6 @@ ### Snowpark pandas API Updates -### Dependency Updates - -- Updated `modin` from 0.28.1 to 0.30.1. -- Added support for `pandas` 2.2.2 and 2.2.3. - #### New Features - Added support for `TimedeltaIndex.mean` method. From df19140bf38a94c2a6e058fd0c64c995c7ecd391 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Fri, 11 Oct 2024 16:36:02 -0700 Subject: [PATCH 17/26] update changelog wording --- .github/workflows/daily_modin_precommit.yml | 6 ++++-- CHANGELOG.md | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/daily_modin_precommit.yml b/.github/workflows/daily_modin_precommit.yml index df9af37d89..e0e0e60cee 100644 --- a/.github/workflows/daily_modin_precommit.yml +++ b/.github/workflows/daily_modin_precommit.yml @@ -143,9 +143,10 @@ jobs: run: python -m pip install tox - if: ${{ contains('macos', matrix.os.download_name) }} name: Run Snowpark pandas API doctests - run: python -m tox -e "py${PYTHON_VERSION}-doctest-snowparkpandasdoctest-modin-ci" + run: python -m tox -e "modin_pandas_version-py${PYTHON_VERSION}-doctest-snowparkpandasdoctest-modin-ci" env: PYTHON_VERSION: ${{ matrix.python-version }} + MODIN_PANDAS_PATCH_VERSION: 2.2.2 # TODO unpin when 2.2.3 is available in snowflake anaconda cloud_provider: ${{ matrix.cloud-provider }} PYTEST_ADDOPTS: --color=yes --tb=short TOX_PARALLEL_NO_SPINNER: 1 @@ -153,9 +154,10 @@ jobs: # For example, see https://github.com/snowflakedb/snowpark-python/pull/681 shell: bash - name: Run Snowpark pandas API tests (excluding doctests) - run: python -m tox -e "py${PYTHON_VERSION/\./}-snowparkpandasdailynotdoctest-modin-ci" + run: python -m tox -e "modin_pandas_version-py${PYTHON_VERSION/\./}-snowparkpandasdailynotdoctest-modin-ci" env: PYTHON_VERSION: ${{ matrix.python-version }} + MODIN_PANDAS_PATCH_VERSION: 2.2.2 # TODO unpin when 2.2.3 is available in snowflake anaconda cloud_provider: ${{ matrix.cloud-provider }} PYTEST_ADDOPTS: --color=yes --tb=short TOX_PARALLEL_NO_SPINNER: 1 diff --git a/CHANGELOG.md b/CHANGELOG.md index d9151a1a2d..64582e1f77 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,7 +30,7 @@ ### Dependency Updates - Updated `modin` from 0.28.1 to 0.30.1. -- Added support for `pandas` 2.2.2 and 2.2.3. +- Added support for all `pandas` 2.2.x versions. 
#### New Features From 06c552afd0aeda584b68e154ca9f38d2838b535d Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Tue, 15 Oct 2024 16:23:58 -0700 Subject: [PATCH 18/26] xfail udf tests with 2.2.3 --- .github/workflows/daily_modin_precommit.yml | 8 +++----- .github/workflows/precommit.yml | 6 ++---- tests/integ/modin/frame/test_apply.py | 5 +++++ tests/integ/modin/frame/test_applymap.py | 5 +++++ tests/integ/modin/groupby/test_groupby_apply.py | 5 +++++ tests/integ/modin/groupby/test_groupby_transform.py | 5 +++++ tests/integ/modin/series/test_apply.py | 5 +++++ tests/integ/modin/test_modin_stored_procedures.py | 4 ++-- tests/integ/test_stored_procedure.py | 5 +++++ tox.ini | 2 +- 10 files changed, 38 insertions(+), 12 deletions(-) diff --git a/.github/workflows/daily_modin_precommit.yml b/.github/workflows/daily_modin_precommit.yml index e0e0e60cee..8838f81f5e 100644 --- a/.github/workflows/daily_modin_precommit.yml +++ b/.github/workflows/daily_modin_precommit.yml @@ -143,10 +143,9 @@ jobs: run: python -m pip install tox - if: ${{ contains('macos', matrix.os.download_name) }} name: Run Snowpark pandas API doctests - run: python -m tox -e "modin_pandas_version-py${PYTHON_VERSION}-doctest-snowparkpandasdoctest-modin-ci" + run: python -m tox -e "py${PYTHON_VERSION}-doctest-snowparkpandasdoctest-modin-ci" env: PYTHON_VERSION: ${{ matrix.python-version }} - MODIN_PANDAS_PATCH_VERSION: 2.2.2 # TODO unpin when 2.2.3 is available in snowflake anaconda cloud_provider: ${{ matrix.cloud-provider }} PYTEST_ADDOPTS: --color=yes --tb=short TOX_PARALLEL_NO_SPINNER: 1 @@ -154,10 +153,9 @@ jobs: # For example, see https://github.com/snowflakedb/snowpark-python/pull/681 shell: bash - name: Run Snowpark pandas API tests (excluding doctests) - run: python -m tox -e "modin_pandas_version-py${PYTHON_VERSION/\./}-snowparkpandasdailynotdoctest-modin-ci" + run: python -m tox -e "py${PYTHON_VERSION/\./}-snowparkpandasdailynotdoctest-modin-ci" env: PYTHON_VERSION: ${{ matrix.python-version }} - MODIN_PANDAS_PATCH_VERSION: 2.2.2 # TODO unpin when 2.2.3 is available in snowflake anaconda cloud_provider: ${{ matrix.cloud-provider }} PYTEST_ADDOPTS: --color=yes --tb=short TOX_PARALLEL_NO_SPINNER: 1 @@ -185,7 +183,7 @@ jobs: os: - image_name: ubuntu-latest-64-cores download_name: linux - pandas-version: ["2.2.1", "2.2.2"] # TODO include 2.2.3 once it's available in snowflake anaconda + pandas-version: ["2.2.1", "2.2.2", "2.2.3"] python-version: ["3.9"] cloud-provider: [aws] steps: diff --git a/.github/workflows/precommit.yml b/.github/workflows/precommit.yml index 8e5c0a406a..40776bf984 100644 --- a/.github/workflows/precommit.yml +++ b/.github/workflows/precommit.yml @@ -382,9 +382,8 @@ jobs: # only run doctest for macos on aws - if: ${{ matrix.os == 'macos-latest' && matrix.cloud-provider == 'aws' }} name: Run Snowpark pandas API doctests - run: python -m tox -e "modin_pandas_version-py${PYTHON_VERSION}-doctest-snowparkpandasdoctest-modin-ci" + run: python -m tox -e "py${PYTHON_VERSION}-doctest-snowparkpandasdoctest-modin-ci" env: - MODIN_PANDAS_PATCH_VERSION: 2.2.2 # TODO unpin when 2.2.3 is available in snowflake anaconda PYTHON_VERSION: ${{ matrix.python-version }} cloud_provider: ${{ matrix.cloud-provider }} PYTEST_ADDOPTS: --color=yes --tb=short @@ -395,9 +394,8 @@ jobs: # do not run other tests for macos on aws - if: ${{ !(matrix.os == 'macos-latest' && matrix.cloud-provider == 'aws') }} name: Run Snowpark pandas API tests (excluding doctests) - run: python -m tox -e 
"modin_pandas_version-py${PYTHON_VERSION/\./}-snowparkpandasnotdoctest-modin-ci" + run: python -m tox -e "py${PYTHON_VERSION/\./}-snowparkpandasnotdoctest-modin-ci" env: - MODIN_PANDAS_PATCH_VERSION: 2.2.2 # TODO unpin when 2.2.3 is available in snowflake anaconda PYTHON_VERSION: ${{ matrix.python-version }} cloud_provider: ${{ matrix.cloud-provider }} PYTEST_ADDOPTS: --color=yes --tb=short diff --git a/tests/integ/modin/frame/test_apply.py b/tests/integ/modin/frame/test_apply.py index e76c3f9e28..ff4e7eaac7 100644 --- a/tests/integ/modin/frame/test_apply.py +++ b/tests/integ/modin/frame/test_apply.py @@ -30,6 +30,11 @@ ) from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker +pytestmark = pytest.mark.skipif( + native_pd.__version__ == "2.2.3", + reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", +) + # TODO SNOW-891796: replace native_pd with pd after allowing using snowpandas module/function in UDF # test data which has a python type as return type that is not a pandas Series/pandas DataFrame/tuple/list diff --git a/tests/integ/modin/frame/test_applymap.py b/tests/integ/modin/frame/test_applymap.py index 91b69c5142..73cdfc1c64 100644 --- a/tests/integ/modin/frame/test_applymap.py +++ b/tests/integ/modin/frame/test_applymap.py @@ -21,6 +21,11 @@ ) from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker +pytestmark = pytest.mark.skipif( + native_pd.__version__ == "2.2.3", + reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", +) + @pytest.mark.parametrize("data,func,return_type", BASIC_DATA_FUNC_RETURN_TYPE_MAP) @sql_count_checker(query_count=7, udf_count=1) diff --git a/tests/integ/modin/groupby/test_groupby_apply.py b/tests/integ/modin/groupby/test_groupby_apply.py index c6c805a0ca..11efb83000 100644 --- a/tests/integ/modin/groupby/test_groupby_apply.py +++ b/tests/integ/modin/groupby/test_groupby_apply.py @@ -27,6 +27,11 @@ ) from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker +pytestmark = pytest.mark.skipif( + native_pd.__version__ == "2.2.3", + reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", +) + # Use the workaround shown below for applying functions that are attributes # of this module. # https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs diff --git a/tests/integ/modin/groupby/test_groupby_transform.py b/tests/integ/modin/groupby/test_groupby_transform.py index 1dbf143de8..dff8f5fb0a 100644 --- a/tests/integ/modin/groupby/test_groupby_transform.py +++ b/tests/integ/modin/groupby/test_groupby_transform.py @@ -15,6 +15,11 @@ ) from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker +pytestmark = pytest.mark.skipif( + native_pd.__version__ == "2.2.3", + reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", +) + def eval_snowpark_pandas_result(*args, **kwargs): # Some calls to the native pandas function propagate attrs while some do not, depending on the values of its arguments. 
diff --git a/tests/integ/modin/series/test_apply.py b/tests/integ/modin/series/test_apply.py index 5ab9ea486f..729e414bd2 100644 --- a/tests/integ/modin/series/test_apply.py +++ b/tests/integ/modin/series/test_apply.py @@ -30,6 +30,11 @@ ) from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker +pytestmark = pytest.mark.skipif( + native_pd.__version__ == "2.2.3", + reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", +) + BASIC_DATA_FUNC_RETURN_TYPE_MAP = [ ([1, 2, 3, None], lambda x: x + 1, "int"), param( diff --git a/tests/integ/modin/test_modin_stored_procedures.py b/tests/integ/modin/test_modin_stored_procedures.py index b0e65dd57b..18b940a5af 100644 --- a/tests/integ/modin/test_modin_stored_procedures.py +++ b/tests/integ/modin/test_modin_stored_procedures.py @@ -283,7 +283,7 @@ def run(session_: Session) -> str: ) -# TODO add ticket to debug +# TODO SNOW-1739042 figure out why apply/applymap UDF doesn't use the correct modin/snowpark version @pytest.mark.xfail(strict=True) @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_apply(session): @@ -298,7 +298,7 @@ def run(session_: Session) -> str: assert run() == "0 2\n1 10\n2 13\ndtype: int64" -# TODO add ticket to debug +# TODO SNOW-1739042 figure out why apply/applymap UDF doesn't use the correct modin/snowpark version @pytest.mark.xfail(strict=True) @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_applymap(session): diff --git a/tests/integ/test_stored_procedure.py b/tests/integ/test_stored_procedure.py index 5cd8c42d93..41b3e87e67 100644 --- a/tests/integ/test_stored_procedure.py +++ b/tests/integ/test_stored_procedure.py @@ -10,6 +10,7 @@ from typing import Dict, List, Optional, Union from unittest.mock import patch +import pandas as native_pd import pytest try: @@ -75,6 +76,10 @@ ) pytestmark = [ + pytest.mark.skipif( + native_pd.__version__ == "2.2.3", + reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", + ), pytest.mark.udf, ] diff --git a/tox.ini b/tox.ini index d98533795e..6021cb96e3 100644 --- a/tox.ini +++ b/tox.ini @@ -66,8 +66,8 @@ setenv = SNOWFLAKE_PYTEST_DAILY_CMD = pytest {env:SNOWFLAKE_PYTEST_VERBOSITY:} {env:SNOWFLAKE_PYTEST_DAILY_PARALLELISM:} {env:SNOWFLAKE_PYTEST_COV_CMD} --ignore=tests/resources {env:SNOWFLAKE_PYTEST_IGNORE_MODIN_CMD} # This configures the extra dependency required by modin test modin: SNOWFLAKE_PYTEST_MODIN_DEPS = [modin-development] - SNOW_1314507_WORKAROUND_RERUN_FLAGS = --reruns 5 --reruns-delay 1 --only-rerun "Insufficient resource during interleaved execution." modin_pandas_version: SNOWFLAKE_PYTEST_PANDAS_DEPS = pandas=={env:MODIN_PANDAS_PATCH_VERSION} + SNOW_1314507_WORKAROUND_RERUN_FLAGS = --reruns 5 --reruns-delay 1 --only-rerun "Insufficient resource during interleaved execution." 
MODIN_PYTEST_CMD = pytest {env:SNOWFLAKE_PYTEST_VERBOSITY:} {env:SNOWFLAKE_PYTEST_PARALLELISM:} {env:SNOWFLAKE_PYTEST_COV_CMD} --ignore=tests/resources MODIN_PYTEST_DAILY_CMD = pytest {env:SNOWFLAKE_PYTEST_VERBOSITY:} {env:SNOWFLAKE_PYTEST_DAILY_PARALLELISM:} {env:SNOWFLAKE_PYTEST_COV_CMD} --ignore=tests/resources MODIN_PYTEST_NO_COV_CMD = pytest {env:SNOWFLAKE_PYTEST_VERBOSITY:} {env:SNOWFLAKE_PYTEST_PARALLELISM:} --ignore=tests/resources From f01a3dbdc43a459613732733ce18246c3beb5192 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Tue, 15 Oct 2024 16:28:23 -0700 Subject: [PATCH 19/26] fix missed skips --- tests/integ/modin/frame/test_apply_axis_0.py | 5 +++++ tests/integ/modin/frame/test_cache_result.py | 4 ++++ tests/integ/modin/groupby/test_all_any.py | 4 ++++ 3 files changed, 13 insertions(+) diff --git a/tests/integ/modin/frame/test_apply_axis_0.py b/tests/integ/modin/frame/test_apply_axis_0.py index 33db3c2220..49e865105b 100644 --- a/tests/integ/modin/frame/test_apply_axis_0.py +++ b/tests/integ/modin/frame/test_apply_axis_0.py @@ -23,6 +23,11 @@ ) from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker +pytestmark = pytest.mark.skipif( + native_pd.__version__ == "2.2.3", + reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", +) + # test data which has a python type as return type that is not a pandas Series/pandas DataFrame/tuple/list BASIC_DATA_FUNC_PYTHON_RETURN_TYPE_MAP = [ [[[1.0, 2.2], [3, np.nan]], np.min, "float"], diff --git a/tests/integ/modin/frame/test_cache_result.py b/tests/integ/modin/frame/test_cache_result.py index 5fb094f4ed..7439000c8d 100644 --- a/tests/integ/modin/frame/test_cache_result.py +++ b/tests/integ/modin/frame/test_cache_result.py @@ -204,6 +204,10 @@ def test_cache_result_post_apply(self, inplace, simple_test_data): native_df, ) + @pytest.mark.skipif( + native_pd.__version__ == "2.2.3", + reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", + ) def test_cache_result_post_applymap(self, inplace, simple_test_data): # The high query counts in this test case come from the setup and definition # of the UDFs used. 
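For cases like `test_cache_result.py` above, where only some methods of a test class depend on UDFs, the decorator form scopes the skip to individual tests rather than the whole module. A minimal sketch under the same assumptions (class and method names are hypothetical):

```python
# illustrative sketch; not part of the patch series
import pandas as native_pd
import pytest


class TestCacheResult:
    def test_version_independent_case(self):
        # No UDF involved, so this runs under every pandas version.
        assert True

    @pytest.mark.skipif(
        native_pd.__version__ == "2.2.3",
        reason="example reason: this case registers a UDF",
    )
    def test_udf_backed_case(self):
        # Only this method is skipped when the client runs pandas 2.2.3.
        assert True
```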
diff --git a/tests/integ/modin/groupby/test_all_any.py b/tests/integ/modin/groupby/test_all_any.py index a13712d9a8..fbd77bbaad 100644 --- a/tests/integ/modin/groupby/test_all_any.py +++ b/tests/integ/modin/groupby/test_all_any.py @@ -98,6 +98,10 @@ def test_all_any_invalid_types(data, msg): pd.DataFrame(data).groupby("by").any().to_pandas() +@pytest.mark.skipif( + native_pd.__version__ == "2.2.3", + reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", +) @sql_count_checker(query_count=5, join_count=1, udtf_count=1) def test_all_any_chained(): data = { From ad447e0cace2d91a76ba53f8441a8db8ded8c597 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Wed, 16 Oct 2024 15:13:47 -0700 Subject: [PATCH 20/26] add more skips --- tests/integ/modin/frame/test_cache_result.py | 4 ++++ tests/integ/modin/test_modin_stored_procedures.py | 8 ++++++++ tests/integ/modin/test_session.py | 5 +++++ tests/integ/test_stored_procedure.py | 5 ----- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/integ/modin/frame/test_cache_result.py b/tests/integ/modin/frame/test_cache_result.py index 7439000c8d..10a91fe85a 100644 --- a/tests/integ/modin/frame/test_cache_result.py +++ b/tests/integ/modin/frame/test_cache_result.py @@ -176,6 +176,10 @@ def test_cache_result_post_pivot(self, inplace, simple_test_data): cached_snow_df, native_df ) + @pytest.mark.skipif( + native_pd.__version__ == "2.2.3", + reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", + ) def test_cache_result_post_apply(self, inplace, simple_test_data): # In this test, the caching doesn't aid in the query counts since # the implementation of apply(axis=1) itself contains intermediate diff --git a/tests/integ/modin/test_modin_stored_procedures.py b/tests/integ/modin/test_modin_stored_procedures.py index 18b940a5af..11733a2ff6 100644 --- a/tests/integ/modin/test_modin_stored_procedures.py +++ b/tests/integ/modin/test_modin_stored_procedures.py @@ -6,6 +6,7 @@ import os import modin.pandas as pd +import pandas as native_pd import pytest from snowflake.snowpark import Session @@ -17,6 +18,13 @@ from tests.integ.utils.sql_counter import sql_count_checker from tests.utils import multithreaded_run +pytestmark = ( + pytest.mark.skipif( + native_pd.__version__ == "2.2.3", + reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", + ), +) + PACKAGE_LIST = [ # modin 0.30.1 supports any pandas 2.2.x, so just pick whichever one is installed in the client f"pandas=={actual_pandas_version}", diff --git a/tests/integ/modin/test_session.py b/tests/integ/modin/test_session.py index 93b4cecb6e..d5c20603d8 100644 --- a/tests/integ/modin/test_session.py +++ b/tests/integ/modin/test_session.py @@ -5,6 +5,7 @@ from typing import Optional import modin.pandas as pd +import pandas as native_pd import pytest import snowflake.snowpark.modin.plugin # noqa: F401 @@ -212,6 +213,10 @@ def test_snowpark_pandas_session_class_does_not_exist_snow_1022098(): pd.Session +@pytest.mark.skipif( + native_pd.__version__ == "2.2.3", + reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", +) @pytest.mark.parametrize( "operation", [ diff --git a/tests/integ/test_stored_procedure.py b/tests/integ/test_stored_procedure.py index 41b3e87e67..5cd8c42d93 100644 --- a/tests/integ/test_stored_procedure.py +++ b/tests/integ/test_stored_procedure.py @@ -10,7 +10,6 @@ from typing import Dict, List, Optional, 
Union from unittest.mock import patch -import pandas as native_pd import pytest try: @@ -76,10 +75,6 @@ ) pytestmark = [ - pytest.mark.skipif( - native_pd.__version__ == "2.2.3", - reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", - ), pytest.mark.udf, ] From c9c6e205ea98b02cb0d9e5d26daeddf12c0d18f4 Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Tue, 22 Oct 2024 13:43:54 -0700 Subject: [PATCH 21/26] strengthen version checks --- .../snowpark/modin/plugin/__init__.py | 6 +- .../modin/plugin/extensions/base_overrides.py | 1 + tests/integ/modin/frame/test_apply.py | 3 +- tests/integ/modin/frame/test_apply_axis_0.py | 3 +- tests/integ/modin/frame/test_applymap.py | 3 +- tests/integ/modin/frame/test_cache_result.py | 5 +- tests/integ/modin/groupby/test_all_any.py | 3 +- .../integ/modin/groupby/test_groupby_apply.py | 3 +- .../modin/groupby/test_groupby_transform.py | 3 +- tests/integ/modin/series/test_apply.py | 3 +- .../modin/test_modin_stored_procedures.py | 88 +++++++------------ tests/integ/modin/test_session.py | 9 +- tests/integ/modin/utils.py | 5 ++ 13 files changed, 66 insertions(+), 69 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/__init__.py b/src/snowflake/snowpark/modin/plugin/__init__.py index c5ea012a25..b60c565ca8 100644 --- a/src/snowflake/snowpark/modin/plugin/__init__.py +++ b/src/snowflake/snowpark/modin/plugin/__init__.py @@ -149,7 +149,11 @@ try_add_telemetry_to_attribute, ) -# Add telemetry on the ModinAPI accessor object +# Add telemetry on the ModinAPI accessor object. +# modin 0.30.1 introduces the pd.DataFrame.modin accessor object for non-pandas methods, +# such as pd.DataFrame.modin.to_pandas and pd.DataFrame.modin.to_ray. We will automatically +# raise NotImplementedError for all methods on this accessor object except to_pandas, but +# we still want to record telemetry. for attr_name in dir(ModinAPI): if not attr_name.startswith("_") or attr_name in TELEMETRY_PRIVATE_METHODS: setattr( diff --git a/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py index 2e95c1b571..a472e13137 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py @@ -1553,6 +1553,7 @@ def __getitem__(self, key): # Modin uses the unique() query compiler method instead of aliasing the duplicated frontend method as of 0.30.1. 
+# TODO SNOW-1758721: use the more efficient implementation @register_base_override("drop_duplicates") def drop_duplicates( self, keep="first", inplace=False, **kwargs diff --git a/tests/integ/modin/frame/test_apply.py b/tests/integ/modin/frame/test_apply.py index ff4e7eaac7..eca569aa99 100644 --- a/tests/integ/modin/frame/test_apply.py +++ b/tests/integ/modin/frame/test_apply.py @@ -23,6 +23,7 @@ from snowflake.snowpark.types import DoubleType, PandasSeriesType from tests.integ.modin.series.test_apply import create_func_with_return_type_hint from tests.integ.modin.utils import ( + PANDAS_VERSION_PREDICATE, assert_snowpark_pandas_equal_to_pandas, assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, create_test_dfs, @@ -31,7 +32,7 @@ from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker pytestmark = pytest.mark.skipif( - native_pd.__version__ == "2.2.3", + PANDAS_VERSION_PREDICATE, reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", ) diff --git a/tests/integ/modin/frame/test_apply_axis_0.py b/tests/integ/modin/frame/test_apply_axis_0.py index 49e865105b..f5b8eaac10 100644 --- a/tests/integ/modin/frame/test_apply_axis_0.py +++ b/tests/integ/modin/frame/test_apply_axis_0.py @@ -16,6 +16,7 @@ from snowflake.snowpark.exceptions import SnowparkSQLException from tests.integ.modin.series.test_apply import create_func_with_return_type_hint from tests.integ.modin.utils import ( + PANDAS_VERSION_PREDICATE, assert_snowpark_pandas_equal_to_pandas, assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, create_test_dfs, @@ -24,7 +25,7 @@ from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker pytestmark = pytest.mark.skipif( - native_pd.__version__ == "2.2.3", + PANDAS_VERSION_PREDICATE, reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", ) diff --git a/tests/integ/modin/frame/test_applymap.py b/tests/integ/modin/frame/test_applymap.py index 73cdfc1c64..e24076401d 100644 --- a/tests/integ/modin/frame/test_applymap.py +++ b/tests/integ/modin/frame/test_applymap.py @@ -15,6 +15,7 @@ create_func_with_return_type_hint, ) from tests.integ.modin.utils import ( + PANDAS_VERSION_PREDICATE, assert_snowpark_pandas_equal_to_pandas, create_test_dfs, eval_snowpark_pandas_result, @@ -22,7 +23,7 @@ from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker pytestmark = pytest.mark.skipif( - native_pd.__version__ == "2.2.3", + PANDAS_VERSION_PREDICATE, reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", ) diff --git a/tests/integ/modin/frame/test_cache_result.py b/tests/integ/modin/frame/test_cache_result.py index 10a91fe85a..6678938db1 100644 --- a/tests/integ/modin/frame/test_cache_result.py +++ b/tests/integ/modin/frame/test_cache_result.py @@ -11,6 +11,7 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.utils import ( + PANDAS_VERSION_PREDICATE, assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, create_test_dfs, ) @@ -177,7 +178,7 @@ def test_cache_result_post_pivot(self, inplace, simple_test_data): ) @pytest.mark.skipif( - native_pd.__version__ == "2.2.3", + PANDAS_VERSION_PREDICATE, reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", ) def test_cache_result_post_apply(self, inplace, simple_test_data): @@ -209,7 +210,7 @@ def test_cache_result_post_apply(self, inplace, simple_test_data): ) @pytest.mark.skipif( - 
native_pd.__version__ == "2.2.3", + PANDAS_VERSION_PREDICATE, reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", ) def test_cache_result_post_applymap(self, inplace, simple_test_data): diff --git a/tests/integ/modin/groupby/test_all_any.py b/tests/integ/modin/groupby/test_all_any.py index fbd77bbaad..6e1d6513a9 100644 --- a/tests/integ/modin/groupby/test_all_any.py +++ b/tests/integ/modin/groupby/test_all_any.py @@ -14,6 +14,7 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from snowflake.snowpark.exceptions import SnowparkSQLException from tests.integ.modin.utils import ( + PANDAS_VERSION_PREDICATE, assert_frame_equal, create_test_dfs, eval_snowpark_pandas_result as _eval_snowpark_pandas_result, @@ -99,7 +100,7 @@ def test_all_any_invalid_types(data, msg): @pytest.mark.skipif( - native_pd.__version__ == "2.2.3", + PANDAS_VERSION_PREDICATE, reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", ) @sql_count_checker(query_count=5, join_count=1, udtf_count=1) diff --git a/tests/integ/modin/groupby/test_groupby_apply.py b/tests/integ/modin/groupby/test_groupby_apply.py index 11efb83000..7905842ee1 100644 --- a/tests/integ/modin/groupby/test_groupby_apply.py +++ b/tests/integ/modin/groupby/test_groupby_apply.py @@ -18,6 +18,7 @@ from snowflake.snowpark.exceptions import SnowparkSQLException from snowflake.snowpark.modin.plugin.extensions.utils import try_convert_index_to_native from tests.integ.modin.utils import ( + PANDAS_VERSION_PREDICATE, assert_snowpark_pandas_equal_to_pandas, assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, assert_values_equal, @@ -28,7 +29,7 @@ from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker pytestmark = pytest.mark.skipif( - native_pd.__version__ == "2.2.3", + PANDAS_VERSION_PREDICATE, reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", ) diff --git a/tests/integ/modin/groupby/test_groupby_transform.py b/tests/integ/modin/groupby/test_groupby_transform.py index dff8f5fb0a..3a950b993a 100644 --- a/tests/integ/modin/groupby/test_groupby_transform.py +++ b/tests/integ/modin/groupby/test_groupby_transform.py @@ -10,13 +10,14 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from tests.integ.modin.utils import ( + PANDAS_VERSION_PREDICATE, create_test_dfs, eval_snowpark_pandas_result as _eval_snowpark_pandas_result, ) from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker pytestmark = pytest.mark.skipif( - native_pd.__version__ == "2.2.3", + PANDAS_VERSION_PREDICATE, reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", ) diff --git a/tests/integ/modin/series/test_apply.py b/tests/integ/modin/series/test_apply.py index 729e414bd2..b3c2ec9815 100644 --- a/tests/integ/modin/series/test_apply.py +++ b/tests/integ/modin/series/test_apply.py @@ -21,6 +21,7 @@ from snowflake.snowpark.functions import udf from snowflake.snowpark.types import DoubleType, StringType, VariantType from tests.integ.modin.utils import ( + PANDAS_VERSION_PREDICATE, ColumnSchema, assert_snowpark_pandas_equal_to_pandas, assert_snowpark_pandas_equals_to_pandas_without_dtypecheck, @@ -31,7 +32,7 @@ from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker pytestmark = pytest.mark.skipif( - native_pd.__version__ == "2.2.3", + PANDAS_VERSION_PREDICATE, reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake 
anaconda", ) diff --git a/tests/integ/modin/test_modin_stored_procedures.py b/tests/integ/modin/test_modin_stored_procedures.py index 11733a2ff6..8387c75935 100644 --- a/tests/integ/modin/test_modin_stored_procedures.py +++ b/tests/integ/modin/test_modin_stored_procedures.py @@ -3,60 +3,40 @@ # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved. # -import os - import modin.pandas as pd import pandas as native_pd import pytest +from packaging import version from snowflake.snowpark import Session from snowflake.snowpark.functions import sproc -from snowflake.snowpark.modin.plugin import ( - actual_pandas_version, - supported_modin_version, -) from tests.integ.utils.sql_counter import sql_count_checker from tests.utils import multithreaded_run -pytestmark = ( - pytest.mark.skipif( - native_pd.__version__ == "2.2.3", - reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", - ), +pytestmark = pytest.mark.skipif( + version.parse(native_pd.__version__) != version.parse("2.2.1"), + reason="SNOW-1758760: modin stored procedure test must pin pandas==2.2.1 and modin==0.28.1", ) +# Must pin modin version to match version available in Snowflake Anaconda +SPROC_MODIN_VERSION = "0.28.1" + PACKAGE_LIST = [ - # modin 0.30.1 supports any pandas 2.2.x, so just pick whichever one is installed in the client - f"pandas=={actual_pandas_version}", - f"modin=={supported_modin_version}", + # modin 0.30.1 supports any pandas 2.2.x, so just pick whichever one is installed in the client. + # Note that because we specify `snowflake-snowpark-python` as a package here, it will pick whatever + # version of the package is available in anaconda, not the latest `main` branch. + # The behavior of stored procedures with `main` is verified in server-side tests and the stored + # procedure Jenkins job. + f"pandas=={native_pd.__version__}", + f"modin=={SPROC_MODIN_VERSION}", "snowflake-snowpark-python", "numpy", ] -# Snowpark pandas strictly pins the modin dependency version, so while testing a dependency upgrade, -# we need to upload snowflake-snowpark-python as a zip file. Otherwise, the conda package solver -# will resolve snowflake-snowpark-python==1.16.0, the newest version which does not pin a modin -# version. -# We still specify snowflake-snowpark-python in the package list to prevent the sproc registration -# code from failing in the solver step; the import here will override whatever version is chosen. -IMPORT_LIST = [ - # The current path of this file is `tests/modin/integ/test_modin_stored_procedures.py`, so we need - # to go back to the repository root to reach `src/snowflake/snowpark/`. 
- ( - os.path.join( - os.path.dirname( - os.path.dirname(os.path.dirname(os.path.dirname(__file__))) - ), - "src/snowflake/snowpark", - ), - "snowflake.snowpark", - ), -] - @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_head(session): - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def run(session_: Session) -> str: df = pd.DataFrame( [["a", 2.1, 1], ["b", 4.2, 2], ["c", 6.3, None]], @@ -73,7 +53,7 @@ def run(session_: Session) -> str: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_dropna(session): - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def run(session_: Session) -> int: default_index_snowpark_pandas_df = pd.DataFrame( [["a", 2.1, 1], ["b", None, 2], ["c", 6.3, None]], @@ -88,7 +68,7 @@ def run(session_: Session) -> int: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_idx(session): - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def run(session_: Session) -> str: df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) df_result = df["a"] @@ -99,7 +79,7 @@ def run(session_: Session) -> str: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_loc(session): - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def run(session_: Session) -> str: df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) df_result = df.loc[df["a"] > 2] @@ -110,7 +90,7 @@ def run(session_: Session) -> str: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_iloc(session): - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def run(session_: Session) -> str: df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) df_result = df.iloc[0, 1] @@ -121,7 +101,7 @@ def run(session_: Session) -> str: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_missing_val(session): - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def run(session_: Session) -> int: import numpy as np @@ -142,7 +122,7 @@ def run(session_: Session) -> int: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_type_conv(session): - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def run(session_: Session) -> str: df = pd.DataFrame({"int": [1, 2, 3], "str": ["4", "5", "6"]}) df_result = df.astype(float)["int"].iloc[0] @@ -153,14 +133,14 @@ def run(session_: Session) -> str: @sql_count_checker(query_count=8, sproc_count=2) def test_sproc_binary_ops(session): - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def add(session_: Session) -> str: df_1 = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) df_2 = pd.DataFrame([[6, 7, 8]]) df_result = df_1.add(df_2) return str(df_result) - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def plus(session_: Session) -> str: s1 = pd.Series([1, 2, 3]) s2 = pd.Series([2, 2, 2]) @@ -174,7 +154,7 @@ def plus(session_: Session) -> str: @multithreaded_run() @sql_count_checker(query_count=8, sproc_count=2) def test_sproc_agg(session): - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def run_agg(session_: Session) -> str: import numpy as np @@ -185,7 +165,7 @@ def run_agg(session_: Session) -> str: df_result = df.agg(["sum", "min"]) return str(df_result) - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def run_median(session_: Session) -> str: 
import numpy as np @@ -205,7 +185,7 @@ def run_median(session_: Session) -> str: @sql_count_checker(query_count=8, sproc_count=2) def test_sproc_merge(session): - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def run_merge(session_: Session) -> str: df1 = pd.DataFrame( {"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]} @@ -216,7 +196,7 @@ def run_merge(session_: Session) -> str: df_result = df1.merge(df2, left_on="lkey", right_on="rkey") return str(df_result["value_x"]) - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def run_join(session_: Session) -> str: df = pd.DataFrame( { @@ -240,7 +220,7 @@ def run_join(session_: Session) -> str: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_groupby(session): - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def run(session_: Session) -> str: df = pd.DataFrame( { @@ -259,7 +239,7 @@ def run(session_: Session) -> str: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_pivot(session): - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def run(session_: Session) -> str: df = pd.DataFrame( { @@ -291,11 +271,9 @@ def run(session_: Session) -> str: ) -# TODO SNOW-1739042 figure out why apply/applymap UDF doesn't use the correct modin/snowpark version -@pytest.mark.xfail(strict=True) @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_apply(session): - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def run(session_: Session) -> str: import numpy as np @@ -306,11 +284,9 @@ def run(session_: Session) -> str: assert run() == "0 2\n1 10\n2 13\ndtype: int64" -# TODO SNOW-1739042 figure out why apply/applymap UDF doesn't use the correct modin/snowpark version -@pytest.mark.xfail(strict=True) @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_applymap(session): - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def run(session_: Session) -> str: df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) df_result = df.applymap(lambda x: len(str(x))) @@ -321,7 +297,7 @@ def run(session_: Session) -> str: @sql_count_checker(query_count=4, sproc_count=1) def test_sproc_devguide_example(session): - @sproc(packages=PACKAGE_LIST, imports=IMPORT_LIST) + @sproc(packages=PACKAGE_LIST) def run(session_: Session) -> int: # Create a Snowpark Pandas DataFrame with sample data. 
        df = pd.DataFrame(
diff --git a/tests/integ/modin/test_session.py b/tests/integ/modin/test_session.py
index d5c20603d8..b624dd3fc6 100644
--- a/tests/integ/modin/test_session.py
+++ b/tests/integ/modin/test_session.py
@@ -5,7 +5,6 @@
 from typing import Optional
 
 import modin.pandas as pd
-import pandas as native_pd
 import pytest
 
 import snowflake.snowpark.modin.plugin  # noqa: F401
@@ -17,7 +16,11 @@
     _get_active_sessions,
     _remove_session,
 )
-from tests.integ.modin.utils import create_test_dfs, eval_snowpark_pandas_result
+from tests.integ.modin.utils import (
+    PANDAS_VERSION_PREDICATE,
+    create_test_dfs,
+    eval_snowpark_pandas_result,
+)
 from tests.integ.utils.sql_counter import sql_count_checker
 from tests.utils import running_on_jenkins, running_on_public_ci
 
@@ -214,7 +217,7 @@ def test_snowpark_pandas_session_class_does_not_exist_snow_1022098():
 
 
 @pytest.mark.skipif(
-    native_pd.__version__ == "2.2.3",
+    PANDAS_VERSION_PREDICATE,
     reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda",
 )
 @pytest.mark.parametrize(
diff --git a/tests/integ/modin/utils.py b/tests/integ/modin/utils.py
index d4dd5dd96f..f714a268e5 100644
--- a/tests/integ/modin/utils.py
+++ b/tests/integ/modin/utils.py
@@ -15,6 +15,7 @@
 import pandas.testing as tm
 import pytest
 from modin.pandas import DataFrame, Index, Series
+from packaging import version
 from pandas import isna
 from pandas._typing import Scalar
 from pandas.core.dtypes.common import is_list_like
@@ -27,6 +28,10 @@
 from snowflake.snowpark.session import Session
 from snowflake.snowpark.types import StructField, StructType
 
+PANDAS_VERSION_PREDICATE = version.parse(native_pd.__version__) >= version.parse(
+    "2.2.3"
+)
+
 ValuesEqualType = Optional[
     Union[
         Scalar,

From 40275b530937df58edb85e54ac7c350570d0285a Mon Sep 17 00:00:00 2001
From: Jonathan Shi
Date: Tue, 22 Oct 2024 13:44:57 -0700
Subject: [PATCH 22/26] remove 2.2.3 from daily matrix

---
 .github/workflows/daily_modin_precommit.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/daily_modin_precommit.yml b/.github/workflows/daily_modin_precommit.yml
index 8838f81f5e..124f40fbc0 100644
--- a/.github/workflows/daily_modin_precommit.yml
+++ b/.github/workflows/daily_modin_precommit.yml
@@ -183,7 +183,7 @@ jobs:
       os:
        - image_name: ubuntu-latest-64-cores
          download_name: linux
-      pandas-version: ["2.2.1", "2.2.2", "2.2.3"]
+      pandas-version: ["2.2.1", "2.2.2"]
       python-version: ["3.9"]
       cloud-provider: [aws]
     steps:

From 8559b7b6a89683ab69318012ce5f92cc136124f0 Mon Sep 17 00:00:00 2001
From: Jonathan Shi
Date: Tue, 22 Oct 2024 13:48:07 -0700
Subject: [PATCH 23/26] add version check ticket

---
 src/snowflake/snowpark/modin/plugin/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/snowflake/snowpark/modin/plugin/__init__.py b/src/snowflake/snowpark/modin/plugin/__init__.py
index b60c565ca8..2e9fbe721f 100644
--- a/src/snowflake/snowpark/modin/plugin/__init__.py
+++ b/src/snowflake/snowpark/modin/plugin/__init__.py
@@ -21,6 +21,7 @@
 # since modin may raise its own warnings/errors on the wrong pandas version
 import pandas  # isort: skip  # noqa: E402
 
+# TODO SNOW-1758773: perform version check in modin instead
 supported_pandas_major_version = 2
 supported_pandas_minor_version = 2
 actual_pandas_version = version.parse(pandas.__version__)

From 5bb1bc626c293a776c23f17e06596fbb333e39b3 Mon Sep 17 00:00:00 2001
From: Jonathan Shi
Date: Tue, 22 Oct 2024 14:39:45 -0700
Subject: [PATCH 24/26] skip a test

---
tests/integ/modin/test_sql_counter.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/integ/modin/test_sql_counter.py b/tests/integ/modin/test_sql_counter.py index a1d7f9e61d..cffd826735 100644 --- a/tests/integ/modin/test_sql_counter.py +++ b/tests/integ/modin/test_sql_counter.py @@ -8,7 +8,7 @@ import snowflake.snowpark.modin.plugin # noqa: F401 from snowflake.snowpark import QueryRecord -from tests.integ.modin.utils import assert_frame_equal +from tests.integ.modin.utils import PANDAS_VERSION_PREDICATE, assert_frame_equal from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker @@ -126,6 +126,10 @@ def test_sql_counter_with_fallback_count(): assert len(df) == 3 +@pytest.mark.skipif( + PANDAS_VERSION_PREDICATE, + reason="SNOW-1739034: tests with UDFs/sprocs cannot run without pandas 2.2.3 in Snowflake anaconda", +) @sql_count_checker(query_count=5, join_count=2, udtf_count=1) def test_sql_counter_with_df_udtf_count(): df = pd.DataFrame([[1, 2], [3, 4]]).apply(lambda x: str(type(x)), axis=1, raw=True) From 9ee5cb8ac67a52130123e26bece46aa19b3e836a Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Tue, 22 Oct 2024 14:42:02 -0700 Subject: [PATCH 25/26] remove melt override --- .../plugin/extensions/dataframe_overrides.py | 36 ------------------- 1 file changed, 36 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py index 3d8aecbad2..9aea36f6bd 100644 --- a/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py +++ b/src/snowflake/snowpark/modin/plugin/extensions/dataframe_overrides.py @@ -1481,42 +1481,6 @@ def mask( ) -# Snowpark pandas has a fix for a pandas behavior change. It is available in Modin 0.30.1 (SNOW-1552497). -@register_dataframe_accessor("melt") -def melt( - self, - id_vars=None, - value_vars=None, - var_name=None, - value_name="value", - col_level=None, - ignore_index=True, -): # noqa: PR01, RT01, D200 - """ - Unpivot a ``DataFrame`` from wide to long format, optionally leaving identifiers set. - """ - # TODO: SNOW-1063346: Modin upgrade - modin.pandas.DataFrame functions - if id_vars is None: - id_vars = [] - if not is_list_like(id_vars): - id_vars = [id_vars] - if value_vars is None: - value_vars = self.columns.drop(id_vars) - if var_name is None: - columns_name = self._query_compiler.get_index_name(axis=1) - var_name = columns_name if columns_name is not None else "variable" - return self.__constructor__( - query_compiler=self._query_compiler.melt( - id_vars=id_vars, - value_vars=value_vars, - var_name=var_name, - value_name=value_name, - col_level=col_level, - ignore_index=ignore_index, - ) - ) - - # Snowpark pandas does more thorough error checking. 
@register_dataframe_accessor("merge") def merge( From 5f59431931d94f595d628e0d11b50fba9888eedd Mon Sep 17 00:00:00 2001 From: Jonathan Shi Date: Tue, 22 Oct 2024 15:30:17 -0700 Subject: [PATCH 26/26] skip doctests --- .../snowpark/modin/plugin/docstrings/dataframe.py | 10 ++++++---- .../snowpark/modin/plugin/docstrings/groupby.py | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py index 2aef74e0f0..d8fabee7e5 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py @@ -713,6 +713,7 @@ def aggregate(): agg = aggregate def apply(): + # TODO SNOW-1739034 unskip UDF tests when pandas 2.2.3 is available in anaconda """ Apply a function along an axis of the DataFrame. @@ -821,7 +822,7 @@ def apply(): Using a reducing function on ``axis=1``: - >>> df.apply(np.sum, axis=1) + >>> df.apply(np.sum, axis=1) # doctest: +SKIP 0 2 1 10 2 13 @@ -829,7 +830,7 @@ def apply(): Returning a list-like object will result in a Series: - >>> df.apply(lambda x: [1, 2], axis=1) + >>> df.apply(lambda x: [1, 2], axis=1) # doctest: +SKIP 0 [1, 2] 1 [1, 2] 2 [1, 2] @@ -1022,6 +1023,7 @@ def keys(): """ def transform(): + # TODO SNOW-1739034 unskip UDF tests when pandas 2.2.3 is available in anaconda """ Call ``func`` on self producing a Snowpark pandas DataFrame with the same axis shape as self. @@ -1055,7 +1057,7 @@ def transform(): 0 1 3 1 2 4 2 3 5 - >>> df.transform(lambda x: x + 1, axis=1) + >>> df.transform(lambda x: x + 1, axis=1) # doctest: +SKIP col1 col2 0 2 4 1 3 5 @@ -1063,7 +1065,7 @@ def transform(): Apply a numpy ufunc to every value in the DataFrame. - >>> df.transform(np.square, axis=1) + >>> df.transform(np.square, axis=1) # doctest: +SKIP col1 col2 0 1 9 1 4 16 diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py index f9260ddb0a..0dbdced47c 100644 --- a/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py +++ b/src/snowflake/snowpark/modin/plugin/docstrings/groupby.py @@ -989,6 +989,7 @@ def cummax(): """ def apply(): + # TODO SNOW-1739034 unskip UDF tests when pandas 2.2.3 is available in anaconda """ Apply function ``func`` group-wise and combine the results together. @@ -1050,7 +1051,7 @@ def apply(): its argument and returns a DataFrame. `apply` combines the result for each group together into a new DataFrame: - >>> g1[['B', 'C']].apply(lambda x: x.select_dtypes('number') / x.select_dtypes('number').sum()) # doctest: +NORMALIZE_WHITESPACE + >>> g1[['B', 'C']].apply(lambda x: x.select_dtypes('number') / x.select_dtypes('number').sum()) # doctest: +SKIP B C 0.0 0.333333 0.4 1.0 0.666667 0.6 @@ -1059,7 +1060,7 @@ def apply(): In the above, the groups are not part of the index. We can have them included by using ``g2`` where ``group_keys=True``: - >>> g2[['B', 'C']].apply(lambda x: x.select_dtypes('number') / x.select_dtypes('number').sum()) # doctest: +NORMALIZE_WHITESPACE + >>> g2[['B', 'C']].apply(lambda x: x.select_dtypes('number') / x.select_dtypes('number').sum()) # doctest: +SKIP B C A a 0.0 0.333333 0.4 @@ -1942,6 +1943,7 @@ def cov(): pass def transform(): + # TODO SNOW-1739034 unskip UDF tests when pandas 2.2.3 is available in anaconda """ Call function producing a same-indexed DataFrame on each group. 
@@ -2011,7 +2013,7 @@ def transform(): i X 9 90 -9 j Y 10 10 -10 - >>> df.groupby("col1", dropna=True).transform(lambda df, n: df.head(n), n=2) + >>> df.groupby("col1", dropna=True).transform(lambda df, n: df.head(n), n=2) # doctest: +SKIP col2 col3 col4 a 1.0 40.0 -1.0 b NaN NaN NaN @@ -2024,7 +2026,7 @@ def transform(): i NaN NaN NaN j 10.0 10.0 -10.0 - >>> df.groupby("col1", dropna=False).transform("mean") + >>> df.groupby("col1", dropna=False).transform("mean") # doctest: +SKIP col2 col3 col4 a 2.50 25.0 -2.50 b 5.00 65.0 -5.00
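The `# doctest: +SKIP` directives added in this last patch keep each example visible in the rendered docstring while excluding it from doctest execution. A minimal self-contained sketch of the directive (the function and values here are illustrative, not from the patch):

```python
# illustrative sketch of the doctest +SKIP directive; not part of the patch series
import doctest


def square(x):
    """Return x squared.

    >>> square(3)  # executed by the doctest runner
    9
    >>> square(10**6)  # doctest: +SKIP
    1000000000000
    """
    return x * x


if __name__ == "__main__":
    # Only the first example runs; the +SKIP example is rendered in docs but
    # never executed, so it may rely on an environment (e.g. server-side UDF
    # support) that the doctest run lacks.
    doctest.testmod()
```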