Commit 7751757: fix
sfc-gh-jdu committed Oct 22, 2024
1 parent cf91b8d
Showing 19 changed files with 111 additions and 101 deletions.
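Every change in this commit adjusts expected counts in SqlCounter assertions, evidently after a change that reduced the number of joins and unions the generated SQL contains. As a minimal sketch of the two usage patterns that recur in the diffs below (the SqlCounter import path is an assumption based on the test-suite layout; it is not shown in this commit):

# Hedged illustration only; import paths below are assumptions.
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401  # registers the Snowpark pandas backend
from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker  # assumed path

# Context-manager form: assert the block issues exactly 1 query containing 3 joins.
with SqlCounter(query_count=1, join_count=3):
    df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
    df = df.assign(new_col=[10, 11, 12])

# Decorator form: the same expectations attached to a whole test.
@sql_count_checker(query_count=1, join_count=3)
def test_example():
    ...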
tests/integ/modin/crosstab/test_crosstab.py (22 changes: 11 additions & 11 deletions)
@@ -319,7 +319,7 @@ def test_margins(self, dropna, a, b, c):
     @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
     def test_normalize(self, dropna, normalize, a, b, c):
         query_count = 1 if normalize in (0, "index") else 2
-        join_count = 3 if normalize in (0, "index") else 2
+        join_count = 3 if normalize in (0, "index") and dropna else 2
         if dropna:
             join_count -= 2

@@ -340,9 +340,9 @@ def test_normalize(self, dropna, normalize, a, b, c):
     @pytest.mark.parametrize("normalize", [0, 1, True, "all", "index", "columns"])
     def test_normalize_and_margins(self, dropna, normalize, a, b, c):
         counts = {
-            "columns": [3, 5 if dropna else 9, 4],
-            "index": [1, 5 if dropna else 8, 3],
-            "all": [3, 12 if dropna else 19, 7],
+            "columns": [3, 4 if dropna else 7, 3],
+            "index": [1, 3 if dropna else 4, 1],
+            "all": [3, 7 if dropna else 10, 3],
         }
         counts[0] = counts["index"]
         counts[1] = counts["columns"]
@@ -374,8 +374,8 @@ def test_normalize_and_margins(self, dropna, normalize, a, b, c):
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_normalize_margins_and_values(self, dropna, normalize, aggfunc, a, b, c):
         counts = {
-            "columns": [3, 29 if dropna else 41, 4],
-            "index": [1, 23 if dropna else 32, 3],
+            "columns": [3, 10 if dropna else 13, 3],
+            "index": [1, 5 if dropna else 6, 1],
             "all": [3, 54 if dropna else 75, 7],
         }
         counts[0] = counts["index"]
@@ -451,9 +451,9 @@ def eval_func(lib):
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_normalize_and_values(self, dropna, normalize, aggfunc, a, b, c):
         counts = {
-            "columns": [2, 4 if dropna else 10],
-            "index": [1, 5 if dropna else 11],
-            "all": [2, 4 if dropna else 10],
+            "columns": [2, 4 if dropna else 6],
+            "index": [1, 3 if dropna else 4],
+            "all": [2, 4 if dropna else 6],
         }
         counts[0] = counts["index"]
         counts[1] = counts["columns"]
@@ -520,7 +520,7 @@ def test_normalize_margins_and_values_not_supported(
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_values(self, dropna, aggfunc, basic_crosstab_dfs):
         query_count = 1
-        join_count = 2 if dropna else 5
+        join_count = 2 if dropna else 3
         native_df = basic_crosstab_dfs[0]

         with SqlCounter(query_count=query_count, join_count=join_count):
@@ -539,7 +539,7 @@ def test_values(self, dropna, aggfunc, basic_crosstab_dfs):
     @pytest.mark.parametrize("aggfunc", ["count", "mean", "min", "max", "sum"])
     def test_values_series_like(self, dropna, aggfunc, basic_crosstab_dfs):
         query_count = 5
-        join_count = 2 if dropna else 5
+        join_count = 2 if dropna else 3
         native_df, snow_df = basic_crosstab_dfs

         def eval_func(df):
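For orientation, a minimal sketch of the crosstab call shape these tests measure; the a and b arrays here are invented stand-ins for the fixtures of the same names:

import numpy as np
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

# Invented data; the real tests use fixtures a, b, c.
a = np.array(["foo", "foo", "bar", "bar"])
b = np.array(["one", "two", "one", "two"])
# normalize, margins, and dropna together determine the join and query
# counts asserted above.
table = pd.crosstab(a, b, normalize="index", margins=True, dropna=False)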
tests/integ/modin/frame/test_assign.py (8 changes: 4 additions & 4 deletions)
@@ -60,7 +60,7 @@ def assign_func(df):

 @pytest.mark.parametrize("new_col_value", [2, [10, 11, 12], "x"])
 def test_assign_basic_non_pandas_object(new_col_value):
-    join_count = 4 if isinstance(new_col_value, list) else 1
+    join_count = 3 if isinstance(new_col_value, list) else 1
     with SqlCounter(query_count=1, join_count=join_count):
         snow_df, native_df = create_test_dfs(
             [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
@@ -74,7 +74,7 @@ def test_assign_basic_non_pandas_object(new_col_value):
         )


-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_assign_invalid_long_column_length_negative():
     # pandas errors out in this test, since we are attempting to assign a column of length 5 to a DataFrame with length 3.
     # Snowpark pandas on the other hand, just truncates the last element of the new column so that it is the correct length. If we wanted
@@ -98,7 +98,7 @@ def test_assign_invalid_long_column_length_negative():
     assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(snow_df, native_df)


-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_assign_invalid_short_column_length_negative():
     # pandas errors out in this test, since we are attempting to assign a column of length 2 to a DataFrame with length 3.
     # Snowpark pandas on the other hand, just broadcasts the last element of the new column so that it is filled. If we wanted
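A hedged illustration of the length-mismatch semantics the two comments above describe (values invented; behavior as stated in those comments, not re-verified here):

import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
df["long"] = [10, 11, 12, 13, 14]  # length 5 vs. 3 rows: trailing values are truncated
df["short"] = [20, 21]             # length 2 vs. 3 rows: the last element is broadcast to fill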
@@ -226,7 +226,7 @@ def test_assign_self_columns():
     )


-@sql_count_checker(query_count=1, join_count=4)
+@sql_count_checker(query_count=1, join_count=3)
 def test_overwrite_columns_via_assign():
     snow_df, native_df = create_test_dfs(
         [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
tests/integ/modin/frame/test_cache_result.py (4 changes: 2 additions & 2 deletions)
@@ -130,7 +130,7 @@ def test_cache_result_simple(self, inplace):
         native_df = perform_chained_operations(
             native_pd.DataFrame(np.arange(15).reshape((3, 5))), native_pd
         )
-        with SqlCounter(query_count=1, union_count=29):
+        with SqlCounter(query_count=1, union_count=11):
             snow_df = perform_chained_operations(snow_df, pd)
             assert_snowpark_pandas_equals_to_pandas_without_dtypecheck(
                 snow_df, native_df
@@ -213,7 +213,7 @@ def test_cache_result_post_applymap(self, inplace, simple_test_data):
         with SqlCounter(
             query_count=11,
             union_count=9,
-            udf_count=2,
+            udf_count=1,
             high_count_expected=True,
             high_count_reason="applymap requires additional queries to setup the UDF.",
         ):
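Since the high_count_reason above attributes the extra queries to UDF setup, here is a minimal assumed sketch of the applymap pattern being measured:

import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
# The lambda executes server-side as a UDF; registering it costs additional
# queries, which is why the counter tolerates a high query_count.
result = df.applymap(lambda x: x + 1)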
tests/integ/modin/frame/test_describe.py (26 changes: 13 additions & 13 deletions)
@@ -49,19 +49,19 @@ def test_describe_numeric_only(data):
     # In total, we thus have 2 + 2 * (N - 1 + N) + 1 = 4N + 1 UNIONs for an N-column frame.
     [
         # If there are multiple modes, return the value that appears first
-        ({"a": ["k", "j", "j", "k"], "b": ["y", "y", "y", "z"]}, 9),
+        ({"a": ["k", "j", "j", "k"], "b": ["y", "y", "y", "z"]}, 5),
         # Empty columns are numeric by default (df constructor must explicitly specify object dtype)
-        ({"a": [], "b": []}, 9),
+        ({"a": [], "b": []}, 5),
         # Heterogeneous data is considered non-numeric
-        ({2: ["string", 0, None], -1: [1.1, 2.2, "hello"], 0: [None, None, None]}, 13),
+        ({2: ["string", 0, None], -1: [1.1, 2.2, "hello"], 0: [None, None, None]}, 6),
         (
             [
                 [None, "quick", None],
                 ["fox", "quick", "lazy"],
                 ["dog", "dog", "lazy"],
                 [None, None, None],
             ],
-            13,
+            6,
         ),
     ],
 )
@@ -107,7 +107,7 @@ def test_describe_empty_cols():
         # 4K-1 UNIONs to compute top/freq for K object-dtype columns (see comment on
         # test_describe_obj_only for reasoning).
         # Since we have K=2 object columns, the result is 9 + (4 * 2 - 1) = 16 UNIONs.
-        ([int, object], None, None, 16),
+        ([int, object], None, None, 12),
         (np.number, [], None, 7),
         # Including only datetimes has 7 statistics since std is not computed.
         # Since there is only 1 column, all quantiles are computed in a single QC.
@@ -127,8 +127,8 @@ def test_describe_empty_cols():
         # include and exclude cannot directly overlap
         ([int, "O"], [float, "O"], ValueError, 0),
         # Like select_dtypes, a dtype in include/exclude can be a subtype of a dtype in the other
-        ([int, "O"], [float, np.number, np.datetime64], None, 9),
-        ("O", None, None, 9),
+        ([int, "O"], [float, np.number, np.datetime64], None, 5),
+        ("O", None, None, 5),
     ],
 )
 def test_describe_include_exclude(
@@ -285,9 +285,9 @@ def timestamp_describe_comparator(snow_res, native_res):
     # Don't need to test all permutations of include/exclude with MultiIndex -- this is covered by
     # tests for select_dtypes, as well as other tests in this file
     [
-        ("all", 16),
+        ("all", 12),
         (np.number, 7),
-        (object, 9),
+        (object, 5),
     ],
 )
 def test_describe_multiindex(index, columns, include, expected_union_count):
@@ -312,10 +312,10 @@ def test_describe_multiindex(index, columns, include, expected_union_count):
     "include, exclude, expected_union_count",
     [
         (None, None, 7),
-        ("all", None, 12),
+        ("all", None, 11),
         (np.number, None, 7),
-        (None, float, 10),
-        (object, None, 5),
+        (None, float, 9),
+        (object, None, 4),
         (None, object, 7),
         (int, float, 5),
         (float, int, 5),
@@ -350,7 +350,7 @@ def helper(df):

 @sql_count_checker(
     query_count=3,
-    union_count=21,
+    union_count=8,
 )
 # SNOW-1320296 - pd.concat SQL Compilation ambiguous __row_position__ issue
 def test_describe_object_file(resources_path):
tests/integ/modin/frame/test_loc.py (16 changes: 8 additions & 8 deletions)
@@ -811,7 +811,7 @@ def loc_set_helper(df):
         else:
             df.loc[pd.Series(row_key), :] = pd.DataFrame(item)

-    expected_join_count = 4 if not row_key.dtype == bool else 2
+    expected_join_count = 3 if not row_key.dtype == bool else 2

     with SqlCounter(query_count=1, join_count=expected_join_count):
         eval_snowpark_pandas_result(pd.DataFrame(df), df, loc_set_helper, inplace=True)
@@ -851,7 +851,7 @@ def loc_set_helper(df):
             df.loc[row_key, col_key] = item

     expected_join_count = (
-        6 if isinstance(col_key, str) and isinstance(item, list) else 1
+        4 if isinstance(col_key, str) and isinstance(item, list) else 1
     )

     with SqlCounter(query_count=1, join_count=expected_join_count):
@@ -914,7 +914,7 @@ def test_df_loc_set_list_like_row_key(row_key, key_type):
     )

     expected_join_count = (
-        2 if all(isinstance(i, bool) for i in row_key) and len(row_key) > 0 else 4
+        2 if all(isinstance(i, bool) for i in row_key) and len(row_key) > 0 else 3
     )

     # test case for df.loc[row_key] = item
@@ -1220,7 +1220,7 @@ def loc_set_helper(df):
         # otherwise, pandas raise ValueError: cannot reindex on an axis with duplicate labels
         or (df.columns.equals(df.columns.union(col_key)))
     ):
-        query_count, join_count, expect_exception = 1, 4, False
+        query_count, join_count, expect_exception = 1, 3, False
         if isinstance(col_key, native_pd.Series):
             query_count += 1

@@ -2696,7 +2696,7 @@ def test_empty_df_loc_set_series_and_list(native_item):
         else native_item
     )

-    expected_join_count = 2 if isinstance(native_item, native_pd.Series) else 4
+    expected_join_count = 2 if isinstance(native_item, native_pd.Series) else 3

    def setitem_op(df):
        item = native_item if isinstance(df, native_pd.DataFrame) else snow_item
@@ -2761,7 +2761,7 @@ def set_loc_helper(df):
         else:
             df.loc[key] = native_item_df

-    expected_join_count = 1 if key == slice(None, None, None) else 4
+    expected_join_count = 1 if key == slice(None, None, None) else 3
     with SqlCounter(query_count=1, join_count=expected_join_count):
         eval_snowpark_pandas_result(snow_df, native_df, set_loc_helper, inplace=True)

@@ -3010,7 +3010,7 @@ def loc_set_helper(df):
             len(row_key) - len(native_item)
         )

-    expected_join_count = 4 if len(item) > 1 else 2
+    expected_join_count = 3 if len(item) > 1 else 2
     # 4 extra queries for index, 1 for converting to native pandas in loc_set_helper, 2 for iter and 1 for tolist
     with SqlCounter(
         query_count=5 if item_type_name == "index" else 1,
@@ -3340,7 +3340,7 @@ def loc_set_helper(df):
         else:
             df.loc[snow_indexers] = item

-    expected_join_count = 4
+    expected_join_count = 3
     if isinstance(indexer[0], slice):
         expected_join_count = 1

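As context for the join counts above, a sketch of the loc-set shape these helpers exercise; the keys, item, and data are illustrative assumptions:

import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
# A list-like row key with a list item is materialized and aligned with the
# frame row by row, which is roughly where the expected joins come from.
df.loc[[0, 2], "b"] = [40, 60]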
tests/integ/modin/frame/test_name.py (2 changes: 1 addition & 1 deletion)
@@ -39,7 +39,7 @@ def test_create_dataframe_from_object_with_name(sample):
     )


-@sql_count_checker(query_count=1, join_count=2, union_count=1)
+@sql_count_checker(query_count=1, join_count=1, union_count=1)
 def test_create_dataframe_from_snowpark_pandas_series():
     df = pd.DataFrame([[2, 3, 4], [5, 6, 7]], columns=["X", "Y", "Z"])
     df = pd.DataFrame([df.X, df.iloc[:, 2]])
tests/integ/modin/frame/test_setitem.py (8 changes: 4 additions & 4 deletions)
@@ -145,7 +145,7 @@ def setitem(df):
         else:
             df[key] = val

-    expected_join_count = 3 if isinstance(key.start, int) else 4
+    expected_join_count = 3

     with SqlCounter(query_count=1, join_count=expected_join_count):
         eval_snowpark_pandas_result(snow_df, native_df, setitem, inplace=True)
@@ -246,7 +246,7 @@ def func_insert_new_column(df):


 # matching_item_row_by_label is False here.
-@sql_count_checker(query_count=2, join_count=8)
+@sql_count_checker(query_count=2, join_count=6)
 def test_df_setitem_array_value():
     # Case: setting an array as a new column (df[col] = arr) copies that data
     data = {"a": [1, 2, 3], "b": [4, 5, 6]}
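A small invented illustration of the copy semantics described in the comment above:

import numpy as np
import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
arr = np.array([7, 8, 9])
df["c"] = arr  # the array's data is copied into the frame
arr[0] = 100   # mutating the source afterwards leaves df["c"] unchanged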
@@ -376,7 +376,7 @@ def func_insert_new_column(df, column):
     elif isinstance(column, native_pd.Index) and not isinstance(
         column, native_pd.DatetimeIndex
     ):
-        expected_join_count = 4
+        expected_join_count = 3

     if (
         key == "a"
@@ -672,7 +672,7 @@ def helper(df):
     def helper(df):
         df["x"] = df.loc[df.b < 0, "b"]

-    with SqlCounter(query_count=1, join_count=3):
+    with SqlCounter(query_count=1, join_count=2):
         eval_snowpark_pandas_result(snow_df, native_df, helper, inplace=True)

tests/integ/modin/groupby/test_groupby_transform.py (2 changes: 1 addition & 1 deletion)
@@ -134,7 +134,7 @@ def test_dataframe_groupby_transform_conflicting_labels_negative():

 @sql_count_checker(
     query_count=11,
-    join_count=10,
+    join_count=8,
     udtf_count=2,
     high_count_expected=True,
     high_count_reason="performing two groupby transform operations that use UDTFs and compare with pandas",
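Because the reason string above ties the high counts to UDTF-backed transforms, a minimal assumed sketch of that operation:

import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

df = pd.DataFrame({"grp": ["x", "x", "y"], "val": [1, 2, 3]})
# The callable is shipped to Snowflake as a UDTF applied per group, which is
# what drives the udtf_count and join_count expectations above.
out = df.groupby("grp").transform(lambda s: s - s.mean())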
tests/integ/modin/pivot/test_pivot_table_dropna.py (22 changes: 11 additions & 11 deletions)
@@ -32,11 +32,11 @@ def test_pivot_table_single_value_with_dropna(df_data_with_nulls, dropna, column
 @pytest.mark.parametrize(
     "aggfunc, expected_join_count",
     [
-        ("mean", 5),
-        ({"D": "max", "E": "sum"}, 3),
-        ({"D": ["count", "max"], "E": ["mean", "sum"]}, 7),
-        ({"D": "min", "E": ["mean"]}, 3),
-        (["min", "max"], 11),
+        ("mean", 3),
+        ({"D": "max", "E": "sum"}, 2),
+        ({"D": ["count", "max"], "E": ["mean", "sum"]}, 4),
+        ({"D": "min", "E": ["mean"]}, 2),
+        (["min", "max"], 6),
     ],
 )
 def test_pivot_table_multiple_values_dropna_nonnull_data(
@@ -60,11 +60,11 @@ def test_pivot_table_multiple_values_dropna_nonnull_data(
 @pytest.mark.parametrize(
     "aggfunc, expected_join_count",
     [
-        ({"E": "count", "F": ["mean", "sum"]}, 5),
-        ({"E": ["min", "max"], "F": ["mean", "sum"]}, 7),
-        (["min", "max"], 7),
-        ({"E": "min", "F": "mean"}, 3),
-        ({"E": "max", "F": "max"}, 3),
+        ({"E": "count", "F": ["mean", "sum"]}, 3),
+        ({"E": ["min", "max"], "F": ["mean", "sum"]}, 4),
+        (["min", "max"], 4),
+        ({"E": "min", "F": "mean"}, 2),
+        ({"E": "max", "F": "max"}, 2),
     ],
 )
 def test_pivot_table_multiple_pivot_values_dropna_null_data(
@@ -106,7 +106,7 @@ def test_pivot_table_single_all_aggfuncs_dropna_and_null_data(
     df_data_with_nulls_2,
     values,
 ):
-    expected_join_count = 19 if len(values) > 1 else 9
+    expected_join_count = 10 if len(values) > 1 else 5
     with SqlCounter(query_count=1, join_count=expected_join_count):
         pivot_table_test_helper(
             df_data_with_nulls_2,
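To ground the parametrizations above, a sketch of the pivot_table call shape being counted; the column names and data are assumptions mirroring the D/E value columns in the fixtures:

import modin.pandas as pd
import snowflake.snowpark.modin.plugin  # noqa: F401

df = pd.DataFrame({
    "A": ["x", "x", "y", "y"],
    "B": ["p", "q", "p", "q"],
    "D": [1, 2, 3, 4],
    "E": [5.0, None, 7.0, 8.0],
})
# Each (value column, aggfunc) pair contributes joins, which is why the
# expected_join_count grows with the aggfunc parametrizations above;
# dropna is parametrized in the tests and also changes the count.
table = df.pivot_table(index="A", columns="B",
                       aggfunc={"D": "max", "E": "sum"}, dropna=False)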