Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[SNOW-1705797]: Use cached metadata to make repr faster on simple DataFrames #2760

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -420,10 +420,19 @@ def ensure_row_count_column(self) -> "OrderedDataFrame":
wrap_double_underscore=True,
)[0]
)
ordered_dataframe = self.select(
*self.projected_column_snowflake_quoted_identifiers,
count("*").over().as_(row_count_snowflake_quoted_identifier),
)
if not self.is_projection_of_table():
ordered_dataframe = self.select(
*self.projected_column_snowflake_quoted_identifiers,
count("*").over().as_(row_count_snowflake_quoted_identifier),
)
else:
from snowflake.snowpark.modin.plugin._internal.utils import pandas_lit

row_count = self._dataframe_ref.snowpark_dataframe.count()
ordered_dataframe = self.select(
*self.projected_column_snowflake_quoted_identifiers,
pandas_lit(row_count).as_(row_count_snowflake_quoted_identifier),
)
# inplace update so dataframe_ref can be shared. Note that we keep
# the original ordering columns.
ordered_dataframe.row_count_snowflake_quoted_identifier = (
Expand Down Expand Up @@ -2019,3 +2028,23 @@ def sample(self, n: Optional[int], frac: Optional[float]) -> "OrderedDataFrame":
ordering_columns=self.ordering_columns,
)
)

def is_projection_of_table(self) -> bool:
    """
    Return whether the current OrderedDataFrame is simply a projection of a table.

    Returns:
        bool
            True if the current OrderedDataFrame is simply a projection of a table.
            False if it represents a more complex operation.
    """
    # If we have only performed projections since creating this DataFrame, it will only contain
    # 1 API call in the plan - either `Session.sql` for DataFrames based off of I/O operations
    # e.g. `read_snowflake` or `read_csv`, or `Session.create_dataframe` for DataFrames created
    # out of Python objects.
    # NOTE(review): this checks only the *first* (single) API call; per PR discussion, nested
    # projections still collapse to one call, so this also covers chained column selections.
    snowpark_plan = self._dataframe_ref.snowpark_dataframe._plan
    # Substring match: recorded names can carry suffixes (e.g. "Session.create_dataframe[values]").
    return len(snowpark_plan.api_calls) == 1 and any(
        accepted_api in snowpark_plan.api_calls[0]["name"]
        for accepted_api in ("Session.sql", "Session.create_dataframe")
    )
2 changes: 1 addition & 1 deletion tests/integ/modin/frame/test_add_suffix.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def test_df_add_suffix_snowpark_pandas_series(
)


@sql_count_checker(query_count=2)
@sql_count_checker(query_count=3)
def test_df_add_prefix_snowpark_pandas_df(
default_index_snowpark_pandas_df, default_index_native_df
):
Expand Down
2 changes: 1 addition & 1 deletion tests/integ/modin/frame/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def test_string_sum(data, numeric_only_kwargs):
)


@sql_count_checker(query_count=1)
@sql_count_checker(query_count=2)
def test_string_sum_of_reversed_df():
# check that we get the string concatenation right even when the dataframe
# is not in its original order.
Expand Down
2 changes: 1 addition & 1 deletion tests/integ/modin/frame/test_attrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def func(df):

# Tests that attrs is preserved across `take`, a unary operation that returns a Snowpark pandas object.
# Other unary operators are checked by other tests in the `eval_snowpark_pandas_result` method.
@sql_count_checker(query_count=0)
@sql_count_checker(query_count=1)
def test_df_attrs_take():
def func(df):
df.attrs = {"A": [1], "B": "check me"}
Expand Down
2 changes: 1 addition & 1 deletion tests/integ/modin/frame/test_empty.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
({"A": [np.nan]}, "np nan column"),
],
)
@sql_count_checker(query_count=1)
@sql_count_checker(query_count=2)
def test_dataframe_empty_param(dataframe_input, test_case_name):
eval_snowpark_pandas_result(
pd.DataFrame(dataframe_input),
Expand Down
27 changes: 16 additions & 11 deletions tests/integ/modin/frame/test_getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def get_helper(df):
return df[key]

# 5 extra queries for iter
with SqlCounter(query_count=6 if isinstance(key, native_pd.Index) else 1):
with SqlCounter(query_count=7 if isinstance(key, native_pd.Index) else 1):
eval_snowpark_pandas_result(
default_index_snowpark_pandas_df,
default_index_native_df,
Expand Down Expand Up @@ -119,7 +119,7 @@ def get_helper(df):
snowpark_df = pd.DataFrame(native_df)

# 5 extra queries for iter
with SqlCounter(query_count=6 if isinstance(key, native_pd.Index) else 1):
with SqlCounter(query_count=7 if isinstance(key, native_pd.Index) else 1):
eval_snowpark_pandas_result(
snowpark_df,
native_df,
Expand Down Expand Up @@ -320,15 +320,19 @@ def test_df_getitem_calls_getitem():
slice(-100, None, -2),
],
)
@sql_count_checker(query_count=1)
def test_df_getitem_with_slice(
key, default_index_snowpark_pandas_df, default_index_native_df
):
eval_snowpark_pandas_result(
default_index_snowpark_pandas_df,
default_index_native_df,
lambda df: df[key],
)
if key.start is None:
expected_query_count = 1
else:
expected_query_count = 2
with SqlCounter(query_count=expected_query_count):
eval_snowpark_pandas_result(
default_index_snowpark_pandas_df,
default_index_native_df,
lambda df: df[key],
)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -363,6 +367,7 @@ def test_df_getitem_with_non_int_slice(key):
def test_df_getitem_with_multiindex(
key, default_index_native_df, multiindex_native, native_df_with_multiindex_columns
):
expected_query_count = 2 if isinstance(key, slice) else 1
# Test __getitem__ with df with MultiIndex index.
native_df = default_index_native_df.set_index(multiindex_native)
snowpark_df = pd.DataFrame(native_df)
Expand All @@ -376,21 +381,21 @@ def test_df_getitem_with_multiindex(
)
else _key
)
with SqlCounter(query_count=1):
with SqlCounter(query_count=expected_query_count):
eval_snowpark_pandas_result(snowpark_df, native_df, lambda df: df[_key])

# Test __getitem__ with df with MultiIndex columns.
native_df = native_df_with_multiindex_columns
snowpark_df = pd.DataFrame(native_df)
with SqlCounter(query_count=1):
with SqlCounter(query_count=expected_query_count):
eval_snowpark_pandas_result(
snowpark_df, native_df, lambda df: df[key], check_column_type=False
)

# Test __getitem__ with df with MultiIndex index.
native_df = native_df_with_multiindex_columns.set_index(multiindex_native)
snowpark_df = pd.DataFrame(native_df)
with SqlCounter(query_count=1):
with SqlCounter(query_count=expected_query_count):
eval_snowpark_pandas_result(
snowpark_df, native_df, lambda df: df[key], check_column_type=False
)
Expand Down
60 changes: 31 additions & 29 deletions tests/integ/modin/frame/test_head_tail.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
assert_snowpark_pandas_equals_to_pandas_without_dtypecheck,
eval_snowpark_pandas_result,
)
from tests.integ.utils.sql_counter import sql_count_checker
from tests.integ.utils.sql_counter import SqlCounter


def eval_result_and_query_with_no_join(
Expand All @@ -33,41 +33,43 @@ def eval_result_and_query_with_no_join(
"n",
[1, None, 0, -1, -10, 5, 10],
)
@sql_count_checker(query_count=2)
def test_head_tail(n, default_index_snowpark_pandas_df, default_index_native_df):
eval_snowpark_pandas_result(
default_index_snowpark_pandas_df,
default_index_native_df,
lambda df: (df.head() if n is None else df.head(n)),
comparator=eval_result_and_query_with_no_join,
)
expected_query_count = 2 if n == 0 else 3
with SqlCounter(query_count=expected_query_count):
eval_snowpark_pandas_result(
default_index_snowpark_pandas_df,
default_index_native_df,
lambda df: (df.head() if n is None else df.head(n)),
comparator=eval_result_and_query_with_no_join,
)

eval_snowpark_pandas_result(
default_index_snowpark_pandas_df,
default_index_native_df,
lambda df: (df.tail() if n is None else df.tail(n)),
comparator=eval_result_and_query_with_no_join,
)
eval_snowpark_pandas_result(
default_index_snowpark_pandas_df,
default_index_native_df,
lambda df: (df.tail() if n is None else df.tail(n)),
comparator=eval_result_and_query_with_no_join,
)


@pytest.mark.parametrize(
"n",
[1, None, 0, -1, -10, 5, 10],
)
@sql_count_checker(query_count=2)
def test_empty_dataframe(n, empty_snowpark_pandas_df):
eval_snowpark_pandas_result(
empty_snowpark_pandas_df,
native_pd.DataFrame(),
lambda df: (df.head() if n is None else df.head(n)),
comparator=eval_result_and_query_with_no_join,
check_column_type=False,
)
expected_query_count = 2 if n == 0 else 3
with SqlCounter(query_count=expected_query_count):
eval_snowpark_pandas_result(
empty_snowpark_pandas_df,
native_pd.DataFrame(),
lambda df: (df.head() if n is None else df.head(n)),
comparator=eval_result_and_query_with_no_join,
check_column_type=False,
)

eval_snowpark_pandas_result(
empty_snowpark_pandas_df,
native_pd.DataFrame(),
lambda df: (df.tail() if n is None else df.tail(n)),
comparator=eval_result_and_query_with_no_join,
check_column_type=False,
)
eval_snowpark_pandas_result(
empty_snowpark_pandas_df,
native_pd.DataFrame(),
lambda df: (df.tail() if n is None else df.tail(n)),
comparator=eval_result_and_query_with_no_join,
check_column_type=False,
)
Loading
Loading