test: decouple pyspark constructor from pandas (#1834)

FBruzzesi authored Jan 20, 2025
1 parent 973b499 commit 82da089
Showing 54 changed files with 140 additions and 128 deletions.
42 changes: 25 additions & 17 deletions tests/conftest.py
@@ -2,6 +2,7 @@

import os
import sys
from copy import deepcopy
from typing import TYPE_CHECKING
from typing import Any
from typing import Callable
Expand All @@ -12,6 +13,8 @@
import pyarrow as pa
import pytest

from narwhals.utils import generate_temporary_column_name

if TYPE_CHECKING:
import duckdb

@@ -67,64 +70,66 @@ def pytest_collection_modifyitems(
item.add_marker(skip_slow)


def pandas_constructor(obj: Any) -> IntoDataFrame:
def pandas_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
return pd.DataFrame(obj) # type: ignore[no-any-return]


def pandas_nullable_constructor(obj: Any) -> IntoDataFrame:
def pandas_nullable_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
return pd.DataFrame(obj).convert_dtypes(dtype_backend="numpy_nullable") # type: ignore[no-any-return]


def pandas_pyarrow_constructor(obj: Any) -> IntoDataFrame:
def pandas_pyarrow_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
return pd.DataFrame(obj).convert_dtypes(dtype_backend="pyarrow") # type: ignore[no-any-return]


def modin_constructor(obj: Any) -> IntoDataFrame: # pragma: no cover
def modin_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame: # pragma: no cover
import modin.pandas as mpd

return mpd.DataFrame(pd.DataFrame(obj)) # type: ignore[no-any-return]


def modin_pyarrow_constructor(obj: Any) -> IntoDataFrame: # pragma: no cover
def modin_pyarrow_constructor(
obj: dict[str, list[Any]],
) -> IntoDataFrame: # pragma: no cover
import modin.pandas as mpd

return mpd.DataFrame(pd.DataFrame(obj)).convert_dtypes(dtype_backend="pyarrow") # type: ignore[no-any-return]


def cudf_constructor(obj: Any) -> IntoDataFrame: # pragma: no cover
def cudf_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame: # pragma: no cover
import cudf

return cudf.DataFrame(obj) # type: ignore[no-any-return]


def polars_eager_constructor(obj: Any) -> IntoDataFrame:
def polars_eager_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
return pl.DataFrame(obj)


def polars_lazy_constructor(obj: Any) -> pl.LazyFrame:
def polars_lazy_constructor(obj: dict[str, list[Any]]) -> pl.LazyFrame:
return pl.LazyFrame(obj)


def duckdb_lazy_constructor(obj: Any) -> duckdb.DuckDBPyRelation:
def duckdb_lazy_constructor(obj: dict[str, list[Any]]) -> duckdb.DuckDBPyRelation:
import duckdb

_df = pl.LazyFrame(obj)
return duckdb.table("_df")


def dask_lazy_p1_constructor(obj: Any) -> IntoFrame: # pragma: no cover
def dask_lazy_p1_constructor(obj: dict[str, list[Any]]) -> IntoFrame: # pragma: no cover
import dask.dataframe as dd

return dd.from_dict(obj, npartitions=1) # type: ignore[no-any-return]


def dask_lazy_p2_constructor(obj: Any) -> IntoFrame: # pragma: no cover
def dask_lazy_p2_constructor(obj: dict[str, list[Any]]) -> IntoFrame: # pragma: no cover
import dask.dataframe as dd

return dd.from_dict(obj, npartitions=2) # type: ignore[no-any-return]


def pyarrow_table_constructor(obj: Any) -> IntoDataFrame:
def pyarrow_table_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
return pa.table(obj) # type: ignore[no-any-return]


@@ -159,13 +164,16 @@ def pyspark_lazy_constructor() -> Callable[[Any], IntoFrame]: # pragma: no cove

register(session.stop)

def _constructor(obj: Any) -> IntoFrame:
pd_df = pd.DataFrame(obj).replace({float("nan"): None}).reset_index()
def _constructor(obj: dict[str, list[Any]]) -> IntoFrame:
_obj = deepcopy(obj)
index_col_name = generate_temporary_column_name(n_bytes=8, columns=list(_obj))
_obj[index_col_name] = list(range(len(_obj[next(iter(_obj))])))

return ( # type: ignore[no-any-return]
session.createDataFrame(pd_df)
session.createDataFrame([*zip(*_obj.values())], schema=[*_obj.keys()])
.repartition(2)
.orderBy("index")
.drop("index")
.orderBy(index_col_name)
.drop(index_col_name)
)

return _constructor
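
For reference, a minimal standalone sketch of the construction path the new fixture uses, assuming pyspark is installed; the local session, sample data, and variable names are illustrative, not part of the diff:

from pyspark.sql import SparkSession

from narwhals.utils import generate_temporary_column_name

session = SparkSession.builder.master("local[1]").appName("sketch").getOrCreate()

data = {"a": [1, 3, 2], "z": [7.0, 8.0, 9.0]}

# Add a throwaway index column so row order can be restored after repartitioning.
index_col = generate_temporary_column_name(n_bytes=8, columns=list(data))
data[index_col] = list(range(len(data["a"])))

# Build rows by zipping the column lists and pass the column names as the schema,
# so no pandas DataFrame is involved at any point.
sdf = (
    session.createDataFrame([*zip(*data.values())], schema=[*data.keys()])
    .repartition(2)
    .orderBy(index_col)
    .drop(index_col)
)
sdf.show()
session.stop()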
2 changes: 1 addition & 1 deletion tests/expr_and_series/arg_max_test.py
@@ -6,7 +6,7 @@
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9], "i": [3, 1, 5]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0], "i": [3, 1, 5]}


def test_expr_arg_max_expr(
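
Most of the remaining files only change numeric literals such as 8 to 8.0 in columns that already contain a float. A plausible reason, not stated in the diff itself: the new pyspark constructor builds rows from plain Python values instead of a pandas frame, so a column whose cells mix int and float no longer arrives with a single float dtype, and keeping every cell a float keeps the row-based schema inference uniform. A Spark-free sketch of the difference:

rows_mixed = [*zip(*{"z": [7.0, 8, 9]}.values())]       # [(7.0,), (8,), (9,)]
rows_float = [*zip(*{"z": [7.0, 8.0, 9.0]}.values())]   # [(7.0,), (8.0,), (9.0,)]

print({type(r[0]).__name__ for r in rows_mixed})   # both 'float' and 'int'
print({type(r[0]).__name__ for r in rows_float})   # only 'float'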
2 changes: 1 addition & 1 deletion tests/expr_and_series/arg_min_test.py
@@ -6,7 +6,7 @@
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}


def test_expr_arg_min_expr(
16 changes: 8 additions & 8 deletions tests/expr_and_series/arithmetic_test.py
@@ -45,7 +45,7 @@ def test_arithmetic_expr(
):
request.applymarker(pytest.mark.xfail)

data = {"a": [1.0, 2, 3]}
data = {"a": [1.0, 2.0, 3.0]}
df = nw.from_native(constructor(data))
result = df.select(getattr(nw.col("a"), attr)(rhs))
assert_equal_data(result, {"a": expected})
@@ -57,7 +57,7 @@
("__radd__", 1, [2, 3, 4]),
("__rsub__", 1, [0, -1, -2]),
("__rmul__", 2, [2, 4, 6]),
("__rtruediv__", 2.0, [2, 1, 2 / 3]),
("__rtruediv__", 2.0, [2.0, 1.0, 2 / 3]),
("__rfloordiv__", 2, [2, 1, 0]),
("__rmod__", 2, [0, 0, 2]),
("__rpow__", 2, [2, 4, 8]),
@@ -119,7 +119,7 @@ def test_arithmetic_series(
("__radd__", 1, [2, 3, 4]),
("__rsub__", 1, [0, -1, -2]),
("__rmul__", 2, [2, 4, 6]),
("__rtruediv__", 2.0, [2, 1, 2 / 3]),
("__rtruediv__", 2.0, [2.0, 1.0, 2 / 3]),
("__rfloordiv__", 2, [2, 1, 0]),
("__rmod__", 2, [0, 0, 2]),
("__rpow__", 2, [2, 4, 8]),
@@ -231,7 +231,7 @@ def test_mod(left: int, right: int) -> None:
("__add__", nw.lit(1), [2, 3, 5]),
("__sub__", nw.lit(1), [0, -1, -3]),
("__mul__", nw.lit(2), [2, 4, 8]),
("__truediv__", nw.lit(2.0), [2, 1, 0.5]),
("__truediv__", nw.lit(2.0), [2.0, 1.0, 0.5]),
("__truediv__", nw.lit(1), [1, 0.5, 0.25]),
("__floordiv__", nw.lit(2), [2, 1, 0]),
("__mod__", nw.lit(3), [0, 1, 3]),
@@ -254,7 +254,7 @@ def test_arithmetic_expr_left_literal(
):
request.applymarker(pytest.mark.xfail)

data = {"a": [1.0, 2, 4]}
data = {"a": [1.0, 2.0, 4.0]}
df = nw.from_native(constructor(data))
result = df.select(getattr(lhs, attr)(nw.col("a")))
assert_equal_data(result, {"literal": expected})
@@ -266,8 +266,8 @@ def test_arithmetic_expr_left_literal(
("__add__", nw.lit(1), [2, 3, 5]),
("__sub__", nw.lit(1), [0, -1, -3]),
("__mul__", nw.lit(2), [2, 4, 8]),
("__truediv__", nw.lit(2.0), [2, 1, 0.5]),
("__truediv__", nw.lit(1), [1, 0.5, 0.25]),
("__truediv__", nw.lit(2.0), [2.0, 1.0, 0.5]),
("__truediv__", nw.lit(1), [1.0, 0.5, 0.25]),
("__floordiv__", nw.lit(2), [2, 1, 0]),
("__mod__", nw.lit(3), [0, 1, 3]),
("__pow__", nw.lit(2), [2, 4, 16]),
@@ -285,7 +285,7 @@ def test_arithmetic_series_left_literal(
):
request.applymarker(pytest.mark.xfail)

data = {"a": [1.0, 2, 4]}
data = {"a": [1.0, 2.0, 4.0]}
df = nw.from_native(constructor_eager(data))
result = df.select(getattr(lhs, attr)(nw.col("a")))
assert_equal_data(result, {"literal": expected})
2 changes: 1 addition & 1 deletion tests/expr_and_series/binary_test.py
@@ -13,7 +13,7 @@ def test_expr_binary(constructor: Constructor, request: pytest.FixtureRequest) -
constructor
):
request.applymarker(pytest.mark.xfail)
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
df_raw = constructor(data)
result = nw.from_native(df_raw).with_columns(
a=(1 + 3 * nw.col("a")) * (1 / nw.col("a")),
2 changes: 1 addition & 1 deletion tests/expr_and_series/double_selected_test.py
@@ -6,7 +6,7 @@


def test_double_selected(constructor: Constructor) -> None:
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7, 8, 9]}
df = nw.from_native(constructor(data))

result = df.select(nw.col("a", "b") * 2)
4 changes: 2 additions & 2 deletions tests/expr_and_series/double_test.py
@@ -6,15 +6,15 @@


def test_double(constructor: Constructor) -> None:
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
df = nw.from_native(constructor(data))
result = df.with_columns(nw.all() * 2)
expected = {"a": [2, 6, 4], "b": [8, 8, 12], "z": [14.0, 16.0, 18.0]}
assert_equal_data(result, expected)


def test_double_alias(constructor: Constructor) -> None:
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
df = nw.from_native(constructor(data))
result = df.with_columns(nw.col("a").alias("o"), nw.all() * 2)
expected = {
8 changes: 4 additions & 4 deletions tests/expr_and_series/fill_null_test.py
@@ -16,9 +16,9 @@ def test_fill_null(request: pytest.FixtureRequest, constructor: Constructor) ->
if "pyspark" in str(constructor):
request.applymarker(pytest.mark.xfail)
data = {
"a": [0.0, None, 2, 3, 4],
"b": [1.0, None, None, 5, 3],
"c": [5.0, None, 3, 2, 1],
"a": [0.0, None, 2.0, 3.0, 4.0],
"b": [1.0, None, None, 5.0, 3.0],
"c": [5.0, None, 3.0, 2.0, 1.0],
}
df = nw.from_native(constructor(data))

@@ -33,7 +33,7 @@

def test_fill_null_exceptions(constructor: Constructor) -> None:
data = {
"a": [0.0, None, 2, 3, 4],
"a": [0.0, None, 2.0, 3.0, 4.0],
}
df = nw.from_native(constructor(data))

4 changes: 2 additions & 2 deletions tests/expr_and_series/is_duplicated_test.py
@@ -11,7 +11,7 @@
def test_is_duplicated_expr(
constructor: Constructor, request: pytest.FixtureRequest
) -> None:
if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
if "duckdb" in str(constructor):
request.applymarker(pytest.mark.xfail)
data = {"a": [1, 1, 2], "b": [1, 2, 3], "index": [0, 1, 2]}
df = nw.from_native(constructor(data))
@@ -23,7 +23,7 @@ def test_is_duplicated_expr(
def test_is_duplicated_w_nulls_expr(
constructor: Constructor, request: pytest.FixtureRequest
) -> None:
if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
if "duckdb" in str(constructor):
request.applymarker(pytest.mark.xfail)
data = {"a": [1, 1, None], "b": [1, None, None], "index": [0, 1, 2]}
df = nw.from_native(constructor(data))
4 changes: 2 additions & 2 deletions tests/expr_and_series/is_unique_test.py
@@ -9,7 +9,7 @@


def test_is_unique_expr(constructor: Constructor, request: pytest.FixtureRequest) -> None:
if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
if "duckdb" in str(constructor):
request.applymarker(pytest.mark.xfail)
data = {
"a": [1, 1, 2],
@@ -29,7 +29,7 @@ def test_is_unique_expr(constructor: Constructor, request: pytest.FixtureRequest
def test_is_unique_w_nulls_expr(
constructor: Constructor, request: pytest.FixtureRequest
) -> None:
if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
if "duckdb" in str(constructor):
request.applymarker(pytest.mark.xfail)
data = {
"a": [None, 1, 2],
4 changes: 2 additions & 2 deletions tests/expr_and_series/lit_test.py
@@ -29,7 +29,7 @@ def test_lit(
) -> None:
if "pyspark" in str(constructor) and dtype is not None:
request.applymarker(pytest.mark.xfail)
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
df_raw = constructor(data)
df = nw.from_native(df_raw).lazy()
result = df.with_columns(nw.lit(2, dtype).alias("lit"))
@@ -43,7 +43,7 @@


def test_lit_error(constructor: Constructor) -> None:
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
df_raw = constructor(data)
df = nw.from_native(df_raw).lazy()
with pytest.raises(
2 changes: 1 addition & 1 deletion tests/expr_and_series/max_test.py
@@ -7,7 +7,7 @@
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}


@pytest.mark.parametrize("expr", [nw.col("a", "b", "z").max(), nw.max("a", "b", "z")])
2 changes: 1 addition & 1 deletion tests/expr_and_series/mean_test.py
@@ -7,7 +7,7 @@
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

data = {"a": [1, 3, 2], "b": [4, 4, 7], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 7], "z": [7.0, 8.0, 9.0]}


@pytest.mark.parametrize("expr", [nw.col("a", "b", "z").mean(), nw.mean("a", "b", "z")])
2 changes: 1 addition & 1 deletion tests/expr_and_series/median_test.py
@@ -11,7 +11,7 @@
data = {
"a": [3, 8, 2, None],
"b": [5, 5, None, 7],
"z": [7.0, 8, 9, None],
"z": [7.0, 8.0, 9.0, None],
"s": ["f", "a", "x", "x"],
}

2 changes: 1 addition & 1 deletion tests/expr_and_series/min_test.py
@@ -7,7 +7,7 @@
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}


@pytest.mark.parametrize("expr", [nw.col("a", "b", "z").min(), nw.min("a", "b", "z")])
2 changes: 1 addition & 1 deletion tests/expr_and_series/n_unique_test.py
@@ -7,7 +7,7 @@

data = {
"a": [1.0, None, None, 3.0],
"b": [1.0, None, 4, 5.0],
"b": [1.0, None, 4.0, 5.0],
}


4 changes: 2 additions & 2 deletions tests/expr_and_series/nth_test.py
@@ -8,15 +8,15 @@
from tests.utils import Constructor
from tests.utils import assert_equal_data

data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8.0, 9.0]}


@pytest.mark.parametrize(
("idx", "expected"),
[
(0, {"a": [1, 3, 2]}),
([0, 1], {"a": [1, 3, 2], "b": [4, 4, 6]}),
([0, 2], {"a": [1, 3, 2], "z": [7.1, 8, 9]}),
([0, 2], {"a": [1, 3, 2], "z": [7.1, 8.0, 9.0]}),
],
)
def test_nth(
2 changes: 1 addition & 1 deletion tests/expr_and_series/null_count_test.py
@@ -7,7 +7,7 @@

data = {
"a": [1.0, None, None, 3.0],
"b": [1.0, None, 4, 5.0],
"b": [1.0, None, 4.0, 5.0],
}

