test: decouple pyspark constructor from pandas (#1834)

FBruzzesi authored Jan 20, 2025
1 parent 973b499 commit 82da089
Showing 54 changed files with 140 additions and 128 deletions.
42 changes: 25 additions & 17 deletions tests/conftest.py
@@ -2,6 +2,7 @@

import os
import sys
from copy import deepcopy
from typing import TYPE_CHECKING
from typing import Any
from typing import Callable
Expand All @@ -12,6 +13,8 @@
import pyarrow as pa
import pytest

from narwhals.utils import generate_temporary_column_name

if TYPE_CHECKING:
import duckdb

@@ -67,64 +70,66 @@ def pytest_collection_modifyitems(
item.add_marker(skip_slow)


def pandas_constructor(obj: Any) -> IntoDataFrame:
def pandas_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
return pd.DataFrame(obj) # type: ignore[no-any-return]


def pandas_nullable_constructor(obj: Any) -> IntoDataFrame:
def pandas_nullable_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
return pd.DataFrame(obj).convert_dtypes(dtype_backend="numpy_nullable") # type: ignore[no-any-return]


def pandas_pyarrow_constructor(obj: Any) -> IntoDataFrame:
def pandas_pyarrow_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
return pd.DataFrame(obj).convert_dtypes(dtype_backend="pyarrow") # type: ignore[no-any-return]


def modin_constructor(obj: Any) -> IntoDataFrame: # pragma: no cover
def modin_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame: # pragma: no cover
import modin.pandas as mpd

return mpd.DataFrame(pd.DataFrame(obj)) # type: ignore[no-any-return]


def modin_pyarrow_constructor(obj: Any) -> IntoDataFrame: # pragma: no cover
def modin_pyarrow_constructor(
obj: dict[str, list[Any]],
) -> IntoDataFrame: # pragma: no cover
import modin.pandas as mpd

return mpd.DataFrame(pd.DataFrame(obj)).convert_dtypes(dtype_backend="pyarrow") # type: ignore[no-any-return]


def cudf_constructor(obj: Any) -> IntoDataFrame: # pragma: no cover
def cudf_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame: # pragma: no cover
import cudf

return cudf.DataFrame(obj) # type: ignore[no-any-return]


def polars_eager_constructor(obj: Any) -> IntoDataFrame:
def polars_eager_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
return pl.DataFrame(obj)


def polars_lazy_constructor(obj: Any) -> pl.LazyFrame:
def polars_lazy_constructor(obj: dict[str, list[Any]]) -> pl.LazyFrame:
return pl.LazyFrame(obj)


def duckdb_lazy_constructor(obj: Any) -> duckdb.DuckDBPyRelation:
def duckdb_lazy_constructor(obj: dict[str, list[Any]]) -> duckdb.DuckDBPyRelation:
import duckdb

_df = pl.LazyFrame(obj)
return duckdb.table("_df")


def dask_lazy_p1_constructor(obj: Any) -> IntoFrame: # pragma: no cover
def dask_lazy_p1_constructor(obj: dict[str, list[Any]]) -> IntoFrame: # pragma: no cover
import dask.dataframe as dd

return dd.from_dict(obj, npartitions=1) # type: ignore[no-any-return]


def dask_lazy_p2_constructor(obj: Any) -> IntoFrame: # pragma: no cover
def dask_lazy_p2_constructor(obj: dict[str, list[Any]]) -> IntoFrame: # pragma: no cover
import dask.dataframe as dd

return dd.from_dict(obj, npartitions=2) # type: ignore[no-any-return]


def pyarrow_table_constructor(obj: Any) -> IntoDataFrame:
def pyarrow_table_constructor(obj: dict[str, list[Any]]) -> IntoDataFrame:
return pa.table(obj) # type: ignore[no-any-return]


@@ -159,13 +164,16 @@ def pyspark_lazy_constructor() -> Callable[[Any], IntoFrame]: # pragma: no cove

register(session.stop)

def _constructor(obj: Any) -> IntoFrame:
pd_df = pd.DataFrame(obj).replace({float("nan"): None}).reset_index()
def _constructor(obj: dict[str, list[Any]]) -> IntoFrame:
_obj = deepcopy(obj)
index_col_name = generate_temporary_column_name(n_bytes=8, columns=list(_obj))
_obj[index_col_name] = list(range(len(_obj[next(iter(_obj))])))

return ( # type: ignore[no-any-return]
session.createDataFrame(pd_df)
session.createDataFrame([*zip(*_obj.values())], schema=[*_obj.keys()])
.repartition(2)
.orderBy("index")
.drop("index")
.orderBy(index_col_name)
.drop(index_col_name)
)

return _constructor
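
For reference, a minimal standalone sketch of the construction path the new fixture uses, assuming pyspark is installed; the local session, sample data, and variable names are illustrative, not part of the diff:

from pyspark.sql import SparkSession

from narwhals.utils import generate_temporary_column_name

session = SparkSession.builder.master("local[1]").appName("sketch").getOrCreate()

data = {"a": [1, 3, 2], "z": [7.0, 8.0, 9.0]}

# Add a throwaway index column so row order can be restored after repartitioning.
index_col = generate_temporary_column_name(n_bytes=8, columns=list(data))
data[index_col] = list(range(len(data["a"])))

# Build rows by zipping the column lists and pass the column names as the schema,
# so no pandas DataFrame is involved at any point.
sdf = (
    session.createDataFrame([*zip(*data.values())], schema=[*data.keys()])
    .repartition(2)
    .orderBy(index_col)
    .drop(index_col)
)
sdf.show()
session.stop()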
2 changes: 1 addition & 1 deletion tests/expr_and_series/arg_max_test.py
@@ -6,7 +6,7 @@
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9], "i": [3, 1, 5]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0], "i": [3, 1, 5]}


def test_expr_arg_max_expr(
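
Most of the remaining files only change numeric literals such as 8 to 8.0 in columns that already contain a float. A plausible reason, not stated in the diff itself: the new pyspark constructor builds rows from plain Python values instead of a pandas frame, so a column whose cells mix int and float no longer arrives with a single float dtype, and keeping every cell a float keeps the row-based schema inference uniform. A Spark-free sketch of the difference:

rows_mixed = [*zip(*{"z": [7.0, 8, 9]}.values())]       # [(7.0,), (8,), (9,)]
rows_float = [*zip(*{"z": [7.0, 8.0, 9.0]}.values())]   # [(7.0,), (8.0,), (9.0,)]

print({type(r[0]).__name__ for r in rows_mixed})   # both 'float' and 'int'
print({type(r[0]).__name__ for r in rows_float})   # only 'float'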
2 changes: 1 addition & 1 deletion tests/expr_and_series/arg_min_test.py
@@ -6,7 +6,7 @@
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}


def test_expr_arg_min_expr(
16 changes: 8 additions & 8 deletions tests/expr_and_series/arithmetic_test.py
@@ -45,7 +45,7 @@ def test_arithmetic_expr(
):
request.applymarker(pytest.mark.xfail)

data = {"a": [1.0, 2, 3]}
data = {"a": [1.0, 2.0, 3.0]}
df = nw.from_native(constructor(data))
result = df.select(getattr(nw.col("a"), attr)(rhs))
assert_equal_data(result, {"a": expected})
@@ -57,7 +57,7 @@
("__radd__", 1, [2, 3, 4]),
("__rsub__", 1, [0, -1, -2]),
("__rmul__", 2, [2, 4, 6]),
("__rtruediv__", 2.0, [2, 1, 2 / 3]),
("__rtruediv__", 2.0, [2.0, 1.0, 2 / 3]),
("__rfloordiv__", 2, [2, 1, 0]),
("__rmod__", 2, [0, 0, 2]),
("__rpow__", 2, [2, 4, 8]),
@@ -119,7 +119,7 @@ def test_arithmetic_series(
("__radd__", 1, [2, 3, 4]),
("__rsub__", 1, [0, -1, -2]),
("__rmul__", 2, [2, 4, 6]),
("__rtruediv__", 2.0, [2, 1, 2 / 3]),
("__rtruediv__", 2.0, [2.0, 1.0, 2 / 3]),
("__rfloordiv__", 2, [2, 1, 0]),
("__rmod__", 2, [0, 0, 2]),
("__rpow__", 2, [2, 4, 8]),
@@ -231,7 +231,7 @@ def test_mod(left: int, right: int) -> None:
("__add__", nw.lit(1), [2, 3, 5]),
("__sub__", nw.lit(1), [0, -1, -3]),
("__mul__", nw.lit(2), [2, 4, 8]),
("__truediv__", nw.lit(2.0), [2, 1, 0.5]),
("__truediv__", nw.lit(2.0), [2.0, 1.0, 0.5]),
("__truediv__", nw.lit(1), [1, 0.5, 0.25]),
("__floordiv__", nw.lit(2), [2, 1, 0]),
("__mod__", nw.lit(3), [0, 1, 3]),
@@ -254,7 +254,7 @@ def test_arithmetic_expr_left_literal(
):
request.applymarker(pytest.mark.xfail)

data = {"a": [1.0, 2, 4]}
data = {"a": [1.0, 2.0, 4.0]}
df = nw.from_native(constructor(data))
result = df.select(getattr(lhs, attr)(nw.col("a")))
assert_equal_data(result, {"literal": expected})
@@ -266,8 +266,8 @@ def test_arithmetic_expr_left_literal(
("__add__", nw.lit(1), [2, 3, 5]),
("__sub__", nw.lit(1), [0, -1, -3]),
("__mul__", nw.lit(2), [2, 4, 8]),
("__truediv__", nw.lit(2.0), [2, 1, 0.5]),
("__truediv__", nw.lit(1), [1, 0.5, 0.25]),
("__truediv__", nw.lit(2.0), [2.0, 1.0, 0.5]),
("__truediv__", nw.lit(1), [1.0, 0.5, 0.25]),
("__floordiv__", nw.lit(2), [2, 1, 0]),
("__mod__", nw.lit(3), [0, 1, 3]),
("__pow__", nw.lit(2), [2, 4, 16]),
@@ -285,7 +285,7 @@ def test_arithmetic_series_left_literal(
):
request.applymarker(pytest.mark.xfail)

data = {"a": [1.0, 2, 4]}
data = {"a": [1.0, 2.0, 4.0]}
df = nw.from_native(constructor_eager(data))
result = df.select(getattr(lhs, attr)(nw.col("a")))
assert_equal_data(result, {"literal": expected})
2 changes: 1 addition & 1 deletion tests/expr_and_series/binary_test.py
@@ -13,7 +13,7 @@ def test_expr_binary(constructor: Constructor, request: pytest.FixtureRequest) -
constructor
):
request.applymarker(pytest.mark.xfail)
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
df_raw = constructor(data)
result = nw.from_native(df_raw).with_columns(
a=(1 + 3 * nw.col("a")) * (1 / nw.col("a")),
2 changes: 1 addition & 1 deletion tests/expr_and_series/double_selected_test.py
@@ -6,7 +6,7 @@


def test_double_selected(constructor: Constructor) -> None:
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7, 8, 9]}
df = nw.from_native(constructor(data))

result = df.select(nw.col("a", "b") * 2)
4 changes: 2 additions & 2 deletions tests/expr_and_series/double_test.py
@@ -6,15 +6,15 @@


def test_double(constructor: Constructor) -> None:
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
df = nw.from_native(constructor(data))
result = df.with_columns(nw.all() * 2)
expected = {"a": [2, 6, 4], "b": [8, 8, 12], "z": [14.0, 16.0, 18.0]}
assert_equal_data(result, expected)


def test_double_alias(constructor: Constructor) -> None:
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
df = nw.from_native(constructor(data))
result = df.with_columns(nw.col("a").alias("o"), nw.all() * 2)
expected = {
8 changes: 4 additions & 4 deletions tests/expr_and_series/fill_null_test.py
@@ -16,9 +16,9 @@ def test_fill_null(request: pytest.FixtureRequest, constructor: Constructor) ->
if "pyspark" in str(constructor):
request.applymarker(pytest.mark.xfail)
data = {
"a": [0.0, None, 2, 3, 4],
"b": [1.0, None, None, 5, 3],
"c": [5.0, None, 3, 2, 1],
"a": [0.0, None, 2.0, 3.0, 4.0],
"b": [1.0, None, None, 5.0, 3.0],
"c": [5.0, None, 3.0, 2.0, 1.0],
}
df = nw.from_native(constructor(data))

@@ -33,7 +33,7 @@

def test_fill_null_exceptions(constructor: Constructor) -> None:
data = {
"a": [0.0, None, 2, 3, 4],
"a": [0.0, None, 2.0, 3.0, 4.0],
}
df = nw.from_native(constructor(data))

4 changes: 2 additions & 2 deletions tests/expr_and_series/is_duplicated_test.py
@@ -11,7 +11,7 @@
def test_is_duplicated_expr(
constructor: Constructor, request: pytest.FixtureRequest
) -> None:
if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
if "duckdb" in str(constructor):
request.applymarker(pytest.mark.xfail)
data = {"a": [1, 1, 2], "b": [1, 2, 3], "index": [0, 1, 2]}
df = nw.from_native(constructor(data))
@@ -23,7 +23,7 @@ def test_is_duplicated_expr(
def test_is_duplicated_w_nulls_expr(
constructor: Constructor, request: pytest.FixtureRequest
) -> None:
if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
if "duckdb" in str(constructor):
request.applymarker(pytest.mark.xfail)
data = {"a": [1, 1, None], "b": [1, None, None], "index": [0, 1, 2]}
df = nw.from_native(constructor(data))
4 changes: 2 additions & 2 deletions tests/expr_and_series/is_unique_test.py
@@ -9,7 +9,7 @@


def test_is_unique_expr(constructor: Constructor, request: pytest.FixtureRequest) -> None:
if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
if "duckdb" in str(constructor):
request.applymarker(pytest.mark.xfail)
data = {
"a": [1, 1, 2],
@@ -29,7 +29,7 @@ def test_is_unique_expr(constructor: Constructor, request: pytest.FixtureRequest
def test_is_unique_w_nulls_expr(
constructor: Constructor, request: pytest.FixtureRequest
) -> None:
if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
if "duckdb" in str(constructor):
request.applymarker(pytest.mark.xfail)
data = {
"a": [None, 1, 2],
4 changes: 2 additions & 2 deletions tests/expr_and_series/lit_test.py
@@ -29,7 +29,7 @@ def test_lit(
) -> None:
if "pyspark" in str(constructor) and dtype is not None:
request.applymarker(pytest.mark.xfail)
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
df_raw = constructor(data)
df = nw.from_native(df_raw).lazy()
result = df.with_columns(nw.lit(2, dtype).alias("lit"))
@@ -43,7 +43,7 @@


def test_lit_error(constructor: Constructor) -> None:
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}
df_raw = constructor(data)
df = nw.from_native(df_raw).lazy()
with pytest.raises(
2 changes: 1 addition & 1 deletion tests/expr_and_series/max_test.py
@@ -7,7 +7,7 @@
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}


@pytest.mark.parametrize("expr", [nw.col("a", "b", "z").max(), nw.max("a", "b", "z")])
2 changes: 1 addition & 1 deletion tests/expr_and_series/mean_test.py
@@ -7,7 +7,7 @@
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

data = {"a": [1, 3, 2], "b": [4, 4, 7], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 7], "z": [7.0, 8.0, 9.0]}


@pytest.mark.parametrize("expr", [nw.col("a", "b", "z").mean(), nw.mean("a", "b", "z")])
2 changes: 1 addition & 1 deletion tests/expr_and_series/median_test.py
@@ -11,7 +11,7 @@
data = {
"a": [3, 8, 2, None],
"b": [5, 5, None, 7],
"z": [7.0, 8, 9, None],
"z": [7.0, 8.0, 9.0, None],
"s": ["f", "a", "x", "x"],
}

2 changes: 1 addition & 1 deletion tests/expr_and_series/min_test.py
@@ -7,7 +7,7 @@
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8.0, 9.0]}


@pytest.mark.parametrize("expr", [nw.col("a", "b", "z").min(), nw.min("a", "b", "z")])
2 changes: 1 addition & 1 deletion tests/expr_and_series/n_unique_test.py
@@ -7,7 +7,7 @@

data = {
"a": [1.0, None, None, 3.0],
"b": [1.0, None, 4, 5.0],
"b": [1.0, None, 4.0, 5.0],
}


4 changes: 2 additions & 2 deletions tests/expr_and_series/nth_test.py
@@ -8,15 +8,15 @@
from tests.utils import Constructor
from tests.utils import assert_equal_data

data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8, 9]}
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8.0, 9.0]}


@pytest.mark.parametrize(
("idx", "expected"),
[
(0, {"a": [1, 3, 2]}),
([0, 1], {"a": [1, 3, 2], "b": [4, 4, 6]}),
([0, 2], {"a": [1, 3, 2], "z": [7.1, 8, 9]}),
([0, 2], {"a": [1, 3, 2], "z": [7.1, 8.0, 9.0]}),
],
)
def test_nth(
2 changes: 1 addition & 1 deletion tests/expr_and_series/null_count_test.py
@@ -7,7 +7,7 @@

data = {
"a": [1.0, None, None, 3.0],
"b": [1.0, None, 4, 5.0],
"b": [1.0, None, 4.0, 5.0],
}

