From f6631e6a42e77690f10adb9d28365a0ba6568499 Mon Sep 17 00:00:00 2001
From: FBruzzesi
Date: Wed, 22 Jan 2025 20:10:18 +0100
Subject: [PATCH 1/8] feat: pyspark and duckdb selectors

---
 narwhals/_duckdb/namespace.py     |   7 +
 narwhals/_duckdb/selectors.py     | 225 ++++++++++++++++++++++++++++++
 narwhals/_spark_like/namespace.py |   7 +
 narwhals/_spark_like/selectors.py | 206 +++++++++++++++++++++++++++
 tests/selectors_test.py           |  20 +--
 5 files changed, 451 insertions(+), 14 deletions(-)
 create mode 100644 narwhals/_duckdb/selectors.py
 create mode 100644 narwhals/_spark_like/selectors.py

diff --git a/narwhals/_duckdb/namespace.py b/narwhals/_duckdb/namespace.py
index bbe6d9611..42668919b 100644
--- a/narwhals/_duckdb/namespace.py
+++ b/narwhals/_duckdb/namespace.py
@@ -17,6 +17,7 @@
 from duckdb import FunctionExpression
 
 from narwhals._duckdb.expr import DuckDBExpr
+from narwhals._duckdb.selectors import DuckDBSelectorNamespace
 from narwhals._duckdb.utils import narwhals_to_native_dtype
 from narwhals._expression_parsing import combine_root_names
 from narwhals._expression_parsing import parse_into_exprs
@@ -44,6 +45,12 @@ def __init__(
         self._backend_version = backend_version
         self._version = version
 
+    @property
+    def selectors(self: Self) -> DuckDBSelectorNamespace:
+        return DuckDBSelectorNamespace(
+            backend_version=self._backend_version, version=self._version
+        )
+
     def all(self: Self) -> DuckDBExpr:
         def _all(df: DuckDBLazyFrame) -> list[duckdb.Expression]:
             return [ColumnExpression(col_name) for col_name in df.columns]
diff --git a/narwhals/_duckdb/selectors.py b/narwhals/_duckdb/selectors.py
new file mode 100644
index 000000000..a352c5757
--- /dev/null
+++ b/narwhals/_duckdb/selectors.py
@@ -0,0 +1,225 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from typing import Any
+from typing import NoReturn
+
+from duckdb import ColumnExpression
+from duckdb import Expression
+
+from narwhals._duckdb.expr import DuckDBExpr
+from narwhals._duckdb.utils import get_column_name
+from narwhals.utils import import_dtypes_module
+
+if TYPE_CHECKING:
+    from pyspark.sql import Column
+    from typing_extensions import Self
+
+    from narwhals._duckdb.dataframe import DuckDBLazyFrame
+    from narwhals.dtypes import DType
+    from narwhals.utils import Version
+
+
+class DuckDBSelectorNamespace:
+    def __init__(
+        self: Self, *, backend_version: tuple[int, ...], version: Version
+    ) -> None:
+        self._backend_version = backend_version
+        self._version = version
+
+    def by_dtype(self: Self, dtypes: list[DType | type[DType]]) -> DuckDBSelector:
+        def func(df: DuckDBLazyFrame) -> list[Expression]:
+            return [
+                ColumnExpression(col) for col in df.columns if df.schema[col] in dtypes
+            ]
+
+        return DuckDBSelector(
+            func,
+            depth=0,
+            function_name="type_selector",
+            root_names=None,
+            output_names=None,
+            backend_version=self._backend_version,
+            returns_scalar=False,
+            version=self._version,
+            kwargs={},
+        )
+
+    def numeric(self: Self) -> DuckDBSelector:
+        dtypes = import_dtypes_module(self._version)
+        return self.by_dtype(
+            [
+                dtypes.Int64,
+                dtypes.Int32,
+                dtypes.Int16,
+                dtypes.Int8,
+                dtypes.UInt64,
+                dtypes.UInt32,
+                dtypes.UInt16,
+                dtypes.UInt8,
+                dtypes.Float64,
+                dtypes.Float32,
+            ],
+        )
+
+    def categorical(self: Self) -> DuckDBSelector:
+        dtypes = import_dtypes_module(self._version)
+        return self.by_dtype([dtypes.Categorical])
+
+    def string(self: Self) -> DuckDBSelector:
+        dtypes = import_dtypes_module(self._version)
+        return self.by_dtype([dtypes.String])
+
+    def boolean(self: Self) -> DuckDBSelector:
+        dtypes = import_dtypes_module(self._version)
+        return self.by_dtype([dtypes.Boolean])
+
+    def all(self: Self) -> DuckDBSelector:
+        def func(df: DuckDBLazyFrame) -> list[Any]:
+            return [ColumnExpression(col) for col in df.columns]
+
+        return DuckDBSelector(
+            func,
+            depth=0,
+            function_name="type_selector",
+            root_names=None,
+            output_names=None,
+            backend_version=self._backend_version,
+            returns_scalar=False,
+            version=self._version,
+            kwargs={},
+        )
+
+
+class DuckDBSelector(DuckDBExpr):
+    def __repr__(self: Self) -> str:  # pragma: no cover
+        return (
+            f"DuckDBSelector("
+            f"depth={self._depth}, "
+            f"function_name={self._function_name}, "
+            f"root_names={self._root_names}, "
+            f"output_names={self._output_names}"
+        )
+
+    def _to_expr(self: Self) -> DuckDBExpr:
+        return DuckDBExpr(
+            self._call,
+            depth=self._depth,
+            function_name=self._function_name,
+            root_names=self._root_names,
+            output_names=self._output_names,
+            backend_version=self._backend_version,
+            returns_scalar=self._returns_scalar,
+            version=self._version,
+            kwargs={},
+        )
+
+    def __sub__(self: Self, other: DuckDBSelector | Any) -> DuckDBSelector | Any:
+        if isinstance(other, DuckDBSelector):
+
+            def call(df: DuckDBLazyFrame) -> list[Any]:
+                lhs = self._call(df)
+                rhs = other._call(df)
+                lhs_names = [
+                    get_column_name(df, x, returns_scalar=self._returns_scalar)
+                    for x in lhs
+                ]
+                rhs_names = {
+                    get_column_name(df, x, returns_scalar=other._returns_scalar)
+                    for x in rhs
+                }
+                return [
+                    col for col, name in zip(lhs, lhs_names) if name not in rhs_names
+                ]
+
+            return DuckDBSelector(
+                call,
+                depth=0,
+                function_name="type_selector",
+                root_names=None,
+                output_names=None,
+                backend_version=self._backend_version,
+                returns_scalar=self._returns_scalar,
+                version=self._version,
+                kwargs={},
+            )
+        else:
+            return self._to_expr() - other
+
+    def __or__(self: Self, other: DuckDBSelector | Any) -> DuckDBSelector | Any:
+        if isinstance(other, DuckDBSelector):
+
+            def call(df: DuckDBLazyFrame) -> list[Column]:
+                lhs = self._call(df)
+                rhs = other._call(df)
+                lhs_names = [
+                    get_column_name(df, x, returns_scalar=self._returns_scalar)
+                    for x in lhs
+                ]
+                rhs_names = [
+                    get_column_name(df, x, returns_scalar=other._returns_scalar)
+                    for x in rhs
+                ]
+                return [
+                    *(col for col, name in zip(lhs, lhs_names) if name not in rhs_names),
+                    *rhs,
+                ]
+
+            return DuckDBSelector(
+                call,
+                depth=0,
+                function_name="type_selector",
+                root_names=None,
+                output_names=None,
+                backend_version=self._backend_version,
+                returns_scalar=self._returns_scalar,
+                version=self._version,
+                kwargs={},
+            )
+        else:
+            return self._to_expr() | other
+
+    def __and__(self: Self, other: DuckDBSelector | Any) -> DuckDBSelector | Any:
+        if isinstance(other, DuckDBSelector):
+
+            def call(df: DuckDBLazyFrame) -> list[Any]:
+                lhs = self._call(df)
+                rhs = other._call(df)
+                lhs_names = [
+                    get_column_name(df, x, returns_scalar=self._returns_scalar)
+                    for x in lhs
+                ]
+                rhs_names = {
+                    get_column_name(df, x, returns_scalar=other._returns_scalar)
+                    for x in rhs
+                }
+                return [col for col, name in zip(lhs, lhs_names) if name in rhs_names]
+
+            return DuckDBSelector(
+                call,
+                depth=0,
+                function_name="type_selector",
+                root_names=None,
+                output_names=None,
+                backend_version=self._backend_version,
+                returns_scalar=self._returns_scalar,
+                version=self._version,
+                kwargs={},
+            )
+        else:
+            return self._to_expr() & other
+
+    def __invert__(self: Self) -> DuckDBSelector:
+        return (
+            DuckDBSelectorNamespace(
+                backend_version=self._backend_version, version=self._version
+            ).all()
+            - self
+        )
+
+    def __rsub__(self: Self, other: Any) -> NoReturn:
+        raise NotImplementedError
+
+    def __rand__(self: Self, other: Any) -> NoReturn:
+        raise NotImplementedError
+
+    def __ror__(self: Self, other: Any) -> NoReturn:
+        raise NotImplementedError
diff --git a/narwhals/_spark_like/namespace.py b/narwhals/_spark_like/namespace.py
index 029f77d06..49899cc3a 100644
--- a/narwhals/_spark_like/namespace.py
+++ b/narwhals/_spark_like/namespace.py
@@ -15,6 +15,7 @@
 from narwhals._expression_parsing import reduce_output_names
 from narwhals._spark_like.dataframe import SparkLikeLazyFrame
 from narwhals._spark_like.expr import SparkLikeExpr
+from narwhals._spark_like.selectors import SparkLikeSelectorNamespace
 from narwhals._spark_like.utils import get_column_name
 from narwhals.typing import CompliantNamespace
 
@@ -35,6 +36,12 @@ def __init__(
         self._backend_version = backend_version
         self._version = version
 
+    @property
+    def selectors(self: Self) -> SparkLikeSelectorNamespace:
+        return SparkLikeSelectorNamespace(
+            backend_version=self._backend_version, version=self._version
+        )
+
     def all(self: Self) -> SparkLikeExpr:
         def _all(df: SparkLikeLazyFrame) -> list[Column]:
             import pyspark.sql.functions as F  # noqa: N812
diff --git a/narwhals/_spark_like/selectors.py b/narwhals/_spark_like/selectors.py
new file mode 100644
index 000000000..a13561fa3
--- /dev/null
+++ b/narwhals/_spark_like/selectors.py
@@ -0,0 +1,206 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from typing import Any
+from typing import NoReturn
+
+from pyspark.sql import functions as F  # noqa: N812
+
+from narwhals._spark_like.expr import SparkLikeExpr
+from narwhals._spark_like.utils import get_column_name
+from narwhals.utils import import_dtypes_module
+
+if TYPE_CHECKING:
+    from pyspark.sql import Column
+    from typing_extensions import Self
+
+    from narwhals._spark_like.dataframe import SparkLikeLazyFrame
+    from narwhals.dtypes import DType
+    from narwhals.utils import Version
+
+
+class SparkLikeSelectorNamespace:
+    def __init__(
+        self: Self, *, backend_version: tuple[int, ...], version: Version
+    ) -> None:
+        self._backend_version = backend_version
+        self._version = version
+
+    def by_dtype(self: Self, dtypes: list[DType | type[DType]]) -> SparkLikeSelector:
+        def func(df: SparkLikeLazyFrame) -> list[Any]:
+            return [
+                df._native_frame[col] for col in df.columns if df.schema[col] in dtypes
+            ]
+
+        return SparkLikeSelector(
+            func,
+            depth=0,
+            function_name="type_selector",
+            root_names=None,
+            output_names=None,
+            backend_version=self._backend_version,
+            returns_scalar=False,
+            version=self._version,
+            kwargs={},
+        )
+
+    def numeric(self: Self) -> SparkLikeSelector:
+        dtypes = import_dtypes_module(self._version)
+        return self.by_dtype(
+            [
+                dtypes.Int64,
+                dtypes.Int32,
+                dtypes.Int16,
+                dtypes.Int8,
+                dtypes.UInt64,
+                dtypes.UInt32,
+                dtypes.UInt16,
+                dtypes.UInt8,
+                dtypes.Float64,
+                dtypes.Float32,
+            ],
+        )
+
+    def categorical(self: Self) -> SparkLikeSelector:
+        dtypes = import_dtypes_module(self._version)
+        return self.by_dtype([dtypes.Categorical])
+
+    def string(self: Self) -> SparkLikeSelector:
+        dtypes = import_dtypes_module(self._version)
+        return self.by_dtype([dtypes.String])
+
+    def boolean(self: Self) -> SparkLikeSelector:
+        dtypes = import_dtypes_module(self._version)
+        return self.by_dtype([dtypes.Boolean])
+
+    def all(self: Self) -> SparkLikeSelector:
+        def func(df: SparkLikeLazyFrame) -> list[Any]:
+            return [df._native_frame[col] for col in df.columns]
+
+        return SparkLikeSelector(
+            func,
+            depth=0,
+            function_name="type_selector",
+            root_names=None,
+            output_names=None,
+            backend_version=self._backend_version,
+            returns_scalar=False,
+            version=self._version,
+            kwargs={},
+        )
+
+
+class SparkLikeSelector(SparkLikeExpr):
+    def __repr__(self: Self) -> str:  # pragma: no cover
+        return (
+            f"SparkLikeSelector("
+            f"depth={self._depth}, "
+            f"function_name={self._function_name}, "
+            f"root_names={self._root_names}, "
+            f"output_names={self._output_names}"
+        )
+
+    def _to_expr(self: Self) -> SparkLikeExpr:
+        return SparkLikeExpr(
+            self._call,
+            depth=self._depth,
+            function_name=self._function_name,
+            root_names=self._root_names,
+            output_names=self._output_names,
+            backend_version=self._backend_version,
+            returns_scalar=self._returns_scalar,
+            version=self._version,
+            kwargs={},
+        )
+
+    def __sub__(self: Self, other: SparkLikeSelector | Any) -> SparkLikeSelector | Any:
+        if isinstance(other, SparkLikeSelector):
+
+            def call(df: SparkLikeLazyFrame) -> list[Any]:
+                lhs = self._call(df)
+                rhs = other._call(df)
+                lhs_names = [get_column_name(df, x) for x in lhs]
+                rhs_names = {get_column_name(df, x) for x in rhs}
+                return [
+                    col for col, name in zip(lhs, lhs_names) if name not in rhs_names
+                ]
+
+            return SparkLikeSelector(
+                call,
+                depth=0,
+                function_name="type_selector",
+                root_names=None,
+                output_names=None,
+                backend_version=self._backend_version,
+                returns_scalar=self._returns_scalar,
+                version=self._version,
+                kwargs={},
+            )
+        else:
+            return self._to_expr() - F.lit(other)
+
+    def __or__(self: Self, other: SparkLikeSelector | Any) -> SparkLikeSelector | Any:
+        if isinstance(other, SparkLikeSelector):
+
+            def call(df: SparkLikeLazyFrame) -> list[Column]:
+                lhs = self._call(df)
+                rhs = other._call(df)
+                lhs_names = [get_column_name(df, x) for x in lhs]
+                rhs_names = [get_column_name(df, x) for x in rhs]
+                return [
+                    *(col for col, name in zip(lhs, lhs_names) if name not in rhs_names),
+                    *rhs,
+                ]
+
+            return SparkLikeSelector(
+                call,
+                depth=0,
+                function_name="type_selector",
+                root_names=None,
+                output_names=None,
+                backend_version=self._backend_version,
+                returns_scalar=self._returns_scalar,
+                version=self._version,
+                kwargs={},
+            )
+        else:
+            return self._to_expr() | F.lit(other)
+
+    def __and__(self: Self, other: SparkLikeSelector | Any) -> SparkLikeSelector | Any:
+        if isinstance(other, SparkLikeSelector):
+
+            def call(df: SparkLikeLazyFrame) -> list[Any]:
+                lhs = self._call(df)
+                rhs = other._call(df)
+                lhs_names = [get_column_name(df, x) for x in lhs]
+                rhs_names = {get_column_name(df, x) for x in rhs}
+                return [col for col, name in zip(lhs, lhs_names) if name in rhs_names]
+
+            return SparkLikeSelector(
+                call,
+                depth=0,
+                function_name="type_selector",
+                root_names=None,
+                output_names=None,
+                backend_version=self._backend_version,
+                returns_scalar=self._returns_scalar,
+                version=self._version,
+                kwargs={},
+            )
+        else:
+            return self._to_expr() & F.lit(other)
+
+    def __invert__(self: Self) -> SparkLikeSelector:
+        return (
+            SparkLikeSelectorNamespace(
+                backend_version=self._backend_version, version=self._version
+            ).all()
+            - self
+        )
+
+    def __rsub__(self: Self, other: Any) -> NoReturn:
+        raise NotImplementedError
+
+    def __rand__(self: Self, other: Any) -> NoReturn:
+        raise NotImplementedError
+
+    def __ror__(self: Self, other: Any) -> NoReturn:
+        raise NotImplementedError
diff --git a/tests/selectors_test.py b/tests/selectors_test.py
index 80aa64803..4a22d2cd4 100644
--- a/tests/selectors_test.py
+++ b/tests/selectors_test.py
@@ -23,36 +23,28 @@
 }
 
 
-def test_selectors(constructor: Constructor, request: pytest.FixtureRequest) -> None:
-    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
-        request.applymarker(pytest.mark.xfail)
+def test_selectors(constructor: Constructor) -> None:
     df = nw.from_native(constructor(data))
     result = df.select(by_dtype([nw.Int64, nw.Float64]) + 1)
     expected = {"a": [2, 2, 3], "c": [5.1, 6.0, 7.0]}
     assert_equal_data(result, expected)
 
 
-def test_numeric(constructor: Constructor, request: pytest.FixtureRequest) -> None:
-    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
-        request.applymarker(pytest.mark.xfail)
+def test_numeric(constructor: Constructor) -> None:
     df = nw.from_native(constructor(data))
     result = df.select(numeric() + 1)
     expected = {"a": [2, 2, 3], "c": [5.1, 6.0, 7.0]}
     assert_equal_data(result, expected)
 
 
-def test_boolean(constructor: Constructor, request: pytest.FixtureRequest) -> None:
-    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
-        request.applymarker(pytest.mark.xfail)
+def test_boolean(constructor: Constructor) -> None:
     df = nw.from_native(constructor(data))
     result = df.select(boolean())
     expected = {"d": [True, False, True]}
     assert_equal_data(result, expected)
 
 
-def test_string(constructor: Constructor, request: pytest.FixtureRequest) -> None:
-    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
-        request.applymarker(pytest.mark.xfail)
+def test_string(constructor: Constructor) -> None:
     df = nw.from_native(constructor(data))
     result = df.select(string())
     expected = {"b": ["a", "b", "c"]}
@@ -67,7 +59,7 @@ def test_categorical(
         15,
     ):  # pragma: no cover
         request.applymarker(pytest.mark.xfail)
-    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+    if "pyspark" in str(constructor) or "duckdb" in str(constructor):
         request.applymarker(pytest.mark.xfail)
 
     expected = {"b": ["a", "b", "c"]}
@@ -96,7 +88,7 @@ def test_set_ops(
     expected: list[str],
     request: pytest.FixtureRequest,
 ) -> None:
-    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+    if "duckdb" in str(constructor) and not expected:
         request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     result = df.select(selector).collect_schema().names()

From 782c5b43bf440faccef4033369dfd3108ca41e46 Mon Sep 17 00:00:00 2001
From: FBruzzesi
Date: Wed, 22 Jan 2025 20:13:01 +0100
Subject: [PATCH 2/8] rm categorical

---
 narwhals/_duckdb/selectors.py     | 4 ----
 narwhals/_spark_like/selectors.py | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/narwhals/_duckdb/selectors.py b/narwhals/_duckdb/selectors.py
index a352c5757..453c3c6ab 100644
--- a/narwhals/_duckdb/selectors.py
+++ b/narwhals/_duckdb/selectors.py
@@ -62,10 +62,6 @@ def numeric(self: Self) -> DuckDBSelector:
             ],
         )
 
-    def categorical(self: Self) -> DuckDBSelector:
-        dtypes = import_dtypes_module(self._version)
-        return self.by_dtype([dtypes.Categorical])
-
     def string(self: Self) -> DuckDBSelector:
         dtypes = import_dtypes_module(self._version)
         return self.by_dtype([dtypes.String])
diff --git a/narwhals/_spark_like/selectors.py b/narwhals/_spark_like/selectors.py
index a13561fa3..266ceaa9f 100644
--- a/narwhals/_spark_like/selectors.py
+++ b/narwhals/_spark_like/selectors.py
@@ -61,10 +61,6 @@ def numeric(self: Self) -> SparkLikeSelector:
             ],
         )
 
-    def categorical(self: Self) -> SparkLikeSelector:
-        dtypes = import_dtypes_module(self._version)
-        return self.by_dtype([dtypes.Categorical])
-
     def string(self: Self) -> SparkLikeSelector:
         dtypes = import_dtypes_module(self._version)
         return self.by_dtype([dtypes.String])

From 1d2fd929cb1d182d061e78e8401223257ab0c0cd Mon Sep 17 00:00:00 2001
From: FBruzzesi
Date: Thu, 23 Jan 2025 08:53:00 +0100
Subject: [PATCH 3/8] fixup

---
 narwhals/_duckdb/selectors.py     | 12 ++----------
 narwhals/_spark_like/selectors.py | 10 ----------
 2 files changed, 2 insertions(+), 20 deletions(-)

diff --git a/narwhals/_duckdb/selectors.py b/narwhals/_duckdb/selectors.py
index 453c3c6ab..28c03088c 100644
--- a/narwhals/_duckdb/selectors.py
+++ b/narwhals/_duckdb/selectors.py
@@ -2,7 +2,6 @@
 
 from typing import TYPE_CHECKING
 from typing import Any
-from typing import NoReturn
 
 from duckdb import ColumnExpression
 from duckdb import Expression
@@ -49,10 +48,12 @@ def numeric(self: Self) -> DuckDBSelector:
         dtypes = import_dtypes_module(self._version)
         return self.by_dtype(
             [
+                dtypes.Int128,
                 dtypes.Int64,
                 dtypes.Int32,
                 dtypes.Int16,
                 dtypes.Int8,
+                dtypes.UInt128,
                 dtypes.UInt64,
                 dtypes.UInt32,
                 dtypes.UInt16,
@@ -210,12 +211,3 @@ def __invert__(self: Self) -> DuckDBSelector:
             ).all()
             - self
         )
-
-    def __rsub__(self: Self, other: Any) -> NoReturn:
-        raise NotImplementedError
-
-    def __rand__(self: Self, other: Any) -> NoReturn:
-        raise NotImplementedError
-
-    def __ror__(self: Self, other: Any) -> NoReturn:
-        raise NotImplementedError
diff --git a/narwhals/_spark_like/selectors.py b/narwhals/_spark_like/selectors.py
index 266ceaa9f..42d4619b2 100644
--- a/narwhals/_spark_like/selectors.py
+++ b/narwhals/_spark_like/selectors.py
@@ -2,7 +2,6 @@
 
 from typing import TYPE_CHECKING
 from typing import Any
-from typing import NoReturn
 
 from pyspark.sql import functions as F  # noqa: N812
 
@@ -191,12 +190,3 @@ def __invert__(self: Self) -> SparkLikeSelector:
             ).all()
             - self
         )
-
-    def __rsub__(self: Self, other: Any) -> NoReturn:
-        raise NotImplementedError
-
-    def __rand__(self: Self, other: Any) -> NoReturn:
-        raise NotImplementedError
-
-    def __ror__(self: Self, other: Any) -> NoReturn:
-        raise NotImplementedError

From e0241761bb680ecc20cccf3f55a178c57dad6beb Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Sun, 26 Jan 2025 16:01:21 +0000
Subject: [PATCH 4/8] pyproject.toml

---
 pyproject.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index d283f4d59..1cc6b4dd0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -168,6 +168,8 @@ filterwarnings = [
   'ignore:.*The distutils package is deprecated and slated for removal in Python 3.12:DeprecationWarning:pyspark',
   'ignore:.*distutils Version classes are deprecated. Use packaging.version instead.*:DeprecationWarning:pyspark',
   'ignore:.*is_datetime64tz_dtype is deprecated and will be removed in a future version.*:DeprecationWarning:pyspark',
+  # Warning raised by PyArrow nightly just by importing pandas
+  'ignore:.*Python binding for RankQuantileOptions not exposed:RuntimeWarning:pandas'
 ]
 xfail_strict = true

From 7e26f9df3ff234ddd805d2decde72c006ddf1d5e Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Sun, 26 Jan 2025 16:03:59 +0000
Subject: [PATCH 5/8] warning

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 1cc6b4dd0..dbf21809e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -169,7 +169,7 @@ filterwarnings = [
   'ignore:.*distutils Version classes are deprecated. Use packaging.version instead.*:DeprecationWarning:pyspark',
   'ignore:.*is_datetime64tz_dtype is deprecated and will be removed in a future version.*:DeprecationWarning:pyspark',
   # Warning raised by PyArrow nightly just by importing pandas
-  'ignore:.*Python binding for RankQuantileOptions not exposed:RuntimeWarning:pandas'
+  'ignore:.*Python binding for RankQuantileOptions not exposed:RuntimeWarning:pyarrow'
 ]
 xfail_strict = true

From 68b3a4473ba642ff8258d04c4d5efaf015309d50 Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Sun, 26 Jan 2025 16:20:37 +0000
Subject: [PATCH 6/8] fixup spark

---
 narwhals/_spark_like/selectors.py | 12 +++++++++---
 tests/selectors_test.py           |  1 +
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/narwhals/_spark_like/selectors.py b/narwhals/_spark_like/selectors.py
index a4f52a1df..911457bd2 100644
--- a/narwhals/_spark_like/selectors.py
+++ b/narwhals/_spark_like/selectors.py
@@ -139,7 +139,9 @@ def evaluate_output_names(df: SparkLikeLazyFrame) -> list[str]:
                 kwargs={},
             )
         else:
-            return self._to_expr() - other
+            return self._to_expr() - (
+                other if isinstance(other, SparkLikeExpr) else F.lit(other)
+            )
 
     def __or__(self: Self, other: SparkLikeSelector | Any) -> SparkLikeSelector | Any:
         if isinstance(other, SparkLikeSelector):
@@ -171,7 +173,9 @@ def evaluate_output_names(df: SparkLikeLazyFrame) -> list[str]:
                 kwargs={},
             )
         else:
-            return self._to_expr() | other
+            return self._to_expr() | (
+                other if isinstance(other, SparkLikeExpr) else F.lit(other)
+            )
 
     def __and__(self: Self, other: SparkLikeSelector | Any) -> SparkLikeSelector | Any:
         if isinstance(other, SparkLikeSelector):
@@ -199,7 +203,9 @@ def evaluate_output_names(df: SparkLikeLazyFrame) -> list[str]:
                 kwargs={},
            )
         else:
-            return self._to_expr() & other
+            return self._to_expr() & (
+                other if isinstance(other, SparkLikeExpr) else F.lit(other)
+            )
 
     def __invert__(self: Self) -> SparkLikeSelector:
         return (
diff --git a/tests/selectors_test.py b/tests/selectors_test.py
index 32025fdda..968249453 100644
--- a/tests/selectors_test.py
+++ b/tests/selectors_test.py
@@ -79,6 +79,7 @@ def test_categorical(
         (boolean() & True, ["d"]),
         (boolean() | True, ["d"]),
         (numeric() - 1, ["a", "c"]),
+        (numeric() - nw.col("a"), ["a", "c"]),
         (all(), ["a", "b", "c", "d"]),
     ],
 )

From 7e8a83044a0ef18b73a904ee9fb60fc6aa672272 Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Sun, 26 Jan 2025 16:39:52 +0000
Subject: [PATCH 7/8] polars version fixup

---
 tests/selectors_test.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/tests/selectors_test.py b/tests/selectors_test.py
index 968249453..99740d227 100644
--- a/tests/selectors_test.py
+++ b/tests/selectors_test.py
@@ -11,6 +11,7 @@
 from narwhals.stable.v1.selectors import categorical
 from narwhals.stable.v1.selectors import numeric
 from narwhals.stable.v1.selectors import string
+from tests.utils import POLARS_VERSION
 from tests.utils import PYARROW_VERSION
 from tests.utils import Constructor
 from tests.utils import assert_equal_data
@@ -79,7 +80,6 @@ def test_categorical(
         (boolean() & True, ["d"]),
         (boolean() | True, ["d"]),
         (numeric() - 1, ["a", "c"]),
-        (numeric() - nw.col("a"), ["a", "c"]),
         (all(), ["a", "b", "c", "d"]),
     ],
 )
@@ -96,6 +96,21 @@ def test_set_ops(
     assert sorted(result) == expected
 
 
+def test_subtract_expr(
+    constructor: Constructor,
+    request: pytest.FixtureRequest,
+) -> None:
+    if "polars" in str(constructor) and POLARS_VERSION < (0, 20, 27):
+        # In old Polars versions, cs.numeric() - col('a')
+        # would exclude column 'a' from the result, as opposed to
+        # subtracting it.
+        request.applymarker(pytest.mark.xfail)
+    df = nw.from_native(constructor(data))
+    result = df.select(numeric() - nw.col("a"))
+    expected = {"a": [0, 0, 0], "c": [3.1, 4.0, 4.0]}
+    assert_equal_data(result, expected)
+
+
 def test_set_ops_invalid(constructor: Constructor) -> None:
     df = nw.from_native(constructor(data))
     with pytest.raises((NotImplementedError, ValueError)):

From 4ffebeab0fb219c6930cbcc67991dfd0b34c19c6 Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Sun, 26 Jan 2025 16:56:09 +0000
Subject: [PATCH 8/8] dont cover categorical for duckdb

---
 narwhals/_duckdb/selectors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/narwhals/_duckdb/selectors.py b/narwhals/_duckdb/selectors.py
index 77fd332fa..d72e297fa 100644
--- a/narwhals/_duckdb/selectors.py
+++ b/narwhals/_duckdb/selectors.py
@@ -65,7 +65,7 @@ def numeric(self: Self) -> DuckDBSelector:
             ],
         )
 
-    def categorical(self: Self) -> DuckDBSelector:
+    def categorical(self: Self) -> DuckDBSelector:  # pragma: no cover
         dtypes = import_dtypes_module(self._version)
         return self.by_dtype([dtypes.Categorical])
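
Usage note: below is a minimal sketch of how the selectors introduced by this series can be exercised end-to-end through the stable API, mirroring tests/selectors_test.py. The in-memory DuckDB relation built with duckdb.sql is an illustrative assumption (any object accepted by nw.from_native works the same way), and the explicit DOUBLE casts are there so that column 'c' resolves to Float64 rather than DECIMAL, which numeric() does not match.

    import duckdb
    import narwhals.stable.v1 as nw
    from narwhals.stable.v1.selectors import boolean, numeric, string

    # Same data as the `data` fixture in tests/selectors_test.py; the casts keep
    # column 'c' typed as DOUBLE (Float64), since DECIMAL is not in numeric()'s list.
    rel = duckdb.sql(
        "SELECT * FROM (VALUES "
        "(1, 'a', CAST(4.1 AS DOUBLE), TRUE), "
        "(1, 'b', CAST(5.0 AS DOUBLE), FALSE), "
        "(2, 'c', CAST(6.0 AS DOUBLE), TRUE)"
        ") AS t(a, b, c, d)"
    )
    df = nw.from_native(rel)

    # numeric() resolves to columns 'a' and 'c'; the arithmetic applies to both.
    print(df.select(numeric() + 1).to_native())
    # string() resolves to column 'b', boolean() to column 'd'.
    print(df.select(string()).to_native())
    print(df.select(boolean()).to_native())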