Skip to content

Commit 364a625

Browse files
feat: pyspark and duckdb selectors (#1853)
--------- Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
1 parent e2ba74b commit 364a625

File tree

9 files changed

+472
-14
lines changed

9 files changed

+472
-14
lines changed

narwhals/_arrow/selectors.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,12 @@ def numeric(self: Self) -> ArrowSelector:
4747
dtypes = import_dtypes_module(self._version)
4848
return self.by_dtype(
4949
[
50+
dtypes.Int128,
5051
dtypes.Int64,
5152
dtypes.Int32,
5253
dtypes.Int16,
5354
dtypes.Int8,
55+
dtypes.UInt128,
5456
dtypes.UInt64,
5557
dtypes.UInt32,
5658
dtypes.UInt16,

narwhals/_dask/selectors.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,12 @@ def numeric(self: Self) -> DaskSelector:
5252
dtypes = import_dtypes_module(self._version)
5353
return self.by_dtype(
5454
[
55+
dtypes.Int128,
5556
dtypes.Int64,
5657
dtypes.Int32,
5758
dtypes.Int16,
5859
dtypes.Int8,
60+
dtypes.UInt128,
5961
dtypes.UInt64,
6062
dtypes.UInt32,
6163
dtypes.UInt16,

narwhals/_duckdb/namespace.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from duckdb import FunctionExpression
1818

1919
from narwhals._duckdb.expr import DuckDBExpr
20+
from narwhals._duckdb.selectors import DuckDBSelectorNamespace
2021
from narwhals._duckdb.utils import narwhals_to_native_dtype
2122
from narwhals._expression_parsing import combine_alias_output_names
2223
from narwhals._expression_parsing import combine_evaluate_output_names
@@ -38,6 +39,12 @@ def __init__(
3839
self._backend_version = backend_version
3940
self._version = version
4041

42+
@property
def selectors(self: Self) -> DuckDBSelectorNamespace:
    """Return a selector namespace built from this namespace's settings.

    The returned ``DuckDBSelectorNamespace`` is created on every access and
    receives this object's ``_backend_version`` and ``_version`` unchanged.
    """
    backend_version = self._backend_version
    version = self._version
    return DuckDBSelectorNamespace(backend_version=backend_version, version=version)
47+
4148
def all(self: Self) -> DuckDBExpr:
4249
def _all(df: DuckDBLazyFrame) -> list[duckdb.Expression]:
4350
return [ColumnExpression(col_name) for col_name in df.columns]

narwhals/_duckdb/selectors.py

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
from typing import Any
5+
from typing import Sequence
6+
7+
from duckdb import ColumnExpression
8+
9+
from narwhals._duckdb.expr import DuckDBExpr
10+
from narwhals.utils import import_dtypes_module
11+
12+
if TYPE_CHECKING:
13+
import duckdb
14+
from typing_extensions import Self
15+
16+
from narwhals._duckdb.dataframe import DuckDBLazyFrame
17+
from narwhals.dtypes import DType
18+
from narwhals.utils import Version
19+
20+
21+
class DuckDBSelectorNamespace:
    """Build ``DuckDBSelector`` objects that pick columns of a DuckDB lazy frame.

    The column set is not fixed when a selector is created: each selector
    holds callables that receive the frame later and inspect its ``columns``
    (and, for dtype-based selectors, its ``schema``) at that point.
    """

    def __init__(
        self: Self, *, backend_version: tuple[int, ...], version: Version
    ) -> None:
        # Both values are forwarded unchanged to every selector this
        # namespace creates.
        self._backend_version = backend_version
        self._version = version

    def by_dtype(self: Self, dtypes: list[DType | type[DType]]) -> DuckDBSelector:
        """Select the columns whose schema dtype is contained in ``dtypes``.

        Args:
            dtypes: Dtype classes or instances. A column is kept when
                ``df.schema[column] in dtypes`` is true.

        Returns:
            A selector yielding one ``ColumnExpression`` per matching column,
            in the order the columns appear in ``df.columns``.
        """

        def evaluate_output_names(df: DuckDBLazyFrame) -> Sequence[str]:
            # Read the schema once per call rather than once per column.
            schema = df.schema
            return [col for col in df.columns if schema[col] in dtypes]

        def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]:
            # Derive the expressions from the selected names so the filter
            # lives in one place and the two results cannot drift apart.
            return [ColumnExpression(col) for col in evaluate_output_names(df)]

        return DuckDBSelector(
            func,
            depth=0,
            function_name="selector",
            evaluate_output_names=evaluate_output_names,
            alias_output_names=None,
            backend_version=self._backend_version,
            returns_scalar=False,
            version=self._version,
            kwargs={},
        )

    def numeric(self: Self) -> DuckDBSelector:
        """Select columns whose dtype is one of the integer or float dtypes below."""
        dtypes = import_dtypes_module(self._version)
        return self.by_dtype(
            [
                dtypes.Int128,
                dtypes.Int64,
                dtypes.Int32,
                dtypes.Int16,
                dtypes.Int8,
                dtypes.UInt128,
                dtypes.UInt64,
                dtypes.UInt32,
                dtypes.UInt16,
                dtypes.UInt8,
                dtypes.Float64,
                dtypes.Float32,
            ],
        )

    def categorical(self: Self) -> DuckDBSelector:  # pragma: no cover
        """Select columns whose dtype is ``Categorical``."""
        dtypes = import_dtypes_module(self._version)
        return self.by_dtype([dtypes.Categorical])

    def string(self: Self) -> DuckDBSelector:
        """Select columns whose dtype is ``String``."""
        dtypes = import_dtypes_module(self._version)
        return self.by_dtype([dtypes.String])

    def boolean(self: Self) -> DuckDBSelector:
        """Select columns whose dtype is ``Boolean``."""
        dtypes = import_dtypes_module(self._version)
        return self.by_dtype([dtypes.Boolean])

    def all(self: Self) -> DuckDBSelector:
        """Select every column of the frame, in ``df.columns`` order."""

        def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]:
            return [ColumnExpression(col) for col in df.columns]

        return DuckDBSelector(
            func,
            depth=0,
            function_name="selector",
            evaluate_output_names=lambda df: df.columns,
            alias_output_names=None,
            backend_version=self._backend_version,
            returns_scalar=False,
            version=self._version,
            kwargs={},
        )
95+
96+
97+
class DuckDBSelector(DuckDBExpr):
    """Expression standing for a set of columns, with set-style operators.

    ``-``, ``|`` and ``&`` between two selectors combine their column sets by
    output name. With any other right-hand operand the selector is first
    converted to a plain ``DuckDBExpr`` and the operator is applied to that.
    """

    def __repr__(self: Self) -> str:  # pragma: no cover
        depth, function_name = self._depth, self._function_name
        return f"DuckDBSelector(depth={depth}, function_name={function_name})"

    def _to_expr(self: Self) -> DuckDBExpr:
        """Return a plain ``DuckDBExpr`` carrying this selector's settings."""
        return DuckDBExpr(
            self._call,
            depth=self._depth,
            function_name=self._function_name,
            evaluate_output_names=self._evaluate_output_names,
            alias_output_names=self._alias_output_names,
            backend_version=self._backend_version,
            returns_scalar=self._returns_scalar,
            version=self._version,
            kwargs={},
        )

    def __sub__(self: Self, other: DuckDBSelector | Any) -> DuckDBSelector | Any:
        """Set difference: columns of ``self`` whose names ``other`` does not select."""
        if not isinstance(other, DuckDBSelector):
            # Not a selector: fall back to ordinary expression subtraction.
            return self._to_expr() - other

        def remaining_names(df: DuckDBLazyFrame) -> list[str]:
            own_names = self._evaluate_output_names(df)
            excluded = other._evaluate_output_names(df)
            return [name for name in own_names if name not in excluded]

        def remaining_exprs(df: DuckDBLazyFrame) -> list[duckdb.Expression]:
            own_names = self._evaluate_output_names(df)
            excluded = other._evaluate_output_names(df)
            own_exprs = self._call(df)
            # Expressions and names are paired positionally.
            return [
                expr for expr, name in zip(own_exprs, own_names) if name not in excluded
            ]

        return DuckDBSelector(
            remaining_exprs,
            depth=0,
            function_name="selector",
            evaluate_output_names=remaining_names,
            alias_output_names=None,
            backend_version=self._backend_version,
            returns_scalar=self._returns_scalar,
            version=self._version,
            kwargs={},
        )

    def __or__(self: Self, other: DuckDBSelector | Any) -> DuckDBSelector | Any:
        """Set union: columns only ``self`` selects, followed by all of ``other``'s."""
        if not isinstance(other, DuckDBSelector):
            # Not a selector: fall back to the ordinary expression ``|``.
            return self._to_expr() | other

        def union_names(df: DuckDBLazyFrame) -> list[str]:
            own_names = self._evaluate_output_names(df)
            other_names = other._evaluate_output_names(df)
            own_only = [name for name in own_names if name not in other_names]
            return own_only + list(other_names)

        def union_exprs(df: DuckDBLazyFrame) -> list[duckdb.Expression]:
            own_names = self._evaluate_output_names(df)
            other_names = other._evaluate_output_names(df)
            own_exprs = self._call(df)
            other_exprs = other._call(df)
            # Drop left-hand columns that the right-hand side also yields,
            # so each shared name appears once.
            own_only = [
                expr
                for expr, name in zip(own_exprs, own_names)
                if name not in other_names
            ]
            return own_only + list(other_exprs)

        return DuckDBSelector(
            union_exprs,
            depth=0,
            function_name="selector",
            evaluate_output_names=union_names,
            alias_output_names=None,
            backend_version=self._backend_version,
            returns_scalar=self._returns_scalar,
            version=self._version,
            kwargs={},
        )

    def __and__(self: Self, other: DuckDBSelector | Any) -> DuckDBSelector | Any:
        """Set intersection: columns of ``self`` whose names ``other`` also selects."""
        if not isinstance(other, DuckDBSelector):
            # Not a selector: fall back to the ordinary expression ``&``.
            return self._to_expr() & other

        def shared_names(df: DuckDBLazyFrame) -> list[str]:
            own_names = self._evaluate_output_names(df)
            other_names = other._evaluate_output_names(df)
            return [name for name in own_names if name in other_names]

        def shared_exprs(df: DuckDBLazyFrame) -> list[duckdb.Expression]:
            own_names = self._evaluate_output_names(df)
            other_names = other._evaluate_output_names(df)
            own_exprs = self._call(df)
            return [
                expr for expr, name in zip(own_exprs, own_names) if name in other_names
            ]

        return DuckDBSelector(
            shared_exprs,
            depth=0,
            function_name="selector",
            evaluate_output_names=shared_names,
            alias_output_names=None,
            backend_version=self._backend_version,
            returns_scalar=self._returns_scalar,
            version=self._version,
            kwargs={},
        )

    def __invert__(self: Self) -> DuckDBSelector:
        """Complement: every column of the frame minus those ``self`` selects."""
        namespace = DuckDBSelectorNamespace(
            backend_version=self._backend_version, version=self._version
        )
        return namespace.all() - self

narwhals/_pandas_like/selectors.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,12 @@ def numeric(self: Self) -> PandasSelector:
5252
dtypes = import_dtypes_module(self._version)
5353
return self.by_dtype(
5454
[
55+
dtypes.Int128,
5556
dtypes.Int64,
5657
dtypes.Int32,
5758
dtypes.Int16,
5859
dtypes.Int8,
60+
dtypes.UInt128,
5961
dtypes.UInt64,
6062
dtypes.UInt32,
6163
dtypes.UInt16,

narwhals/_spark_like/namespace.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from narwhals._expression_parsing import combine_evaluate_output_names
1717
from narwhals._spark_like.dataframe import SparkLikeLazyFrame
1818
from narwhals._spark_like.expr import SparkLikeExpr
19+
from narwhals._spark_like.selectors import SparkLikeSelectorNamespace
1920
from narwhals.typing import CompliantNamespace
2021

2122
if TYPE_CHECKING:
@@ -34,6 +35,12 @@ def __init__(
3435
self._backend_version = backend_version
3536
self._version = version
3637

38+
@property
def selectors(self: Self) -> SparkLikeSelectorNamespace:
    """Return a selector namespace built from this namespace's settings.

    The returned ``SparkLikeSelectorNamespace`` is created on every access and
    receives this object's ``_backend_version`` and ``_version`` unchanged.
    """
    backend_version = self._backend_version
    version = self._version
    return SparkLikeSelectorNamespace(backend_version=backend_version, version=version)
43+
3744
def all(self: Self) -> SparkLikeExpr:
3845
def _all(df: SparkLikeLazyFrame) -> list[Column]:
3946
return [F.col(col_name) for col_name in df.columns]

0 commit comments

Comments
 (0)