From c4dbd3f757043286b11cbef15d7e53c28a65a664 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 10 Jan 2025 16:27:46 +0000 Subject: [PATCH 01/10] restore api completeness for namespaces --- mkdocs.yml | 1 + narwhals/_pandas_like/expr.py | 204 +------------------------ utils/generate_backend_completeness.py | 2 +- 3 files changed, 4 insertions(+), 203 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index 307f8a6aa..9799b56d8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -29,6 +29,7 @@ nav: - Supported DataFrame methods: api-completeness/dataframe.md - Supported LazyFrame methods: api-completeness/lazyframe.md - Supported Expr methods: api-completeness/expr.md + - Supported Expr.dt methods: api-completeness/expr_dt.md - Supported Series methods: api-completeness/series.md - API Reference: - api-reference/index.md diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index c694b3420..0d42861de 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -8,6 +8,8 @@ from narwhals._expression_parsing import reuse_series_implementation from narwhals._expression_parsing import reuse_series_namespace_implementation +from narwhals._pandas_like.expr_dt import PandasLikeExprDateTimeNamespace +from narwhals._pandas_like.expr_str import PandasLikeExprStringNamespace from narwhals._pandas_like.series import PandasLikeSeries from narwhals._pandas_like.utils import rename from narwhals.dependencies import get_numpy @@ -671,208 +673,6 @@ def get_categories(self) -> PandasLikeExpr: ) -class PandasLikeExprStringNamespace: - def __init__(self, expr: PandasLikeExpr) -> None: - self._compliant_expr = expr - - def len_chars( - self, - ) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "str", "len_chars" - ) - - def replace( - self, - pattern: str, - value: str, - *, - literal: bool = False, - n: int = 1, - ) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "replace", - pattern=pattern, - value=value, - literal=literal, - n=n, - ) - - def replace_all( - self, - pattern: str, - value: str, - *, - literal: bool = False, - ) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "replace_all", - pattern=pattern, - value=value, - literal=literal, - ) - - def strip_chars(self, characters: str | None = None) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "strip_chars", - characters=characters, - ) - - def starts_with(self, prefix: str) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "starts_with", - prefix=prefix, - ) - - def ends_with(self, suffix: str) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "ends_with", - suffix=suffix, - ) - - def contains(self, pattern: str, *, literal: bool) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "contains", - pattern=pattern, - literal=literal, - ) - - def slice(self, offset: int, length: int | None = None) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "str", "slice", offset=offset, length=length - ) - - def to_datetime(self: Self, format: str | None) -> PandasLikeExpr: # noqa: A002 - return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "to_datetime", - format=format, - ) - - def to_uppercase(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "to_uppercase", - ) - - def to_lowercase(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "to_lowercase", - ) - - -class PandasLikeExprDateTimeNamespace: - def __init__(self, expr: PandasLikeExpr) -> None: - self._compliant_expr = expr - - def date(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation(self._compliant_expr, "dt", "date") - - def year(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation(self._compliant_expr, "dt", "year") - - def month(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation(self._compliant_expr, "dt", "month") - - def day(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation(self._compliant_expr, "dt", "day") - - def hour(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation(self._compliant_expr, "dt", "hour") - - def minute(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation(self._compliant_expr, "dt", "minute") - - def second(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation(self._compliant_expr, "dt", "second") - - def millisecond(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "millisecond" - ) - - def microsecond(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "microsecond" - ) - - def nanosecond(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "nanosecond" - ) - - def ordinal_day(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "ordinal_day" - ) - - def weekday(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "weekday" - ) - - def total_minutes(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "total_minutes" - ) - - def total_seconds(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "total_seconds" - ) - - def total_milliseconds(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "total_milliseconds" - ) - - def total_microseconds(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "total_microseconds" - ) - - def total_nanoseconds(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "total_nanoseconds" - ) - - def to_string(self, format: str) -> PandasLikeExpr: # noqa: A002 - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "to_string", format=format - ) - - def replace_time_zone(self, time_zone: str | None) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "replace_time_zone", time_zone=time_zone - ) - - def convert_time_zone(self, time_zone: str) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "convert_time_zone", time_zone=time_zone - ) - - def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "timestamp", time_unit=time_unit - ) - - class PandasLikeExprNameNamespace: def __init__(self: Self, expr: PandasLikeExpr) -> None: self._compliant_expr = expr diff --git a/utils/generate_backend_completeness.py b/utils/generate_backend_completeness.py index 397c8e4d6..1286f8a41 100644 --- a/utils/generate_backend_completeness.py +++ b/utils/generate_backend_completeness.py @@ -28,7 +28,7 @@ class Backend(NamedTuple): type_: BackendType -MODULES = ["dataframe", "series", "expr"] +MODULES = ["dataframe", "series", "expr", "expr_dt"] BACKENDS = [ Backend(name="arrow", module="_arrow", type_=BackendType.EAGER), From a6ed1d29dbe89153b8d66248755cc107ff6f6b60 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 11 Jan 2025 07:30:57 +0000 Subject: [PATCH 02/10] refactor pandas_like --- narwhals/_pandas_like/expr.py | 194 +-------------- narwhals/_pandas_like/expr_cat.py | 20 ++ narwhals/_pandas_like/expr_dt.py | 105 ++++++++ narwhals/_pandas_like/expr_list.py | 22 ++ narwhals/_pandas_like/expr_name.py | 175 ++++++++++++++ narwhals/_pandas_like/expr_str.py | 116 +++++++++ narwhals/_pandas_like/series.py | 344 +-------------------------- narwhals/_pandas_like/series_cat.py | 17 ++ narwhals/_pandas_like/series_dt.py | 235 ++++++++++++++++++ narwhals/_pandas_like/series_list.py | 47 ++++ narwhals/_pandas_like/series_str.py | 79 ++++++ 11 files changed, 823 insertions(+), 531 deletions(-) create mode 100644 narwhals/_pandas_like/expr_cat.py create mode 100644 narwhals/_pandas_like/expr_dt.py create mode 100644 narwhals/_pandas_like/expr_list.py create mode 100644 narwhals/_pandas_like/expr_name.py create mode 100644 narwhals/_pandas_like/expr_str.py create mode 100644 narwhals/_pandas_like/series_cat.py create mode 100644 narwhals/_pandas_like/series_dt.py create mode 100644 narwhals/_pandas_like/series_list.py create mode 100644 narwhals/_pandas_like/series_str.py diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 0d42861de..3519674d2 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -7,8 +7,10 @@ from typing import Sequence from narwhals._expression_parsing import reuse_series_implementation -from narwhals._expression_parsing import reuse_series_namespace_implementation +from narwhals._pandas_like.expr_cat import PandasLikeExprCatNamespace from narwhals._pandas_like.expr_dt import PandasLikeExprDateTimeNamespace +from narwhals._pandas_like.expr_list import PandasLikeExprListNamespace +from narwhals._pandas_like.expr_name import PandasLikeExprNameNamespace from narwhals._pandas_like.expr_str import PandasLikeExprStringNamespace from narwhals._pandas_like.series import PandasLikeSeries from narwhals._pandas_like.utils import rename @@ -659,193 +661,3 @@ def name(self: Self) -> PandasLikeExprNameNamespace: @property def list(self: Self) -> PandasLikeExprListNamespace: return PandasLikeExprListNamespace(self) - - -class PandasLikeExprCatNamespace: - def __init__(self, expr: PandasLikeExpr) -> None: - self._compliant_expr = expr - - def get_categories(self) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, - "cat", - "get_categories", - ) - - -class PandasLikeExprNameNamespace: - def __init__(self: Self, expr: PandasLikeExpr) -> None: - self._compliant_expr = expr - - def keep(self: Self) -> PandasLikeExpr: - root_names = self._compliant_expr._root_names - - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.keep`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - - return self._compliant_expr.__class__( - lambda df: [ - series.alias(name) - for series, name in zip(self._compliant_expr._call(df), root_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=root_names, - implementation=self._compliant_expr._implementation, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs=self._compliant_expr._kwargs, - ) - - def map(self: Self, function: Callable[[str], str]) -> PandasLikeExpr: - root_names = self._compliant_expr._root_names - - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.map`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - - output_names = [function(str(name)) for name in root_names] - - return self._compliant_expr.__class__( - lambda df: [ - series.alias(name) - for series, name in zip(self._compliant_expr._call(df), output_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=output_names, - implementation=self._compliant_expr._implementation, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs={**self._compliant_expr._kwargs, "function": function}, - ) - - def prefix(self: Self, prefix: str) -> PandasLikeExpr: - root_names = self._compliant_expr._root_names - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.prefix`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - - output_names = [prefix + str(name) for name in root_names] - return self._compliant_expr.__class__( - lambda df: [ - series.alias(name) - for series, name in zip(self._compliant_expr._call(df), output_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=output_names, - implementation=self._compliant_expr._implementation, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs={**self._compliant_expr._kwargs, "prefix": prefix}, - ) - - def suffix(self: Self, suffix: str) -> PandasLikeExpr: - root_names = self._compliant_expr._root_names - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.suffix`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - - output_names = [str(name) + suffix for name in root_names] - - return self._compliant_expr.__class__( - lambda df: [ - series.alias(name) - for series, name in zip(self._compliant_expr._call(df), output_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=output_names, - implementation=self._compliant_expr._implementation, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs={**self._compliant_expr._kwargs, "suffix": suffix}, - ) - - def to_lowercase(self: Self) -> PandasLikeExpr: - root_names = self._compliant_expr._root_names - - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.to_lowercase`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - output_names = [str(name).lower() for name in root_names] - - return self._compliant_expr.__class__( - lambda df: [ - series.alias(name) - for series, name in zip(self._compliant_expr._call(df), output_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=output_names, - implementation=self._compliant_expr._implementation, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs=self._compliant_expr._kwargs, - ) - - def to_uppercase(self: Self) -> PandasLikeExpr: - root_names = self._compliant_expr._root_names - - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.to_uppercase`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - output_names = [str(name).upper() for name in root_names] - - return self._compliant_expr.__class__( - lambda df: [ - series.alias(name) - for series, name in zip(self._compliant_expr._call(df), output_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=output_names, - implementation=self._compliant_expr._implementation, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs=self._compliant_expr._kwargs, - ) - - -class PandasLikeExprListNamespace: - def __init__(self: Self, expr: PandasLikeExpr) -> None: - self._expr = expr - - def len(self: Self) -> PandasLikeExpr: - return reuse_series_namespace_implementation( - self._expr, - "list", - "len", - ) diff --git a/narwhals/_pandas_like/expr_cat.py b/narwhals/_pandas_like/expr_cat.py new file mode 100644 index 000000000..5dd0c7e1f --- /dev/null +++ b/narwhals/_pandas_like/expr_cat.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from narwhals._expression_parsing import reuse_series_namespace_implementation + +if TYPE_CHECKING: + from narwhals._pandas_like.expr import PandasLikeExpr + + +class PandasLikeExprCatNamespace: + def __init__(self, expr: PandasLikeExpr) -> None: + self._compliant_expr = expr + + def get_categories(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, + "cat", + "get_categories", + ) diff --git a/narwhals/_pandas_like/expr_dt.py b/narwhals/_pandas_like/expr_dt.py new file mode 100644 index 000000000..13e94080e --- /dev/null +++ b/narwhals/_pandas_like/expr_dt.py @@ -0,0 +1,105 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Literal + +from narwhals._expression_parsing import reuse_series_namespace_implementation + +if TYPE_CHECKING: + from narwhals._pandas_like.expr import PandasLikeExpr + + +class PandasLikeExprDateTimeNamespace: + def __init__(self, expr: PandasLikeExpr) -> None: + self._compliant_expr = expr + + def date(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation(self._compliant_expr, "dt", "date") + + def year(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation(self._compliant_expr, "dt", "year") + + def month(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation(self._compliant_expr, "dt", "month") + + def day(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation(self._compliant_expr, "dt", "day") + + def hour(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation(self._compliant_expr, "dt", "hour") + + def minute(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation(self._compliant_expr, "dt", "minute") + + def second(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation(self._compliant_expr, "dt", "second") + + def millisecond(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "millisecond" + ) + + def microsecond(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "microsecond" + ) + + def nanosecond(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "nanosecond" + ) + + def ordinal_day(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "ordinal_day" + ) + + def weekday(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "weekday" + ) + + def total_minutes(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "total_minutes" + ) + + def total_seconds(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "total_seconds" + ) + + def total_milliseconds(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "total_milliseconds" + ) + + def total_microseconds(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "total_microseconds" + ) + + def total_nanoseconds(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "total_nanoseconds" + ) + + def to_string(self, format: str) -> PandasLikeExpr: # noqa: A002 + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "to_string", format=format + ) + + def replace_time_zone(self, time_zone: str | None) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "replace_time_zone", time_zone=time_zone + ) + + def convert_time_zone(self, time_zone: str) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "convert_time_zone", time_zone=time_zone + ) + + def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "timestamp", time_unit=time_unit + ) diff --git a/narwhals/_pandas_like/expr_list.py b/narwhals/_pandas_like/expr_list.py new file mode 100644 index 000000000..865f73a8e --- /dev/null +++ b/narwhals/_pandas_like/expr_list.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from narwhals._expression_parsing import reuse_series_namespace_implementation + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._pandas_like.expr import PandasLikeExpr + + +class PandasLikeExprListNamespace: + def __init__(self: Self, expr: PandasLikeExpr) -> None: + self._expr = expr + + def len(self: Self) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._expr, + "list", + "len", + ) diff --git a/narwhals/_pandas_like/expr_name.py b/narwhals/_pandas_like/expr_name.py new file mode 100644 index 000000000..0050e698b --- /dev/null +++ b/narwhals/_pandas_like/expr_name.py @@ -0,0 +1,175 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Callable + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._pandas_like.expr import PandasLikeExpr + + +class PandasLikeExprNameNamespace: + def __init__(self: Self, expr: PandasLikeExpr) -> None: + self._compliant_expr = expr + + def keep(self: Self) -> PandasLikeExpr: + root_names = self._compliant_expr._root_names + + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.keep`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + + return self._compliant_expr.__class__( + lambda df: [ + series.alias(name) + for series, name in zip(self._compliant_expr._call(df), root_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=root_names, + implementation=self._compliant_expr._implementation, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs=self._compliant_expr._kwargs, + ) + + def map(self: Self, function: Callable[[str], str]) -> PandasLikeExpr: + root_names = self._compliant_expr._root_names + + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.map`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + + output_names = [function(str(name)) for name in root_names] + + return self._compliant_expr.__class__( + lambda df: [ + series.alias(name) + for series, name in zip(self._compliant_expr._call(df), output_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=output_names, + implementation=self._compliant_expr._implementation, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs={**self._compliant_expr._kwargs, "function": function}, + ) + + def prefix(self: Self, prefix: str) -> PandasLikeExpr: + root_names = self._compliant_expr._root_names + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.prefix`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + + output_names = [prefix + str(name) for name in root_names] + return self._compliant_expr.__class__( + lambda df: [ + series.alias(name) + for series, name in zip(self._compliant_expr._call(df), output_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=output_names, + implementation=self._compliant_expr._implementation, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs={**self._compliant_expr._kwargs, "prefix": prefix}, + ) + + def suffix(self: Self, suffix: str) -> PandasLikeExpr: + root_names = self._compliant_expr._root_names + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.suffix`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + + output_names = [str(name) + suffix for name in root_names] + + return self._compliant_expr.__class__( + lambda df: [ + series.alias(name) + for series, name in zip(self._compliant_expr._call(df), output_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=output_names, + implementation=self._compliant_expr._implementation, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs={**self._compliant_expr._kwargs, "suffix": suffix}, + ) + + def to_lowercase(self: Self) -> PandasLikeExpr: + root_names = self._compliant_expr._root_names + + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.to_lowercase`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + output_names = [str(name).lower() for name in root_names] + + return self._compliant_expr.__class__( + lambda df: [ + series.alias(name) + for series, name in zip(self._compliant_expr._call(df), output_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=output_names, + implementation=self._compliant_expr._implementation, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs=self._compliant_expr._kwargs, + ) + + def to_uppercase(self: Self) -> PandasLikeExpr: + root_names = self._compliant_expr._root_names + + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.to_uppercase`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + output_names = [str(name).upper() for name in root_names] + + return self._compliant_expr.__class__( + lambda df: [ + series.alias(name) + for series, name in zip(self._compliant_expr._call(df), output_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=output_names, + implementation=self._compliant_expr._implementation, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs=self._compliant_expr._kwargs, + ) diff --git a/narwhals/_pandas_like/expr_str.py b/narwhals/_pandas_like/expr_str.py new file mode 100644 index 000000000..c0637d857 --- /dev/null +++ b/narwhals/_pandas_like/expr_str.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from narwhals._expression_parsing import reuse_series_namespace_implementation + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._pandas_like.expr import PandasLikeExpr + + +class PandasLikeExprStringNamespace: + def __init__(self, expr: PandasLikeExpr) -> None: + self._compliant_expr = expr + + def len_chars( + self, + ) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "str", "len_chars" + ) + + def replace( + self, + pattern: str, + value: str, + *, + literal: bool = False, + n: int = 1, + ) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, + "str", + "replace", + pattern=pattern, + value=value, + literal=literal, + n=n, + ) + + def replace_all( + self, + pattern: str, + value: str, + *, + literal: bool = False, + ) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, + "str", + "replace_all", + pattern=pattern, + value=value, + literal=literal, + ) + + def strip_chars(self, characters: str | None = None) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, + "str", + "strip_chars", + characters=characters, + ) + + def starts_with(self, prefix: str) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, + "str", + "starts_with", + prefix=prefix, + ) + + def ends_with(self, suffix: str) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, + "str", + "ends_with", + suffix=suffix, + ) + + def contains(self, pattern: str, *, literal: bool) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, + "str", + "contains", + pattern=pattern, + literal=literal, + ) + + def slice(self, offset: int, length: int | None = None) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "str", "slice", offset=offset, length=length + ) + + def to_datetime(self: Self, format: str | None) -> PandasLikeExpr: # noqa: A002 + return reuse_series_namespace_implementation( + self._compliant_expr, + "str", + "to_datetime", + format=format, + ) + + def to_uppercase(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, + "str", + "to_uppercase", + ) + + def to_lowercase(self) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, + "str", + "to_lowercase", + ) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 35ec672e4..139c9a3b0 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -8,17 +8,17 @@ from typing import Sequence from typing import overload +from narwhals._pandas_like.series_cat import PandasLikeSeriesCatNamespace +from narwhals._pandas_like.series_dt import PandasLikeSeriesDateTimeNamespace +from narwhals._pandas_like.series_list import PandasLikeSeriesListNamespace +from narwhals._pandas_like.series_str import PandasLikeSeriesStringNamespace from narwhals._pandas_like.utils import broadcast_align_and_extract_native -from narwhals._pandas_like.utils import calculate_timestamp_date -from narwhals._pandas_like.utils import calculate_timestamp_datetime -from narwhals._pandas_like.utils import int_dtype_mapper from narwhals._pandas_like.utils import narwhals_to_native_dtype from narwhals._pandas_like.utils import native_series_from_iterable from narwhals._pandas_like.utils import native_to_narwhals_dtype from narwhals._pandas_like.utils import rename from narwhals._pandas_like.utils import select_columns_by_name from narwhals._pandas_like.utils import set_index -from narwhals._pandas_like.utils import to_datetime from narwhals.dependencies import is_numpy_scalar from narwhals.exceptions import InvalidOperationError from narwhals.typing import CompliantSeries @@ -1187,339 +1187,3 @@ def cat(self) -> PandasLikeSeriesCatNamespace: @property def list(self) -> PandasLikeSeriesListNamespace: return PandasLikeSeriesListNamespace(self) - - -class PandasLikeSeriesCatNamespace: - def __init__(self, series: PandasLikeSeries) -> None: - self._compliant_series = series - - def get_categories(self) -> PandasLikeSeries: - s = self._compliant_series._native_series - return self._compliant_series._from_native_series( - s.__class__(s.cat.categories, name=s.name) - ) - - -class PandasLikeSeriesStringNamespace: - def __init__(self, series: PandasLikeSeries) -> None: - self._compliant_series = series - - def len_chars(self) -> PandasLikeSeries: - return self._compliant_series._from_native_series( - self._compliant_series._native_series.str.len() - ) - - def replace( - self, pattern: str, value: str, *, literal: bool = False, n: int = 1 - ) -> PandasLikeSeries: - return self._compliant_series._from_native_series( - self._compliant_series._native_series.str.replace( - pat=pattern, repl=value, n=n, regex=not literal - ), - ) - - def replace_all( - self, pattern: str, value: str, *, literal: bool = False - ) -> PandasLikeSeries: - return self.replace(pattern, value, literal=literal, n=-1) - - def strip_chars(self, characters: str | None) -> PandasLikeSeries: - return self._compliant_series._from_native_series( - self._compliant_series._native_series.str.strip(characters), - ) - - def starts_with(self, prefix: str) -> PandasLikeSeries: - return self._compliant_series._from_native_series( - self._compliant_series._native_series.str.startswith(prefix), - ) - - def ends_with(self, suffix: str) -> PandasLikeSeries: - return self._compliant_series._from_native_series( - self._compliant_series._native_series.str.endswith(suffix), - ) - - def contains(self, pattern: str, *, literal: bool = False) -> PandasLikeSeries: - return self._compliant_series._from_native_series( - self._compliant_series._native_series.str.contains( - pat=pattern, regex=not literal - ) - ) - - def slice(self, offset: int, length: int | None = None) -> PandasLikeSeries: - stop = offset + length if length else None - return self._compliant_series._from_native_series( - self._compliant_series._native_series.str.slice(start=offset, stop=stop), - ) - - def to_datetime(self: Self, format: str | None) -> PandasLikeSeries: # noqa: A002 - return self._compliant_series._from_native_series( - to_datetime(self._compliant_series._implementation)( - self._compliant_series._native_series, format=format - ) - ) - - def to_uppercase(self) -> PandasLikeSeries: - return self._compliant_series._from_native_series( - self._compliant_series._native_series.str.upper(), - ) - - def to_lowercase(self) -> PandasLikeSeries: - return self._compliant_series._from_native_series( - self._compliant_series._native_series.str.lower(), - ) - - -class PandasLikeSeriesDateTimeNamespace: - def __init__(self, series: PandasLikeSeries) -> None: - self._compliant_series = series - - def date(self) -> PandasLikeSeries: - result = self._compliant_series._from_native_series( - self._compliant_series._native_series.dt.date, - ) - if str(result.dtype).lower() == "object": - msg = ( - "Accessing `date` on the default pandas backend " - "will return a Series of type `object`." - "\nThis differs from polars API and will prevent `.dt` chaining. " - "Please switch to the `pyarrow` backend:" - '\ndf.convert_dtypes(dtype_backend="pyarrow")' - ) - raise NotImplementedError(msg) - return result - - def year(self) -> PandasLikeSeries: - return self._compliant_series._from_native_series( - self._compliant_series._native_series.dt.year, - ) - - def month(self) -> PandasLikeSeries: - return self._compliant_series._from_native_series( - self._compliant_series._native_series.dt.month, - ) - - def day(self) -> PandasLikeSeries: - return self._compliant_series._from_native_series( - self._compliant_series._native_series.dt.day, - ) - - def hour(self) -> PandasLikeSeries: - return self._compliant_series._from_native_series( - self._compliant_series._native_series.dt.hour, - ) - - def minute(self) -> PandasLikeSeries: - return self._compliant_series._from_native_series( - self._compliant_series._native_series.dt.minute, - ) - - def second(self) -> PandasLikeSeries: - return self._compliant_series._from_native_series( - self._compliant_series._native_series.dt.second, - ) - - def millisecond(self) -> PandasLikeSeries: - return self.microsecond() // 1000 - - def microsecond(self) -> PandasLikeSeries: - if self._compliant_series._backend_version < (3, 0, 0) and "pyarrow" in str( - self._compliant_series._native_series.dtype - ): - # crazy workaround for https://github.com/pandas-dev/pandas/issues/59154 - import pyarrow.compute as pc # ignore-banned-import() - - native_series = self._compliant_series._native_series - arr = native_series.array.__arrow_array__() - result_arr = pc.add( - pc.multiply(pc.millisecond(arr), 1000), pc.microsecond(arr) - ) - result = native_series.__class__( - native_series.array.__class__(result_arr), name=native_series.name - ) - return self._compliant_series._from_native_series(result) - - return self._compliant_series._from_native_series( - self._compliant_series._native_series.dt.microsecond - ) - - def nanosecond(self) -> PandasLikeSeries: - return ( # type: ignore[no-any-return] - self.microsecond() * 1_000 - + self._compliant_series._native_series.dt.nanosecond - ) - - def ordinal_day(self) -> PandasLikeSeries: - ser = self._compliant_series._native_series - year_start = ser.dt.year - result = ( - ser.to_numpy().astype("datetime64[D]") - - (year_start.to_numpy() - 1970).astype("datetime64[Y]") - ).astype("int32") + 1 - dtype = "Int64[pyarrow]" if "pyarrow" in str(ser.dtype) else "int32" - return self._compliant_series._from_native_series( - self._compliant_series._native_series.__class__( - result, dtype=dtype, name=year_start.name - ) - ) - - def weekday(self) -> PandasLikeSeries: - return ( - self._compliant_series._from_native_series( - self._compliant_series._native_series.dt.weekday, - ) - + 1 # Pandas is 0-6 while Polars is 1-7 - ) - - def _get_total_seconds(self) -> Any: - if hasattr(self._compliant_series._native_series.dt, "total_seconds"): - return self._compliant_series._native_series.dt.total_seconds() - else: # pragma: no cover - return ( - self._compliant_series._native_series.dt.days * 86400 - + self._compliant_series._native_series.dt.seconds - + (self._compliant_series._native_series.dt.microseconds / 1e6) - + (self._compliant_series._native_series.dt.nanoseconds / 1e9) - ) - - def total_minutes(self) -> PandasLikeSeries: - s = self._get_total_seconds() - s_sign = ( - 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 - ) # this calculates the sign of each series element - s_abs = s.abs() // 60 - if ~s.isna().any(): - s_abs = s_abs.astype(int_dtype_mapper(s.dtype)) - return self._compliant_series._from_native_series(s_abs * s_sign) - - def total_seconds(self) -> PandasLikeSeries: - s = self._get_total_seconds() - s_sign = ( - 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 - ) # this calculates the sign of each series element - s_abs = s.abs() // 1 - if ~s.isna().any(): - s_abs = s_abs.astype(int_dtype_mapper(s.dtype)) - return self._compliant_series._from_native_series(s_abs * s_sign) - - def total_milliseconds(self) -> PandasLikeSeries: - s = self._get_total_seconds() * 1e3 - s_sign = ( - 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 - ) # this calculates the sign of each series element - s_abs = s.abs() // 1 - if ~s.isna().any(): - s_abs = s_abs.astype(int_dtype_mapper(s.dtype)) - return self._compliant_series._from_native_series(s_abs * s_sign) - - def total_microseconds(self) -> PandasLikeSeries: - s = self._get_total_seconds() * 1e6 - s_sign = ( - 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 - ) # this calculates the sign of each series element - s_abs = s.abs() // 1 - if ~s.isna().any(): - s_abs = s_abs.astype(int_dtype_mapper(s.dtype)) - return self._compliant_series._from_native_series(s_abs * s_sign) - - def total_nanoseconds(self) -> PandasLikeSeries: - s = self._get_total_seconds() * 1e9 - s_sign = ( - 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 - ) # this calculates the sign of each series element - s_abs = s.abs() // 1 - if ~s.isna().any(): - s_abs = s_abs.astype(int_dtype_mapper(s.dtype)) - return self._compliant_series._from_native_series(s_abs * s_sign) - - def to_string(self, format: str) -> PandasLikeSeries: # noqa: A002 - # Polars' parser treats `'%.f'` as pandas does `'.%f'` - # PyArrow interprets `'%S'` as "seconds, plus fractional seconds" - # and doesn't support `%f` - if "pyarrow" not in str(self._compliant_series._native_series.dtype): - format = format.replace("%S%.f", "%S.%f") - else: - format = format.replace("%S.%f", "%S").replace("%S%.f", "%S") - return self._compliant_series._from_native_series( - self._compliant_series._native_series.dt.strftime(format) - ) - - def replace_time_zone(self, time_zone: str | None) -> PandasLikeSeries: - if time_zone is not None: - result = self._compliant_series._native_series.dt.tz_localize( - None - ).dt.tz_localize(time_zone) - else: - result = self._compliant_series._native_series.dt.tz_localize(None) - return self._compliant_series._from_native_series(result) - - def convert_time_zone(self, time_zone: str) -> PandasLikeSeries: - if self._compliant_series.dtype.time_zone is None: # type: ignore[attr-defined] - result = self._compliant_series._native_series.dt.tz_localize( - "UTC" - ).dt.tz_convert(time_zone) - else: - result = self._compliant_series._native_series.dt.tz_convert(time_zone) - return self._compliant_series._from_native_series(result) - - def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeSeries: - s = self._compliant_series._native_series - dtype = self._compliant_series.dtype - is_pyarrow_dtype = "pyarrow" in str(self._compliant_series._native_series.dtype) - mask_na = s.isna() - dtypes = import_dtypes_module(self._compliant_series._version) - if dtype == dtypes.Date: - # Date is only supported in pandas dtypes if pyarrow-backed - s_cast = s.astype("Int32[pyarrow]") - result = calculate_timestamp_date(s_cast, time_unit) - elif dtype == dtypes.Datetime: - original_time_unit = dtype.time_unit # type: ignore[attr-defined] - if ( - self._compliant_series._implementation is Implementation.PANDAS - and self._compliant_series._backend_version < (2,) - ): # pragma: no cover - s_cast = s.view("Int64[pyarrow]") if is_pyarrow_dtype else s.view("int64") - else: - s_cast = ( - s.astype("Int64[pyarrow]") if is_pyarrow_dtype else s.astype("int64") - ) - result = calculate_timestamp_datetime(s_cast, original_time_unit, time_unit) - else: - msg = "Input should be either of Date or Datetime type" - raise TypeError(msg) - result[mask_na] = None - return self._compliant_series._from_native_series(result) - - -class PandasLikeSeriesListNamespace: - def __init__(self, series: PandasLikeSeries) -> None: - self._compliant_series = series - - def len(self: Self) -> PandasLikeSeries: - from narwhals.utils import import_dtypes_module - - native_series = self._compliant_series._native_series - native_result = native_series.list.len() - - if ( - self._compliant_series._implementation is Implementation.PANDAS - and self._compliant_series._backend_version < (3, 0) - ): # pragma: no cover - native_result = set_index( - rename( - native_result, - native_series.name, - implementation=self._compliant_series._implementation, - backend_version=self._compliant_series._backend_version, - ), - index=native_series.index, - implementation=self._compliant_series._implementation, - backend_version=self._compliant_series._backend_version, - ) - dtype = narwhals_to_native_dtype( - dtype=import_dtypes_module(self._compliant_series._version).UInt32(), - starting_dtype=native_result.dtype, - implementation=self._compliant_series._implementation, - backend_version=self._compliant_series._backend_version, - version=self._compliant_series._version, - ) - return self._compliant_series._from_native_series(native_result.astype(dtype)) diff --git a/narwhals/_pandas_like/series_cat.py b/narwhals/_pandas_like/series_cat.py new file mode 100644 index 000000000..13deb36bd --- /dev/null +++ b/narwhals/_pandas_like/series_cat.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from narwhals._pandas_like.series import PandasLikeSeries + + +class PandasLikeSeriesCatNamespace: + def __init__(self, series: PandasLikeSeries) -> None: + self._compliant_series = series + + def get_categories(self) -> PandasLikeSeries: + s = self._compliant_series._native_series + return self._compliant_series._from_native_series( + s.__class__(s.cat.categories, name=s.name) + ) diff --git a/narwhals/_pandas_like/series_dt.py b/narwhals/_pandas_like/series_dt.py new file mode 100644 index 000000000..2bf768203 --- /dev/null +++ b/narwhals/_pandas_like/series_dt.py @@ -0,0 +1,235 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Any +from typing import Literal + +from narwhals._pandas_like.utils import calculate_timestamp_date +from narwhals._pandas_like.utils import calculate_timestamp_datetime +from narwhals._pandas_like.utils import int_dtype_mapper +from narwhals.utils import Implementation +from narwhals.utils import import_dtypes_module + +if TYPE_CHECKING: + from narwhals._pandas_like.series import PandasLikeSeries + + +class PandasLikeSeriesDateTimeNamespace: + def __init__(self, series: PandasLikeSeries) -> None: + self._compliant_series = series + + def date(self) -> PandasLikeSeries: + result = self._compliant_series._from_native_series( + self._compliant_series._native_series.dt.date, + ) + if str(result.dtype).lower() == "object": + msg = ( + "Accessing `date` on the default pandas backend " + "will return a Series of type `object`." + "\nThis differs from polars API and will prevent `.dt` chaining. " + "Please switch to the `pyarrow` backend:" + '\ndf.convert_dtypes(dtype_backend="pyarrow")' + ) + raise NotImplementedError(msg) + return result + + def year(self) -> PandasLikeSeries: + return self._compliant_series._from_native_series( + self._compliant_series._native_series.dt.year, + ) + + def month(self) -> PandasLikeSeries: + return self._compliant_series._from_native_series( + self._compliant_series._native_series.dt.month, + ) + + def day(self) -> PandasLikeSeries: + return self._compliant_series._from_native_series( + self._compliant_series._native_series.dt.day, + ) + + def hour(self) -> PandasLikeSeries: + return self._compliant_series._from_native_series( + self._compliant_series._native_series.dt.hour, + ) + + def minute(self) -> PandasLikeSeries: + return self._compliant_series._from_native_series( + self._compliant_series._native_series.dt.minute, + ) + + def second(self) -> PandasLikeSeries: + return self._compliant_series._from_native_series( + self._compliant_series._native_series.dt.second, + ) + + def millisecond(self) -> PandasLikeSeries: + return self.microsecond() // 1000 + + def microsecond(self) -> PandasLikeSeries: + if self._compliant_series._backend_version < (3, 0, 0) and "pyarrow" in str( + self._compliant_series._native_series.dtype + ): + # crazy workaround for https://github.com/pandas-dev/pandas/issues/59154 + import pyarrow.compute as pc # ignore-banned-import() + + native_series = self._compliant_series._native_series + arr = native_series.array.__arrow_array__() + result_arr = pc.add( + pc.multiply(pc.millisecond(arr), 1000), pc.microsecond(arr) + ) + result = native_series.__class__( + native_series.array.__class__(result_arr), name=native_series.name + ) + return self._compliant_series._from_native_series(result) + + return self._compliant_series._from_native_series( + self._compliant_series._native_series.dt.microsecond + ) + + def nanosecond(self) -> PandasLikeSeries: + return ( # type: ignore[no-any-return] + self.microsecond() * 1_000 + + self._compliant_series._native_series.dt.nanosecond + ) + + def ordinal_day(self) -> PandasLikeSeries: + ser = self._compliant_series._native_series + year_start = ser.dt.year + result = ( + ser.to_numpy().astype("datetime64[D]") + - (year_start.to_numpy() - 1970).astype("datetime64[Y]") + ).astype("int32") + 1 + dtype = "Int64[pyarrow]" if "pyarrow" in str(ser.dtype) else "int32" + return self._compliant_series._from_native_series( + self._compliant_series._native_series.__class__( + result, dtype=dtype, name=year_start.name + ) + ) + + def weekday(self) -> PandasLikeSeries: + return ( + self._compliant_series._from_native_series( + self._compliant_series._native_series.dt.weekday, + ) + + 1 # Pandas is 0-6 while Polars is 1-7 + ) + + def _get_total_seconds(self) -> Any: + if hasattr(self._compliant_series._native_series.dt, "total_seconds"): + return self._compliant_series._native_series.dt.total_seconds() + else: # pragma: no cover + return ( + self._compliant_series._native_series.dt.days * 86400 + + self._compliant_series._native_series.dt.seconds + + (self._compliant_series._native_series.dt.microseconds / 1e6) + + (self._compliant_series._native_series.dt.nanoseconds / 1e9) + ) + + def total_minutes(self) -> PandasLikeSeries: + s = self._get_total_seconds() + s_sign = ( + 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 + ) # this calculates the sign of each series element + s_abs = s.abs() // 60 + if ~s.isna().any(): + s_abs = s_abs.astype(int_dtype_mapper(s.dtype)) + return self._compliant_series._from_native_series(s_abs * s_sign) + + def total_seconds(self) -> PandasLikeSeries: + s = self._get_total_seconds() + s_sign = ( + 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 + ) # this calculates the sign of each series element + s_abs = s.abs() // 1 + if ~s.isna().any(): + s_abs = s_abs.astype(int_dtype_mapper(s.dtype)) + return self._compliant_series._from_native_series(s_abs * s_sign) + + def total_milliseconds(self) -> PandasLikeSeries: + s = self._get_total_seconds() * 1e3 + s_sign = ( + 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 + ) # this calculates the sign of each series element + s_abs = s.abs() // 1 + if ~s.isna().any(): + s_abs = s_abs.astype(int_dtype_mapper(s.dtype)) + return self._compliant_series._from_native_series(s_abs * s_sign) + + def total_microseconds(self) -> PandasLikeSeries: + s = self._get_total_seconds() * 1e6 + s_sign = ( + 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 + ) # this calculates the sign of each series element + s_abs = s.abs() // 1 + if ~s.isna().any(): + s_abs = s_abs.astype(int_dtype_mapper(s.dtype)) + return self._compliant_series._from_native_series(s_abs * s_sign) + + def total_nanoseconds(self) -> PandasLikeSeries: + s = self._get_total_seconds() * 1e9 + s_sign = ( + 2 * (s > 0).astype(int_dtype_mapper(s.dtype)) - 1 + ) # this calculates the sign of each series element + s_abs = s.abs() // 1 + if ~s.isna().any(): + s_abs = s_abs.astype(int_dtype_mapper(s.dtype)) + return self._compliant_series._from_native_series(s_abs * s_sign) + + def to_string(self, format: str) -> PandasLikeSeries: # noqa: A002 + # Polars' parser treats `'%.f'` as pandas does `'.%f'` + # PyArrow interprets `'%S'` as "seconds, plus fractional seconds" + # and doesn't support `%f` + if "pyarrow" not in str(self._compliant_series._native_series.dtype): + format = format.replace("%S%.f", "%S.%f") + else: + format = format.replace("%S.%f", "%S").replace("%S%.f", "%S") + return self._compliant_series._from_native_series( + self._compliant_series._native_series.dt.strftime(format) + ) + + def replace_time_zone(self, time_zone: str | None) -> PandasLikeSeries: + if time_zone is not None: + result = self._compliant_series._native_series.dt.tz_localize( + None + ).dt.tz_localize(time_zone) + else: + result = self._compliant_series._native_series.dt.tz_localize(None) + return self._compliant_series._from_native_series(result) + + def convert_time_zone(self, time_zone: str) -> PandasLikeSeries: + if self._compliant_series.dtype.time_zone is None: # type: ignore[attr-defined] + result = self._compliant_series._native_series.dt.tz_localize( + "UTC" + ).dt.tz_convert(time_zone) + else: + result = self._compliant_series._native_series.dt.tz_convert(time_zone) + return self._compliant_series._from_native_series(result) + + def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> PandasLikeSeries: + s = self._compliant_series._native_series + dtype = self._compliant_series.dtype + is_pyarrow_dtype = "pyarrow" in str(self._compliant_series._native_series.dtype) + mask_na = s.isna() + dtypes = import_dtypes_module(self._compliant_series._version) + if dtype == dtypes.Date: + # Date is only supported in pandas dtypes if pyarrow-backed + s_cast = s.astype("Int32[pyarrow]") + result = calculate_timestamp_date(s_cast, time_unit) + elif dtype == dtypes.Datetime: + original_time_unit = dtype.time_unit # type: ignore[attr-defined] + if ( + self._compliant_series._implementation is Implementation.PANDAS + and self._compliant_series._backend_version < (2,) + ): # pragma: no cover + s_cast = s.view("Int64[pyarrow]") if is_pyarrow_dtype else s.view("int64") + else: + s_cast = ( + s.astype("Int64[pyarrow]") if is_pyarrow_dtype else s.astype("int64") + ) + result = calculate_timestamp_datetime(s_cast, original_time_unit, time_unit) + else: + msg = "Input should be either of Date or Datetime type" + raise TypeError(msg) + result[mask_na] = None + return self._compliant_series._from_native_series(result) diff --git a/narwhals/_pandas_like/series_list.py b/narwhals/_pandas_like/series_list.py new file mode 100644 index 000000000..1daadab13 --- /dev/null +++ b/narwhals/_pandas_like/series_list.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from narwhals._pandas_like.utils import narwhals_to_native_dtype +from narwhals._pandas_like.utils import rename +from narwhals._pandas_like.utils import set_index +from narwhals.utils import Implementation +from narwhals.utils import import_dtypes_module + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._pandas_like.series import PandasLikeSeries + + +class PandasLikeSeriesListNamespace: + def __init__(self, series: PandasLikeSeries) -> None: + self._compliant_series = series + + def len(self: Self) -> PandasLikeSeries: + native_series = self._compliant_series._native_series + native_result = native_series.list.len() + + if ( + self._compliant_series._implementation is Implementation.PANDAS + and self._compliant_series._backend_version < (3, 0) + ): # pragma: no cover + native_result = set_index( + rename( + native_result, + native_series.name, + implementation=self._compliant_series._implementation, + backend_version=self._compliant_series._backend_version, + ), + index=native_series.index, + implementation=self._compliant_series._implementation, + backend_version=self._compliant_series._backend_version, + ) + dtype = narwhals_to_native_dtype( + dtype=import_dtypes_module(self._compliant_series._version).UInt32(), + starting_dtype=native_result.dtype, + implementation=self._compliant_series._implementation, + backend_version=self._compliant_series._backend_version, + version=self._compliant_series._version, + ) + return self._compliant_series._from_native_series(native_result.astype(dtype)) diff --git a/narwhals/_pandas_like/series_str.py b/narwhals/_pandas_like/series_str.py new file mode 100644 index 000000000..023dff280 --- /dev/null +++ b/narwhals/_pandas_like/series_str.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from narwhals._pandas_like.utils import to_datetime + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._pandas_like.series import PandasLikeSeries + + +class PandasLikeSeriesStringNamespace: + def __init__(self, series: PandasLikeSeries) -> None: + self._compliant_series = series + + def len_chars(self) -> PandasLikeSeries: + return self._compliant_series._from_native_series( + self._compliant_series._native_series.str.len() + ) + + def replace( + self, pattern: str, value: str, *, literal: bool = False, n: int = 1 + ) -> PandasLikeSeries: + return self._compliant_series._from_native_series( + self._compliant_series._native_series.str.replace( + pat=pattern, repl=value, n=n, regex=not literal + ), + ) + + def replace_all( + self, pattern: str, value: str, *, literal: bool = False + ) -> PandasLikeSeries: + return self.replace(pattern, value, literal=literal, n=-1) + + def strip_chars(self, characters: str | None) -> PandasLikeSeries: + return self._compliant_series._from_native_series( + self._compliant_series._native_series.str.strip(characters), + ) + + def starts_with(self, prefix: str) -> PandasLikeSeries: + return self._compliant_series._from_native_series( + self._compliant_series._native_series.str.startswith(prefix), + ) + + def ends_with(self, suffix: str) -> PandasLikeSeries: + return self._compliant_series._from_native_series( + self._compliant_series._native_series.str.endswith(suffix), + ) + + def contains(self, pattern: str, *, literal: bool = False) -> PandasLikeSeries: + return self._compliant_series._from_native_series( + self._compliant_series._native_series.str.contains( + pat=pattern, regex=not literal + ) + ) + + def slice(self, offset: int, length: int | None = None) -> PandasLikeSeries: + stop = offset + length if length else None + return self._compliant_series._from_native_series( + self._compliant_series._native_series.str.slice(start=offset, stop=stop), + ) + + def to_datetime(self: Self, format: str | None) -> PandasLikeSeries: # noqa: A002 + return self._compliant_series._from_native_series( + to_datetime(self._compliant_series._implementation)( + self._compliant_series._native_series, format=format + ) + ) + + def to_uppercase(self) -> PandasLikeSeries: + return self._compliant_series._from_native_series( + self._compliant_series._native_series.str.upper(), + ) + + def to_lowercase(self) -> PandasLikeSeries: + return self._compliant_series._from_native_series( + self._compliant_series._native_series.str.lower(), + ) From 28c40df1df62ea5af94a2641553de0edde0784b5 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 11 Jan 2025 08:24:54 +0000 Subject: [PATCH 03/10] refactor arrow --- narwhals/_arrow/expr.py | 358 +-------------------------------- narwhals/_arrow/expr_cat.py | 20 ++ narwhals/_arrow/expr_dt.py | 107 ++++++++++ narwhals/_arrow/expr_list.py | 18 ++ narwhals/_arrow/expr_name.py | 169 ++++++++++++++++ narwhals/_arrow/expr_str.py | 83 ++++++++ narwhals/_arrow/series_cat.py | 0 narwhals/_arrow/series_dt.py | 0 narwhals/_arrow/series_list.py | 0 narwhals/_arrow/series_str.py | 0 10 files changed, 404 insertions(+), 351 deletions(-) create mode 100644 narwhals/_arrow/expr_cat.py create mode 100644 narwhals/_arrow/expr_dt.py create mode 100644 narwhals/_arrow/expr_list.py create mode 100644 narwhals/_arrow/expr_name.py create mode 100644 narwhals/_arrow/expr_str.py create mode 100644 narwhals/_arrow/series_cat.py create mode 100644 narwhals/_arrow/series_dt.py create mode 100644 narwhals/_arrow/series_list.py create mode 100644 narwhals/_arrow/series_str.py diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index df5c95367..d0bc49b08 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -6,11 +6,17 @@ from typing import Literal from typing import Sequence +from narwhals._arrow.expr_cat import ArrowExprCatNamespace +from narwhals._arrow.expr_dt import ArrowExprDateTimeNamespace +from narwhals._arrow.expr_list import ArrowExprListNamespace +from narwhals._arrow.expr_name import ArrowExprNameNamespace +from narwhals._arrow.expr_str import ArrowExprStringNamespace +from narwhals._arrow.series import ArrowSeries from narwhals._expression_parsing import reuse_series_implementation -from narwhals._expression_parsing import reuse_series_namespace_implementation from narwhals.dependencies import get_numpy from narwhals.dependencies import is_numpy_array from narwhals.exceptions import ColumnNotFoundError +from narwhals.typing import CompliantExpr from narwhals.utils import Implementation if TYPE_CHECKING: @@ -22,9 +28,6 @@ from narwhals.dtypes import DType from narwhals.utils import Version -from narwhals._arrow.series import ArrowSeries -from narwhals.typing import CompliantExpr - class ArrowExpr(CompliantExpr[ArrowSeries]): _implementation: Implementation = Implementation.PYARROW @@ -560,350 +563,3 @@ def name(self: Self) -> ArrowExprNameNamespace: @property def list(self: Self) -> ArrowExprListNamespace: return ArrowExprListNamespace(self) - - -class ArrowExprCatNamespace: - def __init__(self: Self, expr: ArrowExpr) -> None: - self._compliant_expr = expr - - def get_categories(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "cat", "get_categories" - ) - - -class ArrowExprDateTimeNamespace: - def __init__(self: Self, expr: ArrowExpr) -> None: - self._compliant_expr = expr - - def to_string(self: Self, format: str) -> ArrowExpr: # noqa: A002 - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "to_string", format=format - ) - - def replace_time_zone(self: Self, time_zone: str | None) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "replace_time_zone", time_zone=time_zone - ) - - def convert_time_zone(self: Self, time_zone: str) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "convert_time_zone", time_zone=time_zone - ) - - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "timestamp", time_unit=time_unit - ) - - def date(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation(self._compliant_expr, "dt", "date") - - def year(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation(self._compliant_expr, "dt", "year") - - def month(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation(self._compliant_expr, "dt", "month") - - def day(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation(self._compliant_expr, "dt", "day") - - def hour(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation(self._compliant_expr, "dt", "hour") - - def minute(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation(self._compliant_expr, "dt", "minute") - - def second(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation(self._compliant_expr, "dt", "second") - - def millisecond(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "millisecond" - ) - - def microsecond(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "microsecond" - ) - - def nanosecond(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "nanosecond" - ) - - def ordinal_day(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "ordinal_day" - ) - - def weekday(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "weekday" - ) - - def total_minutes(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "total_minutes" - ) - - def total_seconds(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "total_seconds" - ) - - def total_milliseconds(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "total_milliseconds" - ) - - def total_microseconds(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "total_microseconds" - ) - - def total_nanoseconds(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "dt", "total_nanoseconds" - ) - - -class ArrowExprStringNamespace: - def __init__(self: Self, expr: ArrowExpr) -> None: - self._compliant_expr = expr - - def len_chars(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "str", "len_chars" - ) - - def replace( - self: Self, pattern: str, value: str, *, literal: bool, n: int - ) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "replace", - pattern=pattern, - value=value, - literal=literal, - n=n, - ) - - def replace_all(self: Self, pattern: str, value: str, *, literal: bool) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "replace_all", - pattern=pattern, - value=value, - literal=literal, - ) - - def strip_chars(self: Self, characters: str | None) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "str", "strip_chars", characters=characters - ) - - def starts_with(self: Self, prefix: str) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "str", "starts_with", prefix=prefix - ) - - def ends_with(self: Self, suffix: str) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "str", "ends_with", suffix=suffix - ) - - def contains(self, pattern: str, *, literal: bool) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "str", "contains", pattern=pattern, literal=literal - ) - - def slice(self: Self, offset: int, length: int | None) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "str", "slice", offset=offset, length=length - ) - - def to_datetime(self: Self, format: str | None) -> ArrowExpr: # noqa: A002 - return reuse_series_namespace_implementation( - self._compliant_expr, "str", "to_datetime", format=format - ) - - def to_uppercase(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "str", "to_uppercase" - ) - - def to_lowercase(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._compliant_expr, "str", "to_lowercase" - ) - - -class ArrowExprNameNamespace: - def __init__(self: Self, expr: ArrowExpr) -> None: - self._compliant_expr = expr - - def keep(self: Self) -> ArrowExpr: - root_names = self._compliant_expr._root_names - - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.keep`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - - return self._compliant_expr.__class__( - lambda df: [ - series.alias(name) - for series, name in zip(self._compliant_expr._call(df), root_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=root_names, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs=self._compliant_expr._kwargs, - ) - - def map(self: Self, function: Callable[[str], str]) -> ArrowExpr: - root_names = self._compliant_expr._root_names - - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.map`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - - output_names = [function(str(name)) for name in root_names] - - return self._compliant_expr.__class__( - lambda df: [ - series.alias(name) - for series, name in zip(self._compliant_expr._call(df), output_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=output_names, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs={**self._compliant_expr._kwargs, "function": function}, - ) - - def prefix(self: Self, prefix: str) -> ArrowExpr: - root_names = self._compliant_expr._root_names - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.prefix`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - - output_names = [prefix + str(name) for name in root_names] - return self._compliant_expr.__class__( - lambda df: [ - series.alias(name) - for series, name in zip(self._compliant_expr._call(df), output_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=output_names, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs={**self._compliant_expr._kwargs, "prefix": prefix}, - ) - - def suffix(self: Self, suffix: str) -> ArrowExpr: - root_names = self._compliant_expr._root_names - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.suffix`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - - output_names = [str(name) + suffix for name in root_names] - - return self._compliant_expr.__class__( - lambda df: [ - series.alias(name) - for series, name in zip(self._compliant_expr._call(df), output_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=output_names, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs={**self._compliant_expr._kwargs, "suffix": suffix}, - ) - - def to_lowercase(self: Self) -> ArrowExpr: - root_names = self._compliant_expr._root_names - - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.to_lowercase`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - output_names = [str(name).lower() for name in root_names] - - return self._compliant_expr.__class__( - lambda df: [ - series.alias(name) - for series, name in zip(self._compliant_expr._call(df), output_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=output_names, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs=self._compliant_expr._kwargs, - ) - - def to_uppercase(self: Self) -> ArrowExpr: - root_names = self._compliant_expr._root_names - - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.to_uppercase`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - output_names = [str(name).upper() for name in root_names] - - return self._compliant_expr.__class__( - lambda df: [ - series.alias(name) - for series, name in zip(self._compliant_expr._call(df), output_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=output_names, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs=self._compliant_expr._kwargs, - ) - - -class ArrowExprListNamespace: - def __init__(self: Self, expr: ArrowExpr) -> None: - self._expr = expr - - def len(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation(self._expr, "list", "len") diff --git a/narwhals/_arrow/expr_cat.py b/narwhals/_arrow/expr_cat.py new file mode 100644 index 000000000..6a26ee97f --- /dev/null +++ b/narwhals/_arrow/expr_cat.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from narwhals._expression_parsing import reuse_series_namespace_implementation + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._arrow.expr import ArrowExpr + + +class ArrowExprCatNamespace: + def __init__(self: Self, expr: ArrowExpr) -> None: + self._compliant_expr = expr + + def get_categories(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "cat", "get_categories" + ) diff --git a/narwhals/_arrow/expr_dt.py b/narwhals/_arrow/expr_dt.py new file mode 100644 index 000000000..1438eba81 --- /dev/null +++ b/narwhals/_arrow/expr_dt.py @@ -0,0 +1,107 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Literal + +from narwhals._expression_parsing import reuse_series_namespace_implementation + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._arrow.expr import ArrowExpr + + +class ArrowExprDateTimeNamespace: + def __init__(self: Self, expr: ArrowExpr) -> None: + self._compliant_expr = expr + + def to_string(self: Self, format: str) -> ArrowExpr: # noqa: A002 + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "to_string", format=format + ) + + def replace_time_zone(self: Self, time_zone: str | None) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "replace_time_zone", time_zone=time_zone + ) + + def convert_time_zone(self: Self, time_zone: str) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "convert_time_zone", time_zone=time_zone + ) + + def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "timestamp", time_unit=time_unit + ) + + def date(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation(self._compliant_expr, "dt", "date") + + def year(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation(self._compliant_expr, "dt", "year") + + def month(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation(self._compliant_expr, "dt", "month") + + def day(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation(self._compliant_expr, "dt", "day") + + def hour(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation(self._compliant_expr, "dt", "hour") + + def minute(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation(self._compliant_expr, "dt", "minute") + + def second(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation(self._compliant_expr, "dt", "second") + + def millisecond(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "millisecond" + ) + + def microsecond(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "microsecond" + ) + + def nanosecond(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "nanosecond" + ) + + def ordinal_day(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "ordinal_day" + ) + + def weekday(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "weekday" + ) + + def total_minutes(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "total_minutes" + ) + + def total_seconds(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "total_seconds" + ) + + def total_milliseconds(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "total_milliseconds" + ) + + def total_microseconds(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "total_microseconds" + ) + + def total_nanoseconds(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "dt", "total_nanoseconds" + ) diff --git a/narwhals/_arrow/expr_list.py b/narwhals/_arrow/expr_list.py new file mode 100644 index 000000000..8e8e4c1f0 --- /dev/null +++ b/narwhals/_arrow/expr_list.py @@ -0,0 +1,18 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from narwhals._expression_parsing import reuse_series_namespace_implementation + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._arrow.expr import ArrowExpr + + +class ArrowExprListNamespace: + def __init__(self: Self, expr: ArrowExpr) -> None: + self._expr = expr + + def len(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation(self._expr, "list", "len") diff --git a/narwhals/_arrow/expr_name.py b/narwhals/_arrow/expr_name.py new file mode 100644 index 000000000..7a2fbedef --- /dev/null +++ b/narwhals/_arrow/expr_name.py @@ -0,0 +1,169 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Callable + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._arrow.expr import ArrowExpr + + +class ArrowExprNameNamespace: + def __init__(self: Self, expr: ArrowExpr) -> None: + self._compliant_expr = expr + + def keep(self: Self) -> ArrowExpr: + root_names = self._compliant_expr._root_names + + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.keep`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + + return self._compliant_expr.__class__( + lambda df: [ + series.alias(name) + for series, name in zip(self._compliant_expr._call(df), root_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=root_names, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs=self._compliant_expr._kwargs, + ) + + def map(self: Self, function: Callable[[str], str]) -> ArrowExpr: + root_names = self._compliant_expr._root_names + + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.map`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + + output_names = [function(str(name)) for name in root_names] + + return self._compliant_expr.__class__( + lambda df: [ + series.alias(name) + for series, name in zip(self._compliant_expr._call(df), output_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=output_names, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs={**self._compliant_expr._kwargs, "function": function}, + ) + + def prefix(self: Self, prefix: str) -> ArrowExpr: + root_names = self._compliant_expr._root_names + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.prefix`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + + output_names = [prefix + str(name) for name in root_names] + return self._compliant_expr.__class__( + lambda df: [ + series.alias(name) + for series, name in zip(self._compliant_expr._call(df), output_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=output_names, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs={**self._compliant_expr._kwargs, "prefix": prefix}, + ) + + def suffix(self: Self, suffix: str) -> ArrowExpr: + root_names = self._compliant_expr._root_names + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.suffix`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + + output_names = [str(name) + suffix for name in root_names] + + return self._compliant_expr.__class__( + lambda df: [ + series.alias(name) + for series, name in zip(self._compliant_expr._call(df), output_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=output_names, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs={**self._compliant_expr._kwargs, "suffix": suffix}, + ) + + def to_lowercase(self: Self) -> ArrowExpr: + root_names = self._compliant_expr._root_names + + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.to_lowercase`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + output_names = [str(name).lower() for name in root_names] + + return self._compliant_expr.__class__( + lambda df: [ + series.alias(name) + for series, name in zip(self._compliant_expr._call(df), output_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=output_names, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs=self._compliant_expr._kwargs, + ) + + def to_uppercase(self: Self) -> ArrowExpr: + root_names = self._compliant_expr._root_names + + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.to_uppercase`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + output_names = [str(name).upper() for name in root_names] + + return self._compliant_expr.__class__( + lambda df: [ + series.alias(name) + for series, name in zip(self._compliant_expr._call(df), output_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=output_names, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs=self._compliant_expr._kwargs, + ) diff --git a/narwhals/_arrow/expr_str.py b/narwhals/_arrow/expr_str.py new file mode 100644 index 000000000..4cc170e34 --- /dev/null +++ b/narwhals/_arrow/expr_str.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from narwhals._expression_parsing import reuse_series_namespace_implementation + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._arrow.expr import ArrowExpr + + +class ArrowExprStringNamespace: + def __init__(self: Self, expr: ArrowExpr) -> None: + self._compliant_expr = expr + + def len_chars(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "str", "len_chars" + ) + + def replace( + self: Self, pattern: str, value: str, *, literal: bool, n: int + ) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, + "str", + "replace", + pattern=pattern, + value=value, + literal=literal, + n=n, + ) + + def replace_all(self: Self, pattern: str, value: str, *, literal: bool) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, + "str", + "replace_all", + pattern=pattern, + value=value, + literal=literal, + ) + + def strip_chars(self: Self, characters: str | None) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "str", "strip_chars", characters=characters + ) + + def starts_with(self: Self, prefix: str) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "str", "starts_with", prefix=prefix + ) + + def ends_with(self: Self, suffix: str) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "str", "ends_with", suffix=suffix + ) + + def contains(self, pattern: str, *, literal: bool) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "str", "contains", pattern=pattern, literal=literal + ) + + def slice(self: Self, offset: int, length: int | None) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "str", "slice", offset=offset, length=length + ) + + def to_datetime(self: Self, format: str | None) -> ArrowExpr: # noqa: A002 + return reuse_series_namespace_implementation( + self._compliant_expr, "str", "to_datetime", format=format + ) + + def to_uppercase(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "str", "to_uppercase" + ) + + def to_lowercase(self: Self) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._compliant_expr, "str", "to_lowercase" + ) diff --git a/narwhals/_arrow/series_cat.py b/narwhals/_arrow/series_cat.py new file mode 100644 index 000000000..e69de29bb diff --git a/narwhals/_arrow/series_dt.py b/narwhals/_arrow/series_dt.py new file mode 100644 index 000000000..e69de29bb diff --git a/narwhals/_arrow/series_list.py b/narwhals/_arrow/series_list.py new file mode 100644 index 000000000..e69de29bb diff --git a/narwhals/_arrow/series_str.py b/narwhals/_arrow/series_str.py new file mode 100644 index 000000000..e69de29bb From 3d9c1cc7fa7c3f78fe9e0ce1f3ceeba00d8d0680 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 11 Jan 2025 08:28:48 +0000 Subject: [PATCH 04/10] refactor arrow --- narwhals/_arrow/series.py | 422 +-------------------------------- narwhals/_arrow/series_cat.py | 22 ++ narwhals/_arrow/series_dt.py | 301 +++++++++++++++++++++++ narwhals/_arrow/series_list.py | 21 ++ narwhals/_arrow/series_str.py | 109 +++++++++ 5 files changed, 458 insertions(+), 417 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 193fc25a2..656a802ca 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -8,13 +8,17 @@ from typing import Sequence from typing import overload +from narwhals._arrow.series_cat import ArrowSeriesCatNamespace +from narwhals._arrow.series_dt import ArrowSeriesDateTimeNamespace +from narwhals._arrow.series_list import ArrowSeriesListNamespace +from narwhals._arrow.series_str import ArrowSeriesStringNamespace from narwhals._arrow.utils import broadcast_and_extract_native from narwhals._arrow.utils import cast_for_truediv from narwhals._arrow.utils import floordiv_compat from narwhals._arrow.utils import narwhals_to_native_dtype from narwhals._arrow.utils import native_to_narwhals_dtype from narwhals._arrow.utils import pad_series -from narwhals._arrow.utils import parse_datetime_format +from narwhals.typing import CompliantSeries from narwhals.utils import Implementation from narwhals.utils import generate_temporary_column_name from narwhals.utils import import_dtypes_module @@ -32,7 +36,6 @@ from narwhals._arrow.namespace import ArrowNamespace from narwhals.dtypes import DType from narwhals.utils import Version -from narwhals.typing import CompliantSeries def maybe_extract_py_scalar(value: Any, return_py_scalar: bool) -> Any: # noqa: FBT001 @@ -1180,418 +1183,3 @@ def str(self: Self) -> ArrowSeriesStringNamespace: @property def list(self: Self) -> ArrowSeriesListNamespace: return ArrowSeriesListNamespace(self) - - -class ArrowSeriesDateTimeNamespace: - def __init__(self: Self, series: ArrowSeries) -> None: - self._compliant_series = series - - def to_string(self: Self, format: str) -> ArrowSeries: # noqa: A002 - import pyarrow.compute as pc - - # PyArrow differs from other libraries in that %S also prints out - # the fractional part of the second...:'( - # https://arrow.apache.org/docs/python/generated/pyarrow.compute.strftime.html - format = format.replace("%S.%f", "%S").replace("%S%.f", "%S") - return self._compliant_series._from_native_series( - pc.strftime(self._compliant_series._native_series, format) - ) - - def replace_time_zone(self: Self, time_zone: str | None) -> ArrowSeries: - import pyarrow.compute as pc - - if time_zone is not None: - result = pc.assume_timezone( - pc.local_timestamp(self._compliant_series._native_series), time_zone - ) - else: - result = pc.local_timestamp(self._compliant_series._native_series) - return self._compliant_series._from_native_series(result) - - def convert_time_zone(self: Self, time_zone: str) -> ArrowSeries: - import pyarrow as pa - - if self._compliant_series.dtype.time_zone is None: # type: ignore[attr-defined] - result = self.replace_time_zone("UTC")._native_series.cast( - pa.timestamp(self._compliant_series._native_series.type.unit, time_zone) - ) - else: - result = self._compliant_series._native_series.cast( - pa.timestamp(self._compliant_series._native_series.type.unit, time_zone) - ) - - return self._compliant_series._from_native_series(result) - - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowSeries: - import pyarrow as pa - import pyarrow.compute as pc - - s = self._compliant_series._native_series - dtype = self._compliant_series.dtype - dtypes = import_dtypes_module(self._compliant_series._version) - if dtype == dtypes.Datetime: - unit = dtype.time_unit # type: ignore[attr-defined] - s_cast = s.cast(pa.int64()) - if unit == "ns": - if time_unit == "ns": - result = s_cast - elif time_unit == "us": - result = floordiv_compat(s_cast, 1_000) - else: - result = floordiv_compat(s_cast, 1_000_000) - elif unit == "us": - if time_unit == "ns": - result = pc.multiply(s_cast, 1_000) - elif time_unit == "us": - result = s_cast - else: - result = floordiv_compat(s_cast, 1_000) - elif unit == "ms": - if time_unit == "ns": - result = pc.multiply(s_cast, 1_000_000) - elif time_unit == "us": - result = pc.multiply(s_cast, 1_000) - else: - result = s_cast - elif unit == "s": - if time_unit == "ns": - result = pc.multiply(s_cast, 1_000_000_000) - elif time_unit == "us": - result = pc.multiply(s_cast, 1_000_000) - else: - result = pc.multiply(s_cast, 1_000) - else: # pragma: no cover - msg = f"unexpected time unit {unit}, please report an issue at https://github.com/narwhals-dev/narwhals" - raise AssertionError(msg) - elif dtype == dtypes.Date: - time_s = pc.multiply(s.cast(pa.int32()), 86400) - if time_unit == "ns": - result = pc.multiply(time_s, 1_000_000_000) - elif time_unit == "us": - result = pc.multiply(time_s, 1_000_000) - else: - result = pc.multiply(time_s, 1_000) - else: - msg = "Input should be either of Date or Datetime type" - raise TypeError(msg) - return self._compliant_series._from_native_series(result) - - def date(self: Self) -> ArrowSeries: - import pyarrow as pa - - return self._compliant_series._from_native_series( - self._compliant_series._native_series.cast(pa.date32()) - ) - - def year(self: Self) -> ArrowSeries: - import pyarrow.compute as pc - - return self._compliant_series._from_native_series( - pc.year(self._compliant_series._native_series) - ) - - def month(self: Self) -> ArrowSeries: - import pyarrow.compute as pc - - return self._compliant_series._from_native_series( - pc.month(self._compliant_series._native_series) - ) - - def day(self: Self) -> ArrowSeries: - import pyarrow.compute as pc - - return self._compliant_series._from_native_series( - pc.day(self._compliant_series._native_series) - ) - - def hour(self: Self) -> ArrowSeries: - import pyarrow.compute as pc - - return self._compliant_series._from_native_series( - pc.hour(self._compliant_series._native_series) - ) - - def minute(self: Self) -> ArrowSeries: - import pyarrow.compute as pc - - return self._compliant_series._from_native_series( - pc.minute(self._compliant_series._native_series) - ) - - def second(self: Self) -> ArrowSeries: - import pyarrow.compute as pc - - return self._compliant_series._from_native_series( - pc.second(self._compliant_series._native_series) - ) - - def millisecond(self: Self) -> ArrowSeries: - import pyarrow.compute as pc - - return self._compliant_series._from_native_series( - pc.millisecond(self._compliant_series._native_series) - ) - - def microsecond(self: Self) -> ArrowSeries: - import pyarrow.compute as pc - - arr = self._compliant_series._native_series - result = pc.add(pc.multiply(pc.millisecond(arr), 1000), pc.microsecond(arr)) - - return self._compliant_series._from_native_series(result) - - def nanosecond(self: Self) -> ArrowSeries: - import pyarrow.compute as pc - - arr = self._compliant_series._native_series - result = pc.add( - pc.multiply(self.microsecond()._native_series, 1000), pc.nanosecond(arr) - ) - return self._compliant_series._from_native_series(result) - - def ordinal_day(self: Self) -> ArrowSeries: - import pyarrow.compute as pc - - return self._compliant_series._from_native_series( - pc.day_of_year(self._compliant_series._native_series) - ) - - def weekday(self: Self) -> ArrowSeries: - import pyarrow.compute as pc - - return self._compliant_series._from_native_series( - pc.day_of_week(self._compliant_series._native_series, count_from_zero=False) - ) - - def total_minutes(self: Self) -> ArrowSeries: - import pyarrow as pa - import pyarrow.compute as pc - - arr = self._compliant_series._native_series - unit = arr.type.unit - - unit_to_minutes_factor = { - "s": 60, # seconds - "ms": 60 * 1e3, # milli - "us": 60 * 1e6, # micro - "ns": 60 * 1e9, # nano - } - - factor = pa.scalar(unit_to_minutes_factor[unit], type=pa.int64()) - return self._compliant_series._from_native_series( - pc.cast(pc.divide(arr, factor), pa.int64()) - ) - - def total_seconds(self: Self) -> ArrowSeries: - import pyarrow as pa - import pyarrow.compute as pc - - arr = self._compliant_series._native_series - unit = arr.type.unit - - unit_to_seconds_factor = { - "s": 1, # seconds - "ms": 1e3, # milli - "us": 1e6, # micro - "ns": 1e9, # nano - } - factor = pa.scalar(unit_to_seconds_factor[unit], type=pa.int64()) - - return self._compliant_series._from_native_series( - pc.cast(pc.divide(arr, factor), pa.int64()) - ) - - def total_milliseconds(self: Self) -> ArrowSeries: - import pyarrow as pa - import pyarrow.compute as pc - - arr = self._compliant_series._native_series - unit = arr.type.unit - - unit_to_milli_factor = { - "s": 1e3, # seconds - "ms": 1, # milli - "us": 1e3, # micro - "ns": 1e6, # nano - } - - factor = pa.scalar(unit_to_milli_factor[unit], type=pa.int64()) - - if unit == "s": - return self._compliant_series._from_native_series( - pc.cast(pc.multiply(arr, factor), pa.int64()) - ) - - return self._compliant_series._from_native_series( - pc.cast(pc.divide(arr, factor), pa.int64()) - ) - - def total_microseconds(self: Self) -> ArrowSeries: - import pyarrow as pa - import pyarrow.compute as pc - - arr = self._compliant_series._native_series - unit = arr.type.unit - - unit_to_micro_factor = { - "s": 1e6, # seconds - "ms": 1e3, # milli - "us": 1, # micro - "ns": 1e3, # nano - } - - factor = pa.scalar(unit_to_micro_factor[unit], type=pa.int64()) - - if unit in {"s", "ms"}: - return self._compliant_series._from_native_series( - pc.cast(pc.multiply(arr, factor), pa.int64()) - ) - return self._compliant_series._from_native_series( - pc.cast(pc.divide(arr, factor), pa.int64()) - ) - - def total_nanoseconds(self: Self) -> ArrowSeries: - import pyarrow as pa - import pyarrow.compute as pc - - arr = self._compliant_series._native_series - unit = arr.type.unit - - unit_to_nano_factor = { - "s": 1e9, # seconds - "ms": 1e6, # milli - "us": 1e3, # micro - "ns": 1, # nano - } - - factor = pa.scalar(unit_to_nano_factor[unit], type=pa.int64()) - - return self._compliant_series._from_native_series( - pc.cast(pc.multiply(arr, factor), pa.int64()) - ) - - -class ArrowSeriesCatNamespace: - def __init__(self: Self, series: ArrowSeries) -> None: - self._compliant_series = series - - def get_categories(self: Self) -> ArrowSeries: - import pyarrow as pa - - ca = self._compliant_series._native_series - out = pa.chunked_array( - [pa.concat_arrays(x.dictionary for x in ca.chunks).unique()] - ) - return self._compliant_series._from_native_series(out) - - -class ArrowSeriesStringNamespace: - def __init__(self: Self, series: ArrowSeries) -> None: - self._compliant_series = series - - def len_chars(self: Self) -> ArrowSeries: - import pyarrow.compute as pc - - return self._compliant_series._from_native_series( - pc.utf8_length(self._compliant_series._native_series) - ) - - def replace( - self: Self, pattern: str, value: str, *, literal: bool, n: int - ) -> ArrowSeries: - import pyarrow.compute as pc - - method = "replace_substring" if literal else "replace_substring_regex" - return self._compliant_series._from_native_series( - getattr(pc, method)( - self._compliant_series._native_series, - pattern=pattern, - replacement=value, - max_replacements=n, - ) - ) - - def replace_all( - self: Self, pattern: str, value: str, *, literal: bool - ) -> ArrowSeries: - return self.replace(pattern, value, literal=literal, n=-1) - - def strip_chars(self: Self, characters: str | None) -> ArrowSeries: - import pyarrow.compute as pc - - whitespace = " \t\n\r\v\f" - return self._compliant_series._from_native_series( - pc.utf8_trim( - self._compliant_series._native_series, - characters or whitespace, - ) - ) - - def starts_with(self: Self, prefix: str) -> ArrowSeries: - import pyarrow.compute as pc - - return self._compliant_series._from_native_series( - pc.equal(self.slice(0, len(prefix))._native_series, prefix) - ) - - def ends_with(self: Self, suffix: str) -> ArrowSeries: - import pyarrow.compute as pc - - return self._compliant_series._from_native_series( - pc.equal(self.slice(-len(suffix), None)._native_series, suffix) - ) - - def contains(self: Self, pattern: str, *, literal: bool) -> ArrowSeries: - import pyarrow.compute as pc - - check_func = pc.match_substring if literal else pc.match_substring_regex - return self._compliant_series._from_native_series( - check_func(self._compliant_series._native_series, pattern) - ) - - def slice(self: Self, offset: int, length: int | None) -> ArrowSeries: - import pyarrow.compute as pc - - stop = offset + length if length is not None else None - return self._compliant_series._from_native_series( - pc.utf8_slice_codeunits( - self._compliant_series._native_series, start=offset, stop=stop - ), - ) - - def to_datetime(self: Self, format: str | None) -> ArrowSeries: # noqa: A002 - import pyarrow.compute as pc - - if format is None: - format = parse_datetime_format(self._compliant_series._native_series) - - return self._compliant_series._from_native_series( - pc.strptime(self._compliant_series._native_series, format=format, unit="us") - ) - - def to_uppercase(self: Self) -> ArrowSeries: - import pyarrow.compute as pc - - return self._compliant_series._from_native_series( - pc.utf8_upper(self._compliant_series._native_series), - ) - - def to_lowercase(self: Self) -> ArrowSeries: - import pyarrow.compute as pc - - return self._compliant_series._from_native_series( - pc.utf8_lower(self._compliant_series._native_series), - ) - - -class ArrowSeriesListNamespace: - def __init__(self: Self, series: ArrowSeries) -> None: - self._arrow_series = series - - def len(self: Self) -> ArrowSeries: - import pyarrow as pa - import pyarrow.compute as pc - - return self._arrow_series._from_native_series( - pc.cast(pc.list_value_length(self._arrow_series._native_series), pa.uint32()) - ) diff --git a/narwhals/_arrow/series_cat.py b/narwhals/_arrow/series_cat.py index e69de29bb..ea37ec998 100644 --- a/narwhals/_arrow/series_cat.py +++ b/narwhals/_arrow/series_cat.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._arrow.series import ArrowSeries + + +class ArrowSeriesCatNamespace: + def __init__(self: Self, series: ArrowSeries) -> None: + self._compliant_series = series + + def get_categories(self: Self) -> ArrowSeries: + import pyarrow as pa + + ca = self._compliant_series._native_series + out = pa.chunked_array( + [pa.concat_arrays(x.dictionary for x in ca.chunks).unique()] + ) + return self._compliant_series._from_native_series(out) diff --git a/narwhals/_arrow/series_dt.py b/narwhals/_arrow/series_dt.py index e69de29bb..697cd473b 100644 --- a/narwhals/_arrow/series_dt.py +++ b/narwhals/_arrow/series_dt.py @@ -0,0 +1,301 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Literal + +from narwhals._arrow.utils import floordiv_compat +from narwhals.utils import import_dtypes_module + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._arrow.series import ArrowSeries + + +class ArrowSeriesDateTimeNamespace: + def __init__(self: Self, series: ArrowSeries) -> None: + self._compliant_series = series + + def to_string(self: Self, format: str) -> ArrowSeries: # noqa: A002 + import pyarrow.compute as pc + + # PyArrow differs from other libraries in that %S also prints out + # the fractional part of the second...:'( + # https://arrow.apache.org/docs/python/generated/pyarrow.compute.strftime.html + format = format.replace("%S.%f", "%S").replace("%S%.f", "%S") + return self._compliant_series._from_native_series( + pc.strftime(self._compliant_series._native_series, format) + ) + + def replace_time_zone(self: Self, time_zone: str | None) -> ArrowSeries: + import pyarrow.compute as pc + + if time_zone is not None: + result = pc.assume_timezone( + pc.local_timestamp(self._compliant_series._native_series), time_zone + ) + else: + result = pc.local_timestamp(self._compliant_series._native_series) + return self._compliant_series._from_native_series(result) + + def convert_time_zone(self: Self, time_zone: str) -> ArrowSeries: + import pyarrow as pa + + if self._compliant_series.dtype.time_zone is None: # type: ignore[attr-defined] + result = self.replace_time_zone("UTC")._native_series.cast( + pa.timestamp(self._compliant_series._native_series.type.unit, time_zone) + ) + else: + result = self._compliant_series._native_series.cast( + pa.timestamp(self._compliant_series._native_series.type.unit, time_zone) + ) + + return self._compliant_series._from_native_series(result) + + def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ArrowSeries: + import pyarrow as pa + import pyarrow.compute as pc + + s = self._compliant_series._native_series + dtype = self._compliant_series.dtype + dtypes = import_dtypes_module(self._compliant_series._version) + if dtype == dtypes.Datetime: + unit = dtype.time_unit # type: ignore[attr-defined] + s_cast = s.cast(pa.int64()) + if unit == "ns": + if time_unit == "ns": + result = s_cast + elif time_unit == "us": + result = floordiv_compat(s_cast, 1_000) + else: + result = floordiv_compat(s_cast, 1_000_000) + elif unit == "us": + if time_unit == "ns": + result = pc.multiply(s_cast, 1_000) + elif time_unit == "us": + result = s_cast + else: + result = floordiv_compat(s_cast, 1_000) + elif unit == "ms": + if time_unit == "ns": + result = pc.multiply(s_cast, 1_000_000) + elif time_unit == "us": + result = pc.multiply(s_cast, 1_000) + else: + result = s_cast + elif unit == "s": + if time_unit == "ns": + result = pc.multiply(s_cast, 1_000_000_000) + elif time_unit == "us": + result = pc.multiply(s_cast, 1_000_000) + else: + result = pc.multiply(s_cast, 1_000) + else: # pragma: no cover + msg = f"unexpected time unit {unit}, please report an issue at https://github.com/narwhals-dev/narwhals" + raise AssertionError(msg) + elif dtype == dtypes.Date: + time_s = pc.multiply(s.cast(pa.int32()), 86400) + if time_unit == "ns": + result = pc.multiply(time_s, 1_000_000_000) + elif time_unit == "us": + result = pc.multiply(time_s, 1_000_000) + else: + result = pc.multiply(time_s, 1_000) + else: + msg = "Input should be either of Date or Datetime type" + raise TypeError(msg) + return self._compliant_series._from_native_series(result) + + def date(self: Self) -> ArrowSeries: + import pyarrow as pa + + return self._compliant_series._from_native_series( + self._compliant_series._native_series.cast(pa.date32()) + ) + + def year(self: Self) -> ArrowSeries: + import pyarrow.compute as pc + + return self._compliant_series._from_native_series( + pc.year(self._compliant_series._native_series) + ) + + def month(self: Self) -> ArrowSeries: + import pyarrow.compute as pc + + return self._compliant_series._from_native_series( + pc.month(self._compliant_series._native_series) + ) + + def day(self: Self) -> ArrowSeries: + import pyarrow.compute as pc + + return self._compliant_series._from_native_series( + pc.day(self._compliant_series._native_series) + ) + + def hour(self: Self) -> ArrowSeries: + import pyarrow.compute as pc + + return self._compliant_series._from_native_series( + pc.hour(self._compliant_series._native_series) + ) + + def minute(self: Self) -> ArrowSeries: + import pyarrow.compute as pc + + return self._compliant_series._from_native_series( + pc.minute(self._compliant_series._native_series) + ) + + def second(self: Self) -> ArrowSeries: + import pyarrow.compute as pc + + return self._compliant_series._from_native_series( + pc.second(self._compliant_series._native_series) + ) + + def millisecond(self: Self) -> ArrowSeries: + import pyarrow.compute as pc + + return self._compliant_series._from_native_series( + pc.millisecond(self._compliant_series._native_series) + ) + + def microsecond(self: Self) -> ArrowSeries: + import pyarrow.compute as pc + + arr = self._compliant_series._native_series + result = pc.add(pc.multiply(pc.millisecond(arr), 1000), pc.microsecond(arr)) + + return self._compliant_series._from_native_series(result) + + def nanosecond(self: Self) -> ArrowSeries: + import pyarrow.compute as pc + + arr = self._compliant_series._native_series + result = pc.add( + pc.multiply(self.microsecond()._native_series, 1000), pc.nanosecond(arr) + ) + return self._compliant_series._from_native_series(result) + + def ordinal_day(self: Self) -> ArrowSeries: + import pyarrow.compute as pc + + return self._compliant_series._from_native_series( + pc.day_of_year(self._compliant_series._native_series) + ) + + def weekday(self: Self) -> ArrowSeries: + import pyarrow.compute as pc + + return self._compliant_series._from_native_series( + pc.day_of_week(self._compliant_series._native_series, count_from_zero=False) + ) + + def total_minutes(self: Self) -> ArrowSeries: + import pyarrow as pa + import pyarrow.compute as pc + + arr = self._compliant_series._native_series + unit = arr.type.unit + + unit_to_minutes_factor = { + "s": 60, # seconds + "ms": 60 * 1e3, # milli + "us": 60 * 1e6, # micro + "ns": 60 * 1e9, # nano + } + + factor = pa.scalar(unit_to_minutes_factor[unit], type=pa.int64()) + return self._compliant_series._from_native_series( + pc.cast(pc.divide(arr, factor), pa.int64()) + ) + + def total_seconds(self: Self) -> ArrowSeries: + import pyarrow as pa + import pyarrow.compute as pc + + arr = self._compliant_series._native_series + unit = arr.type.unit + + unit_to_seconds_factor = { + "s": 1, # seconds + "ms": 1e3, # milli + "us": 1e6, # micro + "ns": 1e9, # nano + } + factor = pa.scalar(unit_to_seconds_factor[unit], type=pa.int64()) + + return self._compliant_series._from_native_series( + pc.cast(pc.divide(arr, factor), pa.int64()) + ) + + def total_milliseconds(self: Self) -> ArrowSeries: + import pyarrow as pa + import pyarrow.compute as pc + + arr = self._compliant_series._native_series + unit = arr.type.unit + + unit_to_milli_factor = { + "s": 1e3, # seconds + "ms": 1, # milli + "us": 1e3, # micro + "ns": 1e6, # nano + } + + factor = pa.scalar(unit_to_milli_factor[unit], type=pa.int64()) + + if unit == "s": + return self._compliant_series._from_native_series( + pc.cast(pc.multiply(arr, factor), pa.int64()) + ) + + return self._compliant_series._from_native_series( + pc.cast(pc.divide(arr, factor), pa.int64()) + ) + + def total_microseconds(self: Self) -> ArrowSeries: + import pyarrow as pa + import pyarrow.compute as pc + + arr = self._compliant_series._native_series + unit = arr.type.unit + + unit_to_micro_factor = { + "s": 1e6, # seconds + "ms": 1e3, # milli + "us": 1, # micro + "ns": 1e3, # nano + } + + factor = pa.scalar(unit_to_micro_factor[unit], type=pa.int64()) + + if unit in {"s", "ms"}: + return self._compliant_series._from_native_series( + pc.cast(pc.multiply(arr, factor), pa.int64()) + ) + return self._compliant_series._from_native_series( + pc.cast(pc.divide(arr, factor), pa.int64()) + ) + + def total_nanoseconds(self: Self) -> ArrowSeries: + import pyarrow as pa + import pyarrow.compute as pc + + arr = self._compliant_series._native_series + unit = arr.type.unit + + unit_to_nano_factor = { + "s": 1e9, # seconds + "ms": 1e6, # milli + "us": 1e3, # micro + "ns": 1, # nano + } + + factor = pa.scalar(unit_to_nano_factor[unit], type=pa.int64()) + + return self._compliant_series._from_native_series( + pc.cast(pc.multiply(arr, factor), pa.int64()) + ) diff --git a/narwhals/_arrow/series_list.py b/narwhals/_arrow/series_list.py index e69de29bb..be252ae32 100644 --- a/narwhals/_arrow/series_list.py +++ b/narwhals/_arrow/series_list.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._arrow.series import ArrowSeries + + +class ArrowSeriesListNamespace: + def __init__(self: Self, series: ArrowSeries) -> None: + self._arrow_series = series + + def len(self: Self) -> ArrowSeries: + import pyarrow as pa + import pyarrow.compute as pc + + return self._arrow_series._from_native_series( + pc.cast(pc.list_value_length(self._arrow_series._native_series), pa.uint32()) + ) diff --git a/narwhals/_arrow/series_str.py b/narwhals/_arrow/series_str.py index e69de29bb..7d44df951 100644 --- a/narwhals/_arrow/series_str.py +++ b/narwhals/_arrow/series_str.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from narwhals._arrow.utils import parse_datetime_format + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._arrow.series import ArrowSeries + + +class ArrowSeriesStringNamespace: + def __init__(self: Self, series: ArrowSeries) -> None: + self._compliant_series = series + + def len_chars(self: Self) -> ArrowSeries: + import pyarrow.compute as pc + + return self._compliant_series._from_native_series( + pc.utf8_length(self._compliant_series._native_series) + ) + + def replace( + self: Self, pattern: str, value: str, *, literal: bool, n: int + ) -> ArrowSeries: + import pyarrow.compute as pc + + method = "replace_substring" if literal else "replace_substring_regex" + return self._compliant_series._from_native_series( + getattr(pc, method)( + self._compliant_series._native_series, + pattern=pattern, + replacement=value, + max_replacements=n, + ) + ) + + def replace_all( + self: Self, pattern: str, value: str, *, literal: bool + ) -> ArrowSeries: + return self.replace(pattern, value, literal=literal, n=-1) + + def strip_chars(self: Self, characters: str | None) -> ArrowSeries: + import pyarrow.compute as pc + + whitespace = " \t\n\r\v\f" + return self._compliant_series._from_native_series( + pc.utf8_trim( + self._compliant_series._native_series, + characters or whitespace, + ) + ) + + def starts_with(self: Self, prefix: str) -> ArrowSeries: + import pyarrow.compute as pc + + return self._compliant_series._from_native_series( + pc.equal(self.slice(0, len(prefix))._native_series, prefix) + ) + + def ends_with(self: Self, suffix: str) -> ArrowSeries: + import pyarrow.compute as pc + + return self._compliant_series._from_native_series( + pc.equal(self.slice(-len(suffix), None)._native_series, suffix) + ) + + def contains(self: Self, pattern: str, *, literal: bool) -> ArrowSeries: + import pyarrow.compute as pc + + check_func = pc.match_substring if literal else pc.match_substring_regex + return self._compliant_series._from_native_series( + check_func(self._compliant_series._native_series, pattern) + ) + + def slice(self: Self, offset: int, length: int | None) -> ArrowSeries: + import pyarrow.compute as pc + + stop = offset + length if length is not None else None + return self._compliant_series._from_native_series( + pc.utf8_slice_codeunits( + self._compliant_series._native_series, start=offset, stop=stop + ), + ) + + def to_datetime(self: Self, format: str | None) -> ArrowSeries: # noqa: A002 + import pyarrow.compute as pc + + if format is None: + format = parse_datetime_format(self._compliant_series._native_series) + + return self._compliant_series._from_native_series( + pc.strptime(self._compliant_series._native_series, format=format, unit="us") + ) + + def to_uppercase(self: Self) -> ArrowSeries: + import pyarrow.compute as pc + + return self._compliant_series._from_native_series( + pc.utf8_upper(self._compliant_series._native_series), + ) + + def to_lowercase(self: Self) -> ArrowSeries: + import pyarrow.compute as pc + + return self._compliant_series._from_native_series( + pc.utf8_lower(self._compliant_series._native_series), + ) From 0d2ced26b1578110ad0308f61b2c583836f5d065 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 11 Jan 2025 08:58:00 +0000 Subject: [PATCH 05/10] refactor dask --- narwhals/_dask/expr.py | 492 +----------------------------------- narwhals/_dask/expr_dt.py | 211 ++++++++++++++++ narwhals/_dask/expr_name.py | 175 +++++++++++++ narwhals/_dask/expr_str.py | 117 +++++++++ 4 files changed, 515 insertions(+), 480 deletions(-) create mode 100644 narwhals/_dask/expr_dt.py create mode 100644 narwhals/_dask/expr_name.py create mode 100644 narwhals/_dask/expr_str.py diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 373c29020..951222c35 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -7,20 +7,20 @@ from typing import NoReturn from typing import Sequence +from narwhals._dask.expr_dt import DaskExprDateTimeNamespace +from narwhals._dask.expr_name import DaskExprNameNamespace +from narwhals._dask.expr_str import DaskExprStringNamespace from narwhals._dask.utils import add_row_index from narwhals._dask.utils import binary_operation_returns_scalar from narwhals._dask.utils import maybe_evaluate from narwhals._dask.utils import narwhals_to_native_dtype from narwhals._expression_parsing import infer_new_root_output_names -from narwhals._pandas_like.utils import calculate_timestamp_date -from narwhals._pandas_like.utils import calculate_timestamp_datetime from narwhals._pandas_like.utils import native_to_narwhals_dtype from narwhals.exceptions import ColumnNotFoundError from narwhals.exceptions import InvalidOperationError from narwhals.typing import CompliantExpr from narwhals.utils import Implementation from narwhals.utils import generate_temporary_column_name -from narwhals.utils import import_dtypes_module if TYPE_CHECKING: try: @@ -722,18 +722,6 @@ def func(df: DaskLazyFrame) -> list[Any]: kwargs={**self._kwargs, "keys": keys}, ) - @property - def str(self: Self) -> DaskExprStringNamespace: - return DaskExprStringNamespace(self) - - @property - def dt(self: Self) -> DaskExprDateTimeNamespace: - return DaskExprDateTimeNamespace(self) - - @property - def name(self: Self) -> DaskExprNameNamespace: - return DaskExprNameNamespace(self) - def cast(self: Self, dtype: DType | type[DType]) -> Self: def func(_input: Any, dtype: DType | type[DType]) -> Any: dtype = narwhals_to_native_dtype(dtype, self._version) @@ -752,470 +740,14 @@ def is_finite(self: Self) -> Self: returns_scalar=self._returns_scalar, ) + @property + def str(self: Self) -> DaskExprStringNamespace: + return DaskExprStringNamespace(self) -class DaskExprStringNamespace: - def __init__(self, expr: DaskExpr) -> None: - self._compliant_expr = expr - - def len_chars(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.str.len(), - "len", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def replace( - self, pattern: str, value: str, *, literal: bool = False, n: int = 1 - ) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input, pattern, value, literal, n: _input.str.replace( - pattern, value, regex=not literal, n=n - ), - "replace", - pattern=pattern, - value=value, - literal=literal, - n=n, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def replace_all(self, pattern: str, value: str, *, literal: bool = False) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input, pattern, value, literal: _input.str.replace( - pattern, value, n=-1, regex=not literal - ), - "replace", - pattern=pattern, - value=value, - literal=literal, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def strip_chars(self, characters: str | None = None) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input, characters: _input.str.strip(characters), - "strip", - characters=characters, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def starts_with(self, prefix: str) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input, prefix: _input.str.startswith(prefix), - "starts_with", - prefix=prefix, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def ends_with(self, suffix: str) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input, suffix: _input.str.endswith(suffix), - "ends_with", - suffix=suffix, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def contains(self, pattern: str, *, literal: bool = False) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input, pattern, literal: _input.str.contains( - pat=pattern, regex=not literal - ), - "contains", - pattern=pattern, - literal=literal, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def slice(self, offset: int, length: int | None = None) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input, offset, length: _input.str.slice( - start=offset, stop=offset + length if length else None - ), - "slice", - offset=offset, - length=length, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def to_datetime(self: Self, format: str | None) -> DaskExpr: # noqa: A002 - import dask.dataframe as dd - - return self._compliant_expr._from_call( - lambda _input, format: dd.to_datetime(_input, format=format), - "to_datetime", - format=format, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def to_uppercase(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.str.upper(), - "to_uppercase", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def to_lowercase(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.str.lower(), - "to_lowercase", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - -class DaskExprDateTimeNamespace: - def __init__(self, expr: DaskExpr) -> None: - self._compliant_expr = expr - - def date(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.date, - "date", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def year(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.year, - "year", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def month(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.month, - "month", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def day(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.day, - "day", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def hour(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.hour, - "hour", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def minute(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.minute, - "minute", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def second(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.second, - "second", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def millisecond(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.microsecond // 1000, - "millisecond", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def microsecond(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.microsecond, - "microsecond", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def nanosecond(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.microsecond * 1000 + _input.dt.nanosecond, - "nanosecond", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def ordinal_day(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.dayofyear, - "ordinal_day", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def weekday(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.weekday + 1, # Dask is 0-6 - "weekday", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def to_string(self, format: str) -> DaskExpr: # noqa: A002 - return self._compliant_expr._from_call( - lambda _input, format: _input.dt.strftime(format.replace("%.f", ".%f")), - "strftime", - format=format, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def replace_time_zone(self, time_zone: str | None) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input, time_zone: _input.dt.tz_localize(None).dt.tz_localize( - time_zone - ) - if time_zone is not None - else _input.dt.tz_localize(None), - "tz_localize", - time_zone=time_zone, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def convert_time_zone(self, time_zone: str) -> DaskExpr: - def func(s: dx.Series, time_zone: str) -> dx.Series: - dtype = native_to_narwhals_dtype( - s, self._compliant_expr._version, Implementation.DASK - ) - if dtype.time_zone is None: # type: ignore[attr-defined] - return s.dt.tz_localize("UTC").dt.tz_convert(time_zone) - else: - return s.dt.tz_convert(time_zone) - - return self._compliant_expr._from_call( - func, - "tz_convert", - time_zone=time_zone, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> DaskExpr: - def func(s: dx.Series, time_unit: Literal["ns", "us", "ms"] = "us") -> dx.Series: - dtype = native_to_narwhals_dtype( - s, self._compliant_expr._version, Implementation.DASK - ) - is_pyarrow_dtype = "pyarrow" in str(dtype) - mask_na = s.isna() - dtypes = import_dtypes_module(self._compliant_expr._version) - if dtype == dtypes.Date: - # Date is only supported in pandas dtypes if pyarrow-backed - s_cast = s.astype("Int32[pyarrow]") - result = calculate_timestamp_date(s_cast, time_unit) - elif dtype == dtypes.Datetime: - original_time_unit = dtype.time_unit # type: ignore[attr-defined] - s_cast = ( - s.astype("Int64[pyarrow]") if is_pyarrow_dtype else s.astype("int64") - ) - result = calculate_timestamp_datetime( - s_cast, original_time_unit, time_unit - ) - else: - msg = "Input should be either of Date or Datetime type" - raise TypeError(msg) - return result.where(~mask_na) - - return self._compliant_expr._from_call( - func, - "datetime", - time_unit=time_unit, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def total_minutes(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.total_seconds() // 60, - "total_minutes", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def total_seconds(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.total_seconds() // 1, - "total_seconds", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def total_milliseconds(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.total_seconds() * 1000 // 1, - "total_milliseconds", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def total_microseconds(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.total_seconds() * 1_000_000 // 1, - "total_microseconds", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def total_nanoseconds(self) -> DaskExpr: - return self._compliant_expr._from_call( - lambda _input: _input.dt.total_seconds() * 1_000_000_000 // 1, - "total_nanoseconds", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - -class DaskExprNameNamespace: - def __init__(self: Self, expr: DaskExpr) -> None: - self._compliant_expr = expr - - def keep(self: Self) -> DaskExpr: - root_names = self._compliant_expr._root_names - - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.keep`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - - return self._compliant_expr.__class__( - lambda df: [ - series.rename(name) - for series, name in zip(self._compliant_expr._call(df), root_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=root_names, - returns_scalar=self._compliant_expr._returns_scalar, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs=self._compliant_expr._kwargs, - ) - - def map(self: Self, function: Callable[[str], str]) -> DaskExpr: - root_names = self._compliant_expr._root_names - - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.map`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - - output_names = [function(str(name)) for name in root_names] - - return self._compliant_expr.__class__( - lambda df: [ - series.rename(name) - for series, name in zip(self._compliant_expr._call(df), output_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=output_names, - returns_scalar=self._compliant_expr._returns_scalar, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs={**self._compliant_expr._kwargs, "function": function}, - ) - - def prefix(self: Self, prefix: str) -> DaskExpr: - root_names = self._compliant_expr._root_names - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.prefix`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - - output_names = [prefix + str(name) for name in root_names] - return self._compliant_expr.__class__( - lambda df: [ - series.rename(name) - for series, name in zip(self._compliant_expr._call(df), output_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=output_names, - returns_scalar=self._compliant_expr._returns_scalar, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs={**self._compliant_expr._kwargs, "prefix": prefix}, - ) - - def suffix(self: Self, suffix: str) -> DaskExpr: - root_names = self._compliant_expr._root_names - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.suffix`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - - output_names = [str(name) + suffix for name in root_names] - - return self._compliant_expr.__class__( - lambda df: [ - series.rename(name) - for series, name in zip(self._compliant_expr._call(df), output_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=output_names, - returns_scalar=self._compliant_expr._returns_scalar, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs={**self._compliant_expr._kwargs, "suffix": suffix}, - ) - - def to_lowercase(self: Self) -> DaskExpr: - root_names = self._compliant_expr._root_names - - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.to_lowercase`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - output_names = [str(name).lower() for name in root_names] - - return self._compliant_expr.__class__( - lambda df: [ - series.rename(name) - for series, name in zip(self._compliant_expr._call(df), output_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=output_names, - returns_scalar=self._compliant_expr._returns_scalar, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs=self._compliant_expr._kwargs, - ) - - def to_uppercase(self: Self) -> DaskExpr: - root_names = self._compliant_expr._root_names + @property + def dt(self: Self) -> DaskExprDateTimeNamespace: + return DaskExprDateTimeNamespace(self) - if root_names is None: - msg = ( - "Anonymous expressions are not supported in `.name.to_uppercase`.\n" - "Instead of `nw.all()`, try using a named expression, such as " - "`nw.col('a', 'b')`\n" - ) - raise ValueError(msg) - output_names = [str(name).upper() for name in root_names] - - return self._compliant_expr.__class__( - lambda df: [ - series.rename(name) - for series, name in zip(self._compliant_expr._call(df), output_names) - ], - depth=self._compliant_expr._depth, - function_name=self._compliant_expr._function_name, - root_names=root_names, - output_names=output_names, - returns_scalar=self._compliant_expr._returns_scalar, - backend_version=self._compliant_expr._backend_version, - version=self._compliant_expr._version, - kwargs=self._compliant_expr._kwargs, - ) + @property + def name(self: Self) -> DaskExprNameNamespace: + return DaskExprNameNamespace(self) diff --git a/narwhals/_dask/expr_dt.py b/narwhals/_dask/expr_dt.py new file mode 100644 index 000000000..177f2c236 --- /dev/null +++ b/narwhals/_dask/expr_dt.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Literal + +from narwhals._pandas_like.utils import calculate_timestamp_date +from narwhals._pandas_like.utils import calculate_timestamp_datetime +from narwhals._pandas_like.utils import native_to_narwhals_dtype +from narwhals.utils import Implementation +from narwhals.utils import import_dtypes_module + +if TYPE_CHECKING: + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx + + from narwhals._dask.expr import DaskExpr + + +class DaskExprDateTimeNamespace: + def __init__(self, expr: DaskExpr) -> None: + self._compliant_expr = expr + + def date(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.date, + "date", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def year(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.year, + "year", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def month(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.month, + "month", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def day(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.day, + "day", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def hour(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.hour, + "hour", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def minute(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.minute, + "minute", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def second(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.second, + "second", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def millisecond(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.microsecond // 1000, + "millisecond", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def microsecond(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.microsecond, + "microsecond", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def nanosecond(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.microsecond * 1000 + _input.dt.nanosecond, + "nanosecond", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def ordinal_day(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.dayofyear, + "ordinal_day", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def weekday(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.weekday + 1, # Dask is 0-6 + "weekday", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def to_string(self, format: str) -> DaskExpr: # noqa: A002 + return self._compliant_expr._from_call( + lambda _input, format: _input.dt.strftime(format.replace("%.f", ".%f")), + "strftime", + format=format, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def replace_time_zone(self, time_zone: str | None) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input, time_zone: _input.dt.tz_localize(None).dt.tz_localize( + time_zone + ) + if time_zone is not None + else _input.dt.tz_localize(None), + "tz_localize", + time_zone=time_zone, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def convert_time_zone(self, time_zone: str) -> DaskExpr: + def func(s: dx.Series, time_zone: str) -> dx.Series: + dtype = native_to_narwhals_dtype( + s, self._compliant_expr._version, Implementation.DASK + ) + if dtype.time_zone is None: # type: ignore[attr-defined] + return s.dt.tz_localize("UTC").dt.tz_convert(time_zone) + else: + return s.dt.tz_convert(time_zone) + + return self._compliant_expr._from_call( + func, + "tz_convert", + time_zone=time_zone, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> DaskExpr: + def func(s: dx.Series, time_unit: Literal["ns", "us", "ms"] = "us") -> dx.Series: + dtype = native_to_narwhals_dtype( + s, self._compliant_expr._version, Implementation.DASK + ) + is_pyarrow_dtype = "pyarrow" in str(dtype) + mask_na = s.isna() + dtypes = import_dtypes_module(self._compliant_expr._version) + if dtype == dtypes.Date: + # Date is only supported in pandas dtypes if pyarrow-backed + s_cast = s.astype("Int32[pyarrow]") + result = calculate_timestamp_date(s_cast, time_unit) + elif dtype == dtypes.Datetime: + original_time_unit = dtype.time_unit # type: ignore[attr-defined] + s_cast = ( + s.astype("Int64[pyarrow]") if is_pyarrow_dtype else s.astype("int64") + ) + result = calculate_timestamp_datetime( + s_cast, original_time_unit, time_unit + ) + else: + msg = "Input should be either of Date or Datetime type" + raise TypeError(msg) + return result.where(~mask_na) + + return self._compliant_expr._from_call( + func, + "datetime", + time_unit=time_unit, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def total_minutes(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.total_seconds() // 60, + "total_minutes", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def total_seconds(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.total_seconds() // 1, + "total_seconds", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def total_milliseconds(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.total_seconds() * 1000 // 1, + "total_milliseconds", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def total_microseconds(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.total_seconds() * 1_000_000 // 1, + "total_microseconds", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def total_nanoseconds(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.dt.total_seconds() * 1_000_000_000 // 1, + "total_nanoseconds", + returns_scalar=self._compliant_expr._returns_scalar, + ) diff --git a/narwhals/_dask/expr_name.py b/narwhals/_dask/expr_name.py new file mode 100644 index 000000000..3f5a0e4a2 --- /dev/null +++ b/narwhals/_dask/expr_name.py @@ -0,0 +1,175 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Callable + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._dask.expr import DaskExpr + + +class DaskExprNameNamespace: + def __init__(self: Self, expr: DaskExpr) -> None: + self._compliant_expr = expr + + def keep(self: Self) -> DaskExpr: + root_names = self._compliant_expr._root_names + + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.keep`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + + return self._compliant_expr.__class__( + lambda df: [ + series.rename(name) + for series, name in zip(self._compliant_expr._call(df), root_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=root_names, + returns_scalar=self._compliant_expr._returns_scalar, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs=self._compliant_expr._kwargs, + ) + + def map(self: Self, function: Callable[[str], str]) -> DaskExpr: + root_names = self._compliant_expr._root_names + + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.map`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + + output_names = [function(str(name)) for name in root_names] + + return self._compliant_expr.__class__( + lambda df: [ + series.rename(name) + for series, name in zip(self._compliant_expr._call(df), output_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=output_names, + returns_scalar=self._compliant_expr._returns_scalar, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs={**self._compliant_expr._kwargs, "function": function}, + ) + + def prefix(self: Self, prefix: str) -> DaskExpr: + root_names = self._compliant_expr._root_names + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.prefix`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + + output_names = [prefix + str(name) for name in root_names] + return self._compliant_expr.__class__( + lambda df: [ + series.rename(name) + for series, name in zip(self._compliant_expr._call(df), output_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=output_names, + returns_scalar=self._compliant_expr._returns_scalar, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs={**self._compliant_expr._kwargs, "prefix": prefix}, + ) + + def suffix(self: Self, suffix: str) -> DaskExpr: + root_names = self._compliant_expr._root_names + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.suffix`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + + output_names = [str(name) + suffix for name in root_names] + + return self._compliant_expr.__class__( + lambda df: [ + series.rename(name) + for series, name in zip(self._compliant_expr._call(df), output_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=output_names, + returns_scalar=self._compliant_expr._returns_scalar, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs={**self._compliant_expr._kwargs, "suffix": suffix}, + ) + + def to_lowercase(self: Self) -> DaskExpr: + root_names = self._compliant_expr._root_names + + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.to_lowercase`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + output_names = [str(name).lower() for name in root_names] + + return self._compliant_expr.__class__( + lambda df: [ + series.rename(name) + for series, name in zip(self._compliant_expr._call(df), output_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=output_names, + returns_scalar=self._compliant_expr._returns_scalar, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs=self._compliant_expr._kwargs, + ) + + def to_uppercase(self: Self) -> DaskExpr: + root_names = self._compliant_expr._root_names + + if root_names is None: + msg = ( + "Anonymous expressions are not supported in `.name.to_uppercase`.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + output_names = [str(name).upper() for name in root_names] + + return self._compliant_expr.__class__( + lambda df: [ + series.rename(name) + for series, name in zip(self._compliant_expr._call(df), output_names) + ], + depth=self._compliant_expr._depth, + function_name=self._compliant_expr._function_name, + root_names=root_names, + output_names=output_names, + returns_scalar=self._compliant_expr._returns_scalar, + backend_version=self._compliant_expr._backend_version, + version=self._compliant_expr._version, + kwargs=self._compliant_expr._kwargs, + ) diff --git a/narwhals/_dask/expr_str.py b/narwhals/_dask/expr_str.py new file mode 100644 index 000000000..6b8bb892f --- /dev/null +++ b/narwhals/_dask/expr_str.py @@ -0,0 +1,117 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._dask.expr import DaskExpr + + +class DaskExprStringNamespace: + def __init__(self, expr: DaskExpr) -> None: + self._compliant_expr = expr + + def len_chars(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.str.len(), + "len", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def replace( + self, pattern: str, value: str, *, literal: bool = False, n: int = 1 + ) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input, pattern, value, literal, n: _input.str.replace( + pattern, value, regex=not literal, n=n + ), + "replace", + pattern=pattern, + value=value, + literal=literal, + n=n, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def replace_all(self, pattern: str, value: str, *, literal: bool = False) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input, pattern, value, literal: _input.str.replace( + pattern, value, n=-1, regex=not literal + ), + "replace", + pattern=pattern, + value=value, + literal=literal, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def strip_chars(self, characters: str | None = None) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input, characters: _input.str.strip(characters), + "strip", + characters=characters, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def starts_with(self, prefix: str) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input, prefix: _input.str.startswith(prefix), + "starts_with", + prefix=prefix, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def ends_with(self, suffix: str) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input, suffix: _input.str.endswith(suffix), + "ends_with", + suffix=suffix, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def contains(self, pattern: str, *, literal: bool = False) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input, pattern, literal: _input.str.contains( + pat=pattern, regex=not literal + ), + "contains", + pattern=pattern, + literal=literal, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def slice(self, offset: int, length: int | None = None) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input, offset, length: _input.str.slice( + start=offset, stop=offset + length if length else None + ), + "slice", + offset=offset, + length=length, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def to_datetime(self: Self, format: str | None) -> DaskExpr: # noqa: A002 + import dask.dataframe as dd + + return self._compliant_expr._from_call( + lambda _input, format: dd.to_datetime(_input, format=format), + "to_datetime", + format=format, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def to_uppercase(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.str.upper(), + "to_uppercase", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def to_lowercase(self) -> DaskExpr: + return self._compliant_expr._from_call( + lambda _input: _input.str.lower(), + "to_lowercase", + returns_scalar=self._compliant_expr._returns_scalar, + ) From 3ccab99395f3da2fe9eb347b02ab3a8424173188 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 11 Jan 2025 09:00:31 +0000 Subject: [PATCH 06/10] refactor duck --- narwhals/_duckdb/expr.py | 231 +---------------------------------- narwhals/_duckdb/expr_dt.py | 95 ++++++++++++++ narwhals/_duckdb/expr_str.py | 148 ++++++++++++++++++++++ 3 files changed, 245 insertions(+), 229 deletions(-) create mode 100644 narwhals/_duckdb/expr_dt.py create mode 100644 narwhals/_duckdb/expr_str.py diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py index 7285200f8..5a790ff5c 100644 --- a/narwhals/_duckdb/expr.py +++ b/narwhals/_duckdb/expr.py @@ -4,9 +4,10 @@ from typing import Any from typing import Callable from typing import Literal -from typing import NoReturn from typing import Sequence +from narwhals._duckdb.expr_dt import DuckDBExprDateTimeNamespace +from narwhals._duckdb.expr_str import DuckDBExprStringNamespace from narwhals._duckdb.utils import binary_operation_returns_scalar from narwhals._duckdb.utils import get_column_name from narwhals._duckdb.utils import maybe_evaluate @@ -558,231 +559,3 @@ def str(self: Self) -> DuckDBExprStringNamespace: @property def dt(self: Self) -> DuckDBExprDateTimeNamespace: return DuckDBExprDateTimeNamespace(self) - - -class DuckDBExprStringNamespace: - def __init__(self, expr: DuckDBExpr) -> None: - self._compliant_expr = expr - - def starts_with(self, prefix: str) -> DuckDBExpr: - from duckdb import ConstantExpression - from duckdb import FunctionExpression - - return self._compliant_expr._from_call( - lambda _input: FunctionExpression( - "starts_with", _input, ConstantExpression(prefix) - ), - "starts_with", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def ends_with(self, suffix: str) -> DuckDBExpr: - from duckdb import ConstantExpression - from duckdb import FunctionExpression - - return self._compliant_expr._from_call( - lambda _input: FunctionExpression( - "ends_with", _input, ConstantExpression(suffix) - ), - "ends_with", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def contains(self, pattern: str, *, literal: bool) -> DuckDBExpr: - from duckdb import ConstantExpression - from duckdb import FunctionExpression - - def func(_input: duckdb.Expression) -> duckdb.Expression: - if literal: - return FunctionExpression("contains", _input, ConstantExpression(pattern)) - return FunctionExpression( - "regexp_matches", _input, ConstantExpression(pattern) - ) - - return self._compliant_expr._from_call( - func, "contains", returns_scalar=self._compliant_expr._returns_scalar - ) - - def slice(self, offset: int, length: int) -> DuckDBExpr: - from duckdb import ConstantExpression - from duckdb import FunctionExpression - - def func(_input: duckdb.Expression) -> duckdb.Expression: - return FunctionExpression( - "array_slice", - _input, - ConstantExpression(offset + 1) - if offset >= 0 - else FunctionExpression("length", _input) + offset + 1, - FunctionExpression("length", _input) - if length is None - else ConstantExpression(length) + offset, - ) - - return self._compliant_expr._from_call( - func, "slice", returns_scalar=self._compliant_expr._returns_scalar - ) - - def len_chars(self) -> DuckDBExpr: - from duckdb import FunctionExpression - - return self._compliant_expr._from_call( - lambda _input: FunctionExpression("length", _input), - "len_chars", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def to_lowercase(self) -> DuckDBExpr: - from duckdb import FunctionExpression - - return self._compliant_expr._from_call( - lambda _input: FunctionExpression("lower", _input), - "to_lowercase", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def to_uppercase(self) -> DuckDBExpr: - from duckdb import FunctionExpression - - return self._compliant_expr._from_call( - lambda _input: FunctionExpression("upper", _input), - "to_uppercase", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def strip_chars(self, characters: str | None) -> DuckDBExpr: - import string - - from duckdb import ConstantExpression - from duckdb import FunctionExpression - - return self._compliant_expr._from_call( - lambda _input: FunctionExpression( - "trim", - _input, - ConstantExpression( - string.whitespace if characters is None else characters - ), - ), - "strip_chars", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def replace_all( - self, pattern: str, value: str, *, literal: bool = False - ) -> DuckDBExpr: - from duckdb import ConstantExpression - from duckdb import FunctionExpression - - if literal is False: - return self._compliant_expr._from_call( - lambda _input: FunctionExpression( - "regexp_replace", - _input, - ConstantExpression(pattern), - ConstantExpression(value), - ConstantExpression("g"), - ), - "replace_all", - returns_scalar=self._compliant_expr._returns_scalar, - ) - return self._compliant_expr._from_call( - lambda _input: FunctionExpression( - "replace", _input, ConstantExpression(pattern), ConstantExpression(value) - ), - "replace_all", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def replace(self, pattern: str, value: str, *, literal: bool, n: int) -> NoReturn: - msg = "`replace` is currently not supported for DuckDB" - raise NotImplementedError(msg) - - -class DuckDBExprDateTimeNamespace: - def __init__(self, expr: DuckDBExpr) -> None: - self._compliant_expr = expr - - def year(self) -> DuckDBExpr: - from duckdb import FunctionExpression - - return self._compliant_expr._from_call( - lambda _input: FunctionExpression("year", _input), - "year", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def month(self) -> DuckDBExpr: - from duckdb import FunctionExpression - - return self._compliant_expr._from_call( - lambda _input: FunctionExpression("month", _input), - "month", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def day(self) -> DuckDBExpr: - from duckdb import FunctionExpression - - return self._compliant_expr._from_call( - lambda _input: FunctionExpression("day", _input), - "day", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def hour(self) -> DuckDBExpr: - from duckdb import FunctionExpression - - return self._compliant_expr._from_call( - lambda _input: FunctionExpression("hour", _input), - "hour", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def minute(self) -> DuckDBExpr: - from duckdb import FunctionExpression - - return self._compliant_expr._from_call( - lambda _input: FunctionExpression("minute", _input), - "minute", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def second(self) -> DuckDBExpr: - from duckdb import FunctionExpression - - return self._compliant_expr._from_call( - lambda _input: FunctionExpression("second", _input), - "second", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def millisecond(self) -> DuckDBExpr: - from duckdb import FunctionExpression - - return self._compliant_expr._from_call( - lambda _input: FunctionExpression("millisecond", _input) - - FunctionExpression("second", _input) * 1_000, - "millisecond", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def microsecond(self) -> DuckDBExpr: - from duckdb import FunctionExpression - - return self._compliant_expr._from_call( - lambda _input: FunctionExpression("microsecond", _input) - - FunctionExpression("second", _input) * 1_000_000, - "microsecond", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def nanosecond(self) -> DuckDBExpr: - from duckdb import FunctionExpression - - return self._compliant_expr._from_call( - lambda _input: FunctionExpression("nanosecond", _input) - - FunctionExpression("second", _input) * 1_000_000_000, - "nanosecond", - returns_scalar=self._compliant_expr._returns_scalar, - ) diff --git a/narwhals/_duckdb/expr_dt.py b/narwhals/_duckdb/expr_dt.py new file mode 100644 index 000000000..dc05ebd2d --- /dev/null +++ b/narwhals/_duckdb/expr_dt.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from narwhals._duckdb.expr import DuckDBExpr + + +class DuckDBExprDateTimeNamespace: + def __init__(self, expr: DuckDBExpr) -> None: + self._compliant_expr = expr + + def year(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("year", _input), + "year", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def month(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("month", _input), + "month", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def day(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("day", _input), + "day", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def hour(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("hour", _input), + "hour", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def minute(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("minute", _input), + "minute", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def second(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("second", _input), + "second", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def millisecond(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("millisecond", _input) + - FunctionExpression("second", _input) * 1_000, + "millisecond", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def microsecond(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("microsecond", _input) + - FunctionExpression("second", _input) * 1_000_000, + "microsecond", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def nanosecond(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("nanosecond", _input) + - FunctionExpression("second", _input) * 1_000_000_000, + "nanosecond", + returns_scalar=self._compliant_expr._returns_scalar, + ) diff --git a/narwhals/_duckdb/expr_str.py b/narwhals/_duckdb/expr_str.py new file mode 100644 index 000000000..064dc25e3 --- /dev/null +++ b/narwhals/_duckdb/expr_str.py @@ -0,0 +1,148 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import NoReturn + +if TYPE_CHECKING: + import duckdb + + from narwhals._duckdb.expr import DuckDBExpr + + +class DuckDBExprStringNamespace: + def __init__(self, expr: DuckDBExpr) -> None: + self._compliant_expr = expr + + def starts_with(self, prefix: str) -> DuckDBExpr: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression( + "starts_with", _input, ConstantExpression(prefix) + ), + "starts_with", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def ends_with(self, suffix: str) -> DuckDBExpr: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression( + "ends_with", _input, ConstantExpression(suffix) + ), + "ends_with", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def contains(self, pattern: str, *, literal: bool) -> DuckDBExpr: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + def func(_input: duckdb.Expression) -> duckdb.Expression: + if literal: + return FunctionExpression("contains", _input, ConstantExpression(pattern)) + return FunctionExpression( + "regexp_matches", _input, ConstantExpression(pattern) + ) + + return self._compliant_expr._from_call( + func, "contains", returns_scalar=self._compliant_expr._returns_scalar + ) + + def slice(self, offset: int, length: int) -> DuckDBExpr: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + def func(_input: duckdb.Expression) -> duckdb.Expression: + return FunctionExpression( + "array_slice", + _input, + ConstantExpression(offset + 1) + if offset >= 0 + else FunctionExpression("length", _input) + offset + 1, + FunctionExpression("length", _input) + if length is None + else ConstantExpression(length) + offset, + ) + + return self._compliant_expr._from_call( + func, "slice", returns_scalar=self._compliant_expr._returns_scalar + ) + + def len_chars(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("length", _input), + "len_chars", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def to_lowercase(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("lower", _input), + "to_lowercase", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def to_uppercase(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("upper", _input), + "to_uppercase", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def strip_chars(self, characters: str | None) -> DuckDBExpr: + import string + + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression( + "trim", + _input, + ConstantExpression( + string.whitespace if characters is None else characters + ), + ), + "strip_chars", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def replace_all( + self, pattern: str, value: str, *, literal: bool = False + ) -> DuckDBExpr: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + if literal is False: + return self._compliant_expr._from_call( + lambda _input: FunctionExpression( + "regexp_replace", + _input, + ConstantExpression(pattern), + ConstantExpression(value), + ConstantExpression("g"), + ), + "replace_all", + returns_scalar=self._compliant_expr._returns_scalar, + ) + return self._compliant_expr._from_call( + lambda _input: FunctionExpression( + "replace", _input, ConstantExpression(pattern), ConstantExpression(value) + ), + "replace_all", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def replace(self, pattern: str, value: str, *, literal: bool, n: int) -> NoReturn: + msg = "`replace` is currently not supported for DuckDB" + raise NotImplementedError(msg) From 9d75d179e240c2b1be2fc599b57df41a883b1cf8 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 11 Jan 2025 09:03:26 +0000 Subject: [PATCH 07/10] refactor spark --- narwhals/_spark_like/expr.py | 122 +---------------------------- narwhals/_spark_like/expr_str.py | 130 +++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+), 121 deletions(-) create mode 100644 narwhals/_spark_like/expr_str.py diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index efd3975ff..07b68d26f 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -7,6 +7,7 @@ from typing import Sequence from narwhals._expression_parsing import infer_new_root_output_names +from narwhals._spark_like.expr_str import SparkLikeExprStringNamespace from narwhals._spark_like.utils import get_column_name from narwhals._spark_like.utils import maybe_evaluate from narwhals.typing import CompliantExpr @@ -498,124 +499,3 @@ def is_null(self: Self) -> Self: @property def str(self: Self) -> SparkLikeExprStringNamespace: return SparkLikeExprStringNamespace(self) - - -class SparkLikeExprStringNamespace: - def __init__(self: Self, expr: SparkLikeExpr) -> None: - self._compliant_expr = expr - - def len_chars(self: Self) -> SparkLikeExpr: - from pyspark.sql import functions as F # noqa: N812 - - return self._compliant_expr._from_call( - F.char_length, - "len", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def replace_all( - self: Self, pattern: str, value: str, *, literal: bool = False - ) -> SparkLikeExpr: - from pyspark.sql import functions as F # noqa: N812 - - def func(_input: Column, pattern: str, value: str, *, literal: bool) -> Column: - replace_all_func = F.replace if literal else F.regexp_replace - return replace_all_func(_input, F.lit(pattern), F.lit(value)) - - return self._compliant_expr._from_call( - func, - "replace", - pattern=pattern, - value=value, - literal=literal, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def strip_chars(self: Self, characters: str | None) -> SparkLikeExpr: - import string - - from pyspark.sql import functions as F # noqa: N812 - - def func(_input: Column, characters: str | None) -> Column: - to_remove = characters if characters is not None else string.whitespace - return F.btrim(_input, F.lit(to_remove)) - - return self._compliant_expr._from_call( - func, - "strip", - characters=characters, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def starts_with(self: Self, prefix: str) -> SparkLikeExpr: - from pyspark.sql import functions as F # noqa: N812 - - return self._compliant_expr._from_call( - lambda _input, prefix: F.startswith(_input, F.lit(prefix)), - "starts_with", - prefix=prefix, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def ends_with(self: Self, suffix: str) -> SparkLikeExpr: - from pyspark.sql import functions as F # noqa: N812 - - return self._compliant_expr._from_call( - lambda _input, suffix: F.endswith(_input, F.lit(suffix)), - "ends_with", - suffix=suffix, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def contains(self: Self, pattern: str, *, literal: bool) -> SparkLikeExpr: - from pyspark.sql import functions as F # noqa: N812 - - def func(_input: Column, pattern: str, *, literal: bool) -> Column: - contains_func = F.contains if literal else F.regexp - return contains_func(_input, F.lit(pattern)) - - return self._compliant_expr._from_call( - func, - "contains", - pattern=pattern, - literal=literal, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def slice(self: Self, offset: int, length: int | None = None) -> SparkLikeExpr: - from pyspark.sql import functions as F # noqa: N812 - - # From the docs: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.substring.html - # The position is not zero based, but 1 based index. - def func(_input: Column, offset: int, length: int | None) -> Column: - col_length = F.char_length(_input) - - _offset = col_length + F.lit(offset + 1) if offset < 0 else F.lit(offset + 1) - _length = F.lit(length) if length is not None else col_length - return _input.substr(_offset, _length) - - return self._compliant_expr._from_call( - func, - "slice", - offset=offset, - length=length, - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def to_uppercase(self: Self) -> SparkLikeExpr: - from pyspark.sql import functions as F # noqa: N812 - - return self._compliant_expr._from_call( - F.upper, - "to_uppercase", - returns_scalar=self._compliant_expr._returns_scalar, - ) - - def to_lowercase(self: Self) -> SparkLikeExpr: - from pyspark.sql import functions as F # noqa: N812 - - return self._compliant_expr._from_call( - F.lower, - "to_lowercase", - returns_scalar=self._compliant_expr._returns_scalar, - ) diff --git a/narwhals/_spark_like/expr_str.py b/narwhals/_spark_like/expr_str.py new file mode 100644 index 000000000..2bb6d300c --- /dev/null +++ b/narwhals/_spark_like/expr_str.py @@ -0,0 +1,130 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pyspark.sql import Column + from typing_extensions import Self + + from narwhals._spark_like.expr import SparkLikeExpr + + +class SparkLikeExprStringNamespace: + def __init__(self: Self, expr: SparkLikeExpr) -> None: + self._compliant_expr = expr + + def len_chars(self: Self) -> SparkLikeExpr: + from pyspark.sql import functions as F # noqa: N812 + + return self._compliant_expr._from_call( + F.char_length, + "len", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def replace_all( + self: Self, pattern: str, value: str, *, literal: bool = False + ) -> SparkLikeExpr: + from pyspark.sql import functions as F # noqa: N812 + + def func(_input: Column, pattern: str, value: str, *, literal: bool) -> Column: + replace_all_func = F.replace if literal else F.regexp_replace + return replace_all_func(_input, F.lit(pattern), F.lit(value)) + + return self._compliant_expr._from_call( + func, + "replace", + pattern=pattern, + value=value, + literal=literal, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def strip_chars(self: Self, characters: str | None) -> SparkLikeExpr: + import string + + from pyspark.sql import functions as F # noqa: N812 + + def func(_input: Column, characters: str | None) -> Column: + to_remove = characters if characters is not None else string.whitespace + return F.btrim(_input, F.lit(to_remove)) + + return self._compliant_expr._from_call( + func, + "strip", + characters=characters, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def starts_with(self: Self, prefix: str) -> SparkLikeExpr: + from pyspark.sql import functions as F # noqa: N812 + + return self._compliant_expr._from_call( + lambda _input, prefix: F.startswith(_input, F.lit(prefix)), + "starts_with", + prefix=prefix, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def ends_with(self: Self, suffix: str) -> SparkLikeExpr: + from pyspark.sql import functions as F # noqa: N812 + + return self._compliant_expr._from_call( + lambda _input, suffix: F.endswith(_input, F.lit(suffix)), + "ends_with", + suffix=suffix, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def contains(self: Self, pattern: str, *, literal: bool) -> SparkLikeExpr: + from pyspark.sql import functions as F # noqa: N812 + + def func(_input: Column, pattern: str, *, literal: bool) -> Column: + contains_func = F.contains if literal else F.regexp + return contains_func(_input, F.lit(pattern)) + + return self._compliant_expr._from_call( + func, + "contains", + pattern=pattern, + literal=literal, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def slice(self: Self, offset: int, length: int | None = None) -> SparkLikeExpr: + from pyspark.sql import functions as F # noqa: N812 + + # From the docs: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.substring.html + # The position is not zero based, but 1 based index. + def func(_input: Column, offset: int, length: int | None) -> Column: + col_length = F.char_length(_input) + + _offset = col_length + F.lit(offset + 1) if offset < 0 else F.lit(offset + 1) + _length = F.lit(length) if length is not None else col_length + return _input.substr(_offset, _length) + + return self._compliant_expr._from_call( + func, + "slice", + offset=offset, + length=length, + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def to_uppercase(self: Self) -> SparkLikeExpr: + from pyspark.sql import functions as F # noqa: N812 + + return self._compliant_expr._from_call( + F.upper, + "to_uppercase", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def to_lowercase(self: Self) -> SparkLikeExpr: + from pyspark.sql import functions as F # noqa: N812 + + return self._compliant_expr._from_call( + F.lower, + "to_lowercase", + returns_scalar=self._compliant_expr._returns_scalar, + ) From 5aa9e670172de300b52bd47a3c7b6914dc173d88 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 11 Jan 2025 09:07:55 +0000 Subject: [PATCH 08/10] move top-level functions from expr to functions --- narwhals/__init__.py | 36 +- narwhals/_expression_parsing.py | 13 + narwhals/expr.py | 1221 +------------------------------ narwhals/functions.py | 1212 ++++++++++++++++++++++++++++++ narwhals/stable/v1/__init__.py | 6 +- 5 files changed, 1247 insertions(+), 1241 deletions(-) diff --git a/narwhals/__init__.py b/narwhals/__init__.py index ac0a8cbe4..fedc35b37 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -34,35 +34,35 @@ from narwhals.dtypes import UInt128 from narwhals.dtypes import Unknown from narwhals.expr import Expr -from narwhals.expr import all_ as all -from narwhals.expr import all_horizontal -from narwhals.expr import any_horizontal -from narwhals.expr import col -from narwhals.expr import concat_str -from narwhals.expr import len_ as len -from narwhals.expr import lit -from narwhals.expr import max -from narwhals.expr import max_horizontal -from narwhals.expr import mean -from narwhals.expr import mean_horizontal -from narwhals.expr import median -from narwhals.expr import min -from narwhals.expr import min_horizontal -from narwhals.expr import nth -from narwhals.expr import sum -from narwhals.expr import sum_horizontal -from narwhals.expr import when +from narwhals.functions import all_ as all +from narwhals.functions import all_horizontal +from narwhals.functions import any_horizontal +from narwhals.functions import col from narwhals.functions import concat +from narwhals.functions import concat_str from narwhals.functions import from_arrow from narwhals.functions import from_dict from narwhals.functions import from_numpy from narwhals.functions import get_level +from narwhals.functions import len_ as len +from narwhals.functions import lit +from narwhals.functions import max +from narwhals.functions import max_horizontal +from narwhals.functions import mean +from narwhals.functions import mean_horizontal +from narwhals.functions import median +from narwhals.functions import min +from narwhals.functions import min_horizontal from narwhals.functions import new_series +from narwhals.functions import nth from narwhals.functions import read_csv from narwhals.functions import read_parquet from narwhals.functions import scan_csv from narwhals.functions import scan_parquet from narwhals.functions import show_versions +from narwhals.functions import sum +from narwhals.functions import sum_horizontal +from narwhals.functions import when from narwhals.schema import Schema from narwhals.series import Series from narwhals.translate import from_native diff --git a/narwhals/_expression_parsing.py b/narwhals/_expression_parsing.py index 99bb3bb24..ac7e645f2 100644 --- a/narwhals/_expression_parsing.py +++ b/narwhals/_expression_parsing.py @@ -321,3 +321,16 @@ def reduce_output_names(parsed_exprs: Sequence[CompliantExpr[Any]]) -> list[str] if parsed_exprs[0]._output_names is not None else None ) + + +def extract_compliant( + plx: CompliantNamespace[CompliantSeriesT_co], other: Any +) -> CompliantExpr[CompliantSeriesT_co] | CompliantSeriesT_co | Any: + from narwhals.expr import Expr + from narwhals.series import Series + + if isinstance(other, Expr): + return other._to_compliant_expr(plx) + if isinstance(other, Series): + return other._compliant_series + return other diff --git a/narwhals/expr.py b/narwhals/expr.py index ae450289d..84646cf8b 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -8,7 +8,7 @@ from typing import Mapping from typing import Sequence -from narwhals.dependencies import is_numpy_array +from narwhals._expression_parsing import extract_compliant from narwhals.dtypes import _validate_dtype from narwhals.expr_cat import ExprCatNamespace from narwhals.expr_dt import ExprDateTimeNamespace @@ -24,22 +24,9 @@ from narwhals.dtypes import DType from narwhals.typing import CompliantExpr from narwhals.typing import CompliantNamespace - from narwhals.typing import CompliantSeriesT_co from narwhals.typing import IntoExpr -def extract_compliant( - plx: CompliantNamespace[CompliantSeriesT_co], other: Any -) -> CompliantExpr[CompliantSeriesT_co] | CompliantSeriesT_co | Any: - from narwhals.series import Series - - if isinstance(other, Expr): - return other._to_compliant_expr(plx) - if isinstance(other, Series): - return other._compliant_series - return other - - class Expr: def __init__(self, to_compliant_expr: Callable[[Any], Any]) -> None: # callable from CompliantNamespace to CompliantExpr @@ -4235,1212 +4222,6 @@ def list(self: Self) -> ExprListNamespace[Self]: return ExprListNamespace(self) -def col(*names: str | Iterable[str]) -> Expr: - """Creates an expression that references one or more columns by their name(s). - - Arguments: - names: Name(s) of the columns to use. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [1, 2], "b": [3, 4]} - >>> df_pl = pl.DataFrame(data) - >>> df_pd = pd.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_col(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select(nw.col("a") * nw.col("b")).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow to - `agnostic_col`: - - >>> agnostic_col(df_pd) - a - 0 3 - 1 8 - - >>> agnostic_col(df_pl) - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 3 │ - │ 8 │ - └─────┘ - - >>> agnostic_col(df_pa) - pyarrow.Table - a: int64 - ---- - a: [[3,8]] - """ - - def func(plx: Any) -> Any: - return plx.col(*flatten(names)) - - return Expr(func) - - -def nth(*indices: int | Sequence[int]) -> Expr: - """Creates an expression that references one or more columns by their index(es). - - Notes: - `nth` is not supported for Polars version<1.0.0. Please use - [`narwhals.col`][] instead. - - Arguments: - indices: One or more indices representing the columns to retrieve. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [1, 2], "b": [3, 4]} - >>> df_pl = pl.DataFrame(data) - >>> df_pd = pd.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_nth(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select(nw.nth(0) * 2).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_nth`: - - >>> agnostic_nth(df_pd) - a - 0 2 - 1 4 - - >>> agnostic_nth(df_pl) - shape: (2, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - │ 4 │ - └─────┘ - - >>> agnostic_nth(df_pa) - pyarrow.Table - a: int64 - ---- - a: [[2,4]] - """ - - def func(plx: Any) -> Any: - return plx.nth(*flatten(indices)) - - return Expr(func) - - -# Add underscore so it doesn't conflict with builtin `all` -def all_() -> Expr: - """Instantiate an expression representing all columns. - - Returns: - A new expression. - - Examples: - >>> import polars as pl - >>> import pandas as pd - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - Let's define a dataframe-agnostic function: - - >>> def agnostic_all(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select(nw.all() * 2).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow to - `agnostic_all`: - - >>> agnostic_all(df_pd) - a b - 0 2 8 - 1 4 10 - 2 6 12 - - >>> agnostic_all(df_pl) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 2 ┆ 8 │ - │ 4 ┆ 10 │ - │ 6 ┆ 12 │ - └─────┴─────┘ - - >>> agnostic_all(df_pa) - pyarrow.Table - a: int64 - b: int64 - ---- - a: [[2,4,6]] - b: [[8,10,12]] - """ - return Expr(lambda plx: plx.all()) - - -# Add underscore so it doesn't conflict with builtin `len` -def len_() -> Expr: - """Return the number of rows. - - Returns: - A new expression. - - Examples: - >>> import polars as pl - >>> import pandas as pd - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [1, 2], "b": [5, 10]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - Let's define a dataframe-agnostic function: - - >>> def agnostic_len(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select(nw.len()).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow to - `agnostic_len`: - - >>> agnostic_len(df_pd) - len - 0 2 - >>> agnostic_len(df_pl) - shape: (1, 1) - ┌─────┐ - │ len │ - │ --- │ - │ u32 │ - ╞═════╡ - │ 2 │ - └─────┘ - >>> agnostic_len(df_pa) - pyarrow.Table - len: int64 - ---- - len: [[2]] - """ - - def func(plx: Any) -> Any: - return plx.len() - - return Expr(func) - - -def sum(*columns: str) -> Expr: - """Sum all values. - - Note: - Syntactic sugar for ``nw.col(columns).sum()`` - - Arguments: - columns: Name(s) of the columns to use in the aggregation function - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [1, 2]} - >>> df_pl = pl.DataFrame(data) - >>> df_pd = pd.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_sum(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select(nw.sum("a")).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow to - `agnostic_sum`: - - >>> agnostic_sum(df_pd) - a - 0 3 - - >>> agnostic_sum(df_pl) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 3 │ - └─────┘ - - >>> agnostic_sum(df_pa) - pyarrow.Table - a: int64 - ---- - a: [[3]] - """ - return Expr(lambda plx: plx.col(*columns).sum()) - - -def mean(*columns: str) -> Expr: - """Get the mean value. - - Note: - Syntactic sugar for ``nw.col(columns).mean()`` - - Arguments: - columns: Name(s) of the columns to use in the aggregation function - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [1, 8, 3]} - >>> df_pl = pl.DataFrame(data) - >>> df_pd = pd.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe agnostic function: - - >>> def agnostic_mean(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select(nw.mean("a")).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow to - `agnostic_mean`: - - >>> agnostic_mean(df_pd) - a - 0 4.0 - - >>> agnostic_mean(df_pl) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 4.0 │ - └─────┘ - - >>> agnostic_mean(df_pa) - pyarrow.Table - a: double - ---- - a: [[4]] - """ - return Expr(lambda plx: plx.col(*columns).mean()) - - -def median(*columns: str) -> Expr: - """Get the median value. - - Notes: - - Syntactic sugar for ``nw.col(columns).median()`` - - Results might slightly differ across backends due to differences in the - underlying algorithms used to compute the median. - - Arguments: - columns: Name(s) of the columns to use in the aggregation function - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [4, 5, 2]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - Let's define a dataframe agnostic function: - - >>> def agnostic_median(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select(nw.median("a")).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_median`: - - >>> agnostic_median(df_pd) - a - 0 4.0 - - >>> agnostic_median(df_pl) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 4.0 │ - └─────┘ - - >>> agnostic_median(df_pa) - pyarrow.Table - a: double - ---- - a: [[4]] - """ - return Expr(lambda plx: plx.col(*columns).median()) - - -def min(*columns: str) -> Expr: - """Return the minimum value. - - Note: - Syntactic sugar for ``nw.col(columns).min()``. - - Arguments: - columns: Name(s) of the columns to use in the aggregation function. - - Returns: - A new expression. - - Examples: - >>> import polars as pl - >>> import pandas as pd - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [1, 2], "b": [5, 10]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - Let's define a dataframe-agnostic function: - - >>> def agnostic_min(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select(nw.min("b")).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow to - `agnostic_min`: - - >>> agnostic_min(df_pd) - b - 0 5 - - >>> agnostic_min(df_pl) - shape: (1, 1) - ┌─────┐ - │ b │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 5 │ - └─────┘ - - >>> agnostic_min(df_pa) - pyarrow.Table - b: int64 - ---- - b: [[5]] - """ - return Expr(lambda plx: plx.col(*columns).min()) - - -def max(*columns: str) -> Expr: - """Return the maximum value. - - Note: - Syntactic sugar for ``nw.col(columns).max()``. - - Arguments: - columns: Name(s) of the columns to use in the aggregation function. - - Returns: - A new expression. - - Examples: - >>> import polars as pl - >>> import pandas as pd - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [1, 2], "b": [5, 10]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - Let's define a dataframe-agnostic function: - - >>> def agnostic_max(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select(nw.max("a")).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow to - `agnostic_max`: - - >>> agnostic_max(df_pd) - a - 0 2 - - >>> agnostic_max(df_pl) - shape: (1, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 2 │ - └─────┘ - - >>> agnostic_max(df_pa) - pyarrow.Table - a: int64 - ---- - a: [[2]] - """ - return Expr(lambda plx: plx.col(*columns).max()) - - -def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: - """Sum all values horizontally across columns. - - Warning: - Unlike Polars, we support horizontal sum over numeric columns only. - - Arguments: - exprs: Name(s) of the columns to use in the aggregation function. Accepts - expression input. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [1, 2, 3], "b": [5, 10, None]} - >>> df_pl = pl.DataFrame(data) - >>> df_pd = pd.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_sum_horizontal(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select(nw.sum_horizontal("a", "b")).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_sum_horizontal`: - - >>> agnostic_sum_horizontal(df_pd) - a - 0 6.0 - 1 12.0 - 2 3.0 - - >>> agnostic_sum_horizontal(df_pl) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 6 │ - │ 12 │ - │ 3 │ - └─────┘ - - >>> agnostic_sum_horizontal(df_pa) - pyarrow.Table - a: int64 - ---- - a: [[6,12,3]] - """ - if not exprs: - msg = "At least one expression must be passed to `sum_horizontal`" - raise ValueError(msg) - return Expr( - lambda plx: plx.sum_horizontal( - *[extract_compliant(plx, v) for v in flatten(exprs)] - ) - ) - - -def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: - """Get the minimum value horizontally across columns. - - Notes: - We support `min_horizontal` over numeric columns only. - - Arguments: - exprs: Name(s) of the columns to use in the aggregation function. Accepts - expression input. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "a": [1, 8, 3], - ... "b": [4, 5, None], - ... "c": ["x", "y", "z"], - ... } - - We define a dataframe-agnostic function that computes the horizontal min of "a" - and "b" columns: - - >>> def agnostic_min_horizontal(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select(nw.min_horizontal("a", "b")).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow to - `agnostic_min_horizontal`: - - >>> agnostic_min_horizontal(pd.DataFrame(data)) - a - 0 1.0 - 1 5.0 - 2 3.0 - - >>> agnostic_min_horizontal(pl.DataFrame(data)) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 5 │ - │ 3 │ - └─────┘ - - >>> agnostic_min_horizontal(pa.table(data)) - pyarrow.Table - a: int64 - ---- - a: [[1,5,3]] - """ - if not exprs: - msg = "At least one expression must be passed to `min_horizontal`" - raise ValueError(msg) - return Expr( - lambda plx: plx.min_horizontal( - *[extract_compliant(plx, v) for v in flatten(exprs)] - ) - ) - - -def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: - """Get the maximum value horizontally across columns. - - Notes: - We support `max_horizontal` over numeric columns only. - - Arguments: - exprs: Name(s) of the columns to use in the aggregation function. Accepts - expression input. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "a": [1, 8, 3], - ... "b": [4, 5, None], - ... "c": ["x", "y", "z"], - ... } - - We define a dataframe-agnostic function that computes the horizontal max of "a" - and "b" columns: - - >>> def agnostic_max_horizontal(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select(nw.max_horizontal("a", "b")).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow to - `agnostic_max_horizontal`: - - >>> agnostic_max_horizontal(pd.DataFrame(data)) - a - 0 4.0 - 1 8.0 - 2 3.0 - - >>> agnostic_max_horizontal(pl.DataFrame(data)) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 4 │ - │ 8 │ - │ 3 │ - └─────┘ - - >>> agnostic_max_horizontal(pa.table(data)) - pyarrow.Table - a: int64 - ---- - a: [[4,8,3]] - """ - if not exprs: - msg = "At least one expression must be passed to `max_horizontal`" - raise ValueError(msg) - return Expr( - lambda plx: plx.max_horizontal( - *[extract_compliant(plx, v) for v in flatten(exprs)] - ) - ) - - -class When: - def __init__(self, *predicates: IntoExpr | Iterable[IntoExpr]) -> None: - self._predicates = flatten([predicates]) - if not self._predicates: - msg = "At least one predicate needs to be provided to `narwhals.when`." - raise TypeError(msg) - - def _extract_predicates(self, plx: Any) -> Any: - return [extract_compliant(plx, v) for v in self._predicates] - - def then(self, value: Any) -> Then: - return Then( - lambda plx: plx.when(*self._extract_predicates(plx)).then( - extract_compliant(plx, value) - ) - ) - - -class Then(Expr): - def otherwise(self, value: Any) -> Expr: - return Expr( - lambda plx: self._to_compliant_expr(plx).otherwise( - extract_compliant(plx, value) - ) - ) - - -def when(*predicates: IntoExpr | Iterable[IntoExpr]) -> When: - """Start a `when-then-otherwise` expression. - - Expression similar to an `if-else` statement in Python. Always initiated by a - `pl.when().then()`, and optionally followed by - chaining one or more `.when().then()` statements. - Chained when-then operations should be read as Python `if, elif, ... elif` - blocks, not as `if, if, ... if`, i.e. the first condition that evaluates to - `True` will be picked. - If none of the conditions are `True`, an optional - `.otherwise()` can be appended at the end. - If not appended, and none of the conditions are `True`, `None` will be returned. - - Arguments: - predicates: Condition(s) that must be met in order to apply the subsequent - statement. Accepts one or more boolean expressions, which are implicitly - combined with `&`. String input is parsed as a column name. - - Returns: - A "when" object, which `.then` can be called on. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [1, 2, 3], "b": [5, 10, 15]} - >>> df_pl = pl.DataFrame(data) - >>> df_pd = pd.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_when_then_otherwise(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... nw.when(nw.col("a") < 3).then(5).otherwise(6).alias("a_when") - ... ).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow to - `agnostic_when_then_otherwise`: - - >>> agnostic_when_then_otherwise(df_pd) - a b a_when - 0 1 5 5 - 1 2 10 5 - 2 3 15 6 - - >>> agnostic_when_then_otherwise(df_pl) - shape: (3, 3) - ┌─────┬─────┬────────┐ - │ a ┆ b ┆ a_when │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ i32 │ - ╞═════╪═════╪════════╡ - │ 1 ┆ 5 ┆ 5 │ - │ 2 ┆ 10 ┆ 5 │ - │ 3 ┆ 15 ┆ 6 │ - └─────┴─────┴────────┘ - - >>> agnostic_when_then_otherwise(df_pa) - pyarrow.Table - a: int64 - b: int64 - a_when: int64 - ---- - a: [[1,2,3]] - b: [[5,10,15]] - a_when: [[5,5,6]] - """ - return When(*predicates) - - -def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: - r"""Compute the bitwise AND horizontally across columns. - - Arguments: - exprs: Name(s) of the columns to use in the aggregation function. Accepts - expression input. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "a": [False, False, True, True, False, None], - ... "b": [False, True, True, None, None, None], - ... } - >>> df_pl = pl.DataFrame(data) - >>> df_pd = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow") - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_all_horizontal(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select("a", "b", all=nw.all_horizontal("a", "b")).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow to - `agnostic_all_horizontal`: - - >>> agnostic_all_horizontal(df_pd) - a b all - 0 False False False - 1 False True False - 2 True True True - 3 True - 4 False False - 5 - - >>> agnostic_all_horizontal(df_pl) - shape: (6, 3) - ┌───────┬───────┬───────┐ - │ a ┆ b ┆ all │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞═══════╪═══════╪═══════╡ - │ false ┆ false ┆ false │ - │ false ┆ true ┆ false │ - │ true ┆ true ┆ true │ - │ true ┆ null ┆ null │ - │ false ┆ null ┆ false │ - │ null ┆ null ┆ null │ - └───────┴───────┴───────┘ - - >>> agnostic_all_horizontal(df_pa) - pyarrow.Table - a: bool - b: bool - all: bool - ---- - a: [[false,false,true,true,false,null]] - b: [[false,true,true,null,null,null]] - all: [[false,false,true,null,false,null]] - """ - if not exprs: - msg = "At least one expression must be passed to `all_horizontal`" - raise ValueError(msg) - return Expr( - lambda plx: plx.all_horizontal( - *[extract_compliant(plx, v) for v in flatten(exprs)] - ) - ) - - -def lit(value: Any, dtype: DType | type[DType] | None = None) -> Expr: - """Return an expression representing a literal value. - - Arguments: - value: The value to use as literal. - dtype: The data type of the literal value. If not provided, the data type will - be inferred. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [1, 2]} - >>> df_pl = pl.DataFrame(data) - >>> df_pd = pd.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_lit(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns(nw.lit(3)).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow to - `agnostic_lit`: - - >>> agnostic_lit(df_pd) - a literal - 0 1 3 - 1 2 3 - - >>> agnostic_lit(df_pl) - shape: (2, 2) - ┌─────┬─────────┐ - │ a ┆ literal │ - │ --- ┆ --- │ - │ i64 ┆ i32 │ - ╞═════╪═════════╡ - │ 1 ┆ 3 │ - │ 2 ┆ 3 │ - └─────┴─────────┘ - - >>> agnostic_lit(df_pa) - pyarrow.Table - a: int64 - literal: int64 - ---- - a: [[1,2]] - literal: [[3,3]] - """ - if is_numpy_array(value): - msg = ( - "numpy arrays are not supported as literal values. " - "Consider using `with_columns` to create a new column from the array." - ) - raise ValueError(msg) - - if isinstance(value, (list, tuple)): - msg = f"Nested datatypes are not supported yet. Got {value}" - raise NotImplementedError(msg) - - return Expr(lambda plx: plx.lit(value, dtype)) - - -def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: - r"""Compute the bitwise OR horizontally across columns. - - Arguments: - exprs: Name(s) of the columns to use in the aggregation function. Accepts - expression input. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "a": [False, False, True, True, False, None], - ... "b": [False, True, True, None, None, None], - ... } - >>> df_pl = pl.DataFrame(data) - >>> df_pd = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow") - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_any_horizontal(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select("a", "b", any=nw.any_horizontal("a", "b")).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow to - `agnostic_any_horizontal`: - - >>> agnostic_any_horizontal(df_pd) - a b any - 0 False False False - 1 False True True - 2 True True True - 3 True True - 4 False - 5 - - >>> agnostic_any_horizontal(df_pl) - shape: (6, 3) - ┌───────┬───────┬───────┐ - │ a ┆ b ┆ any │ - │ --- ┆ --- ┆ --- │ - │ bool ┆ bool ┆ bool │ - ╞═══════╪═══════╪═══════╡ - │ false ┆ false ┆ false │ - │ false ┆ true ┆ true │ - │ true ┆ true ┆ true │ - │ true ┆ null ┆ true │ - │ false ┆ null ┆ null │ - │ null ┆ null ┆ null │ - └───────┴───────┴───────┘ - - >>> agnostic_any_horizontal(df_pa) - pyarrow.Table - a: bool - b: bool - any: bool - ---- - a: [[false,false,true,true,false,null]] - b: [[false,true,true,null,null,null]] - any: [[false,true,true,true,null,null]] - """ - if not exprs: - msg = "At least one expression must be passed to `any_horizontal`" - raise ValueError(msg) - return Expr( - lambda plx: plx.any_horizontal( - *[extract_compliant(plx, v) for v in flatten(exprs)] - ) - ) - - -def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: - """Compute the mean of all values horizontally across columns. - - Arguments: - exprs: Name(s) of the columns to use in the aggregation function. Accepts - expression input. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "a": [1, 8, 3], - ... "b": [4, 5, None], - ... "c": ["x", "y", "z"], - ... } - >>> df_pl = pl.DataFrame(data) - >>> df_pd = pd.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function that computes the horizontal mean of "a" - and "b" columns: - - >>> def agnostic_mean_horizontal(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select(nw.mean_horizontal("a", "b")).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow to - `agnostic_mean_horizontal`: - - >>> agnostic_mean_horizontal(df_pd) - a - 0 2.5 - 1 6.5 - 2 3.0 - - >>> agnostic_mean_horizontal(df_pl) - shape: (3, 1) - ┌─────┐ - │ a │ - │ --- │ - │ f64 │ - ╞═════╡ - │ 2.5 │ - │ 6.5 │ - │ 3.0 │ - └─────┘ - - >>> agnostic_mean_horizontal(df_pa) - pyarrow.Table - a: double - ---- - a: [[2.5,6.5,3]] - """ - if not exprs: - msg = "At least one expression must be passed to `mean_horizontal`" - raise ValueError(msg) - return Expr( - lambda plx: plx.mean_horizontal( - *[extract_compliant(plx, v) for v in flatten(exprs)] - ) - ) - - -def concat_str( - exprs: IntoExpr | Iterable[IntoExpr], - *more_exprs: IntoExpr, - separator: str = "", - ignore_nulls: bool = False, -) -> Expr: - r"""Horizontally concatenate columns into a single string column. - - Arguments: - exprs: Columns to concatenate into a single string column. Accepts expression - input. Strings are parsed as column names, other non-expression inputs are - parsed as literals. Non-`String` columns are cast to `String`. - *more_exprs: Additional columns to concatenate into a single string column, - specified as positional arguments. - separator: String that will be used to separate the values of each column. - ignore_nulls: Ignore null values (default is `False`). - If set to `False`, null values will be propagated and if the row contains any - null values, the output is null. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "a": [1, 2, 3], - ... "b": ["dogs", "cats", None], - ... "c": ["play", "swim", "walk"], - ... } - - We define a dataframe-agnostic function that computes the horizontal string - concatenation of different columns - - >>> def agnostic_concat_str(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select( - ... nw.concat_str( - ... [ - ... nw.col("a") * 2, - ... nw.col("b"), - ... nw.col("c"), - ... ], - ... separator=" ", - ... ).alias("full_sentence") - ... ).to_native() - - We can pass any supported library such as Pandas, Polars, or PyArrow - to `agnostic_concat_str`: - - >>> agnostic_concat_str(pd.DataFrame(data)) - full_sentence - 0 2 dogs play - 1 4 cats swim - 2 None - - >>> agnostic_concat_str(pl.DataFrame(data)) - shape: (3, 1) - ┌───────────────┐ - │ full_sentence │ - │ --- │ - │ str │ - ╞═══════════════╡ - │ 2 dogs play │ - │ 4 cats swim │ - │ null │ - └───────────────┘ - - >>> agnostic_concat_str(pa.table(data)) - pyarrow.Table - full_sentence: string - ---- - full_sentence: [["2 dogs play","4 cats swim",null]] - """ - return Expr( - lambda plx: plx.concat_str( - [extract_compliant(plx, v) for v in flatten([exprs])], - *[extract_compliant(plx, v) for v in more_exprs], - separator=separator, - ignore_nulls=ignore_nulls, - ) - ) - - __all__ = [ "Expr", ] diff --git a/narwhals/functions.py b/narwhals/functions.py index ed167fb0d..a8d098a39 100644 --- a/narwhals/functions.py +++ b/narwhals/functions.py @@ -7,15 +7,20 @@ from typing import Iterable from typing import Literal from typing import Protocol +from typing import Sequence from typing import TypeVar from typing import Union +from narwhals._expression_parsing import extract_compliant from narwhals._pandas_like.utils import broadcast_align_and_extract_native from narwhals.dataframe import DataFrame from narwhals.dataframe import LazyFrame +from narwhals.dependencies import is_numpy_array +from narwhals.expr import Expr from narwhals.translate import from_native from narwhals.utils import Implementation from narwhals.utils import Version +from narwhals.utils import flatten from narwhals.utils import parse_version from narwhals.utils import validate_laziness @@ -33,6 +38,7 @@ from narwhals.dtypes import DType from narwhals.schema import Schema from narwhals.series import Series + from narwhals.typing import IntoExpr from narwhals.typing import IntoSeriesT class ArrowStreamExportable(Protocol): @@ -1291,3 +1297,1209 @@ def _scan_parquet_impl( msg = "Unknown namespace is expected to implement `scan_parquet` function." raise AttributeError(msg) from e return from_native(native_frame).lazy() + + +def col(*names: str | Iterable[str]) -> Expr: + """Creates an expression that references one or more columns by their name(s). + + Arguments: + names: Name(s) of the columns to use. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [1, 2], "b": [3, 4]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_col(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select(nw.col("a") * nw.col("b")).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_col`: + + >>> agnostic_col(df_pd) + a + 0 3 + 1 8 + + >>> agnostic_col(df_pl) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 8 │ + └─────┘ + + >>> agnostic_col(df_pa) + pyarrow.Table + a: int64 + ---- + a: [[3,8]] + """ + + def func(plx: Any) -> Any: + return plx.col(*flatten(names)) + + return Expr(func) + + +def nth(*indices: int | Sequence[int]) -> Expr: + """Creates an expression that references one or more columns by their index(es). + + Notes: + `nth` is not supported for Polars version<1.0.0. Please use + [`narwhals.col`][] instead. + + Arguments: + indices: One or more indices representing the columns to retrieve. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [1, 2], "b": [3, 4]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_nth(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select(nw.nth(0) * 2).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_nth`: + + >>> agnostic_nth(df_pd) + a + 0 2 + 1 4 + + >>> agnostic_nth(df_pl) + shape: (2, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + │ 4 │ + └─────┘ + + >>> agnostic_nth(df_pa) + pyarrow.Table + a: int64 + ---- + a: [[2,4]] + """ + + def func(plx: Any) -> Any: + return plx.nth(*flatten(indices)) + + return Expr(func) + + +# Add underscore so it doesn't conflict with builtin `all` +def all_() -> Expr: + """Instantiate an expression representing all columns. + + Returns: + A new expression. + + Examples: + >>> import polars as pl + >>> import pandas as pd + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + Let's define a dataframe-agnostic function: + + >>> def agnostic_all(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select(nw.all() * 2).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_all`: + + >>> agnostic_all(df_pd) + a b + 0 2 8 + 1 4 10 + 2 6 12 + + >>> agnostic_all(df_pl) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 2 ┆ 8 │ + │ 4 ┆ 10 │ + │ 6 ┆ 12 │ + └─────┴─────┘ + + >>> agnostic_all(df_pa) + pyarrow.Table + a: int64 + b: int64 + ---- + a: [[2,4,6]] + b: [[8,10,12]] + """ + return Expr(lambda plx: plx.all()) + + +# Add underscore so it doesn't conflict with builtin `len` +def len_() -> Expr: + """Return the number of rows. + + Returns: + A new expression. + + Examples: + >>> import polars as pl + >>> import pandas as pd + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [1, 2], "b": [5, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + Let's define a dataframe-agnostic function: + + >>> def agnostic_len(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select(nw.len()).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_len`: + + >>> agnostic_len(df_pd) + len + 0 2 + >>> agnostic_len(df_pl) + shape: (1, 1) + ┌─────┐ + │ len │ + │ --- │ + │ u32 │ + ╞═════╡ + │ 2 │ + └─────┘ + >>> agnostic_len(df_pa) + pyarrow.Table + len: int64 + ---- + len: [[2]] + """ + + def func(plx: Any) -> Any: + return plx.len() + + return Expr(func) + + +def sum(*columns: str) -> Expr: + """Sum all values. + + Note: + Syntactic sugar for ``nw.col(columns).sum()`` + + Arguments: + columns: Name(s) of the columns to use in the aggregation function + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [1, 2]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_sum(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select(nw.sum("a")).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_sum`: + + >>> agnostic_sum(df_pd) + a + 0 3 + + >>> agnostic_sum(df_pl) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + └─────┘ + + >>> agnostic_sum(df_pa) + pyarrow.Table + a: int64 + ---- + a: [[3]] + """ + return Expr(lambda plx: plx.col(*columns).sum()) + + +def mean(*columns: str) -> Expr: + """Get the mean value. + + Note: + Syntactic sugar for ``nw.col(columns).mean()`` + + Arguments: + columns: Name(s) of the columns to use in the aggregation function + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [1, 8, 3]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe agnostic function: + + >>> def agnostic_mean(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select(nw.mean("a")).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_mean`: + + >>> agnostic_mean(df_pd) + a + 0 4.0 + + >>> agnostic_mean(df_pl) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + └─────┘ + + >>> agnostic_mean(df_pa) + pyarrow.Table + a: double + ---- + a: [[4]] + """ + return Expr(lambda plx: plx.col(*columns).mean()) + + +def median(*columns: str) -> Expr: + """Get the median value. + + Notes: + - Syntactic sugar for ``nw.col(columns).median()`` + - Results might slightly differ across backends due to differences in the + underlying algorithms used to compute the median. + + Arguments: + columns: Name(s) of the columns to use in the aggregation function + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [4, 5, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + Let's define a dataframe agnostic function: + + >>> def agnostic_median(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select(nw.median("a")).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_median`: + + >>> agnostic_median(df_pd) + a + 0 4.0 + + >>> agnostic_median(df_pl) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 4.0 │ + └─────┘ + + >>> agnostic_median(df_pa) + pyarrow.Table + a: double + ---- + a: [[4]] + """ + return Expr(lambda plx: plx.col(*columns).median()) + + +def min(*columns: str) -> Expr: + """Return the minimum value. + + Note: + Syntactic sugar for ``nw.col(columns).min()``. + + Arguments: + columns: Name(s) of the columns to use in the aggregation function. + + Returns: + A new expression. + + Examples: + >>> import polars as pl + >>> import pandas as pd + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [1, 2], "b": [5, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + Let's define a dataframe-agnostic function: + + >>> def agnostic_min(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select(nw.min("b")).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_min`: + + >>> agnostic_min(df_pd) + b + 0 5 + + >>> agnostic_min(df_pl) + shape: (1, 1) + ┌─────┐ + │ b │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 5 │ + └─────┘ + + >>> agnostic_min(df_pa) + pyarrow.Table + b: int64 + ---- + b: [[5]] + """ + return Expr(lambda plx: plx.col(*columns).min()) + + +def max(*columns: str) -> Expr: + """Return the maximum value. + + Note: + Syntactic sugar for ``nw.col(columns).max()``. + + Arguments: + columns: Name(s) of the columns to use in the aggregation function. + + Returns: + A new expression. + + Examples: + >>> import polars as pl + >>> import pandas as pd + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [1, 2], "b": [5, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + Let's define a dataframe-agnostic function: + + >>> def agnostic_max(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select(nw.max("a")).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_max`: + + >>> agnostic_max(df_pd) + a + 0 2 + + >>> agnostic_max(df_pl) + shape: (1, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 2 │ + └─────┘ + + >>> agnostic_max(df_pa) + pyarrow.Table + a: int64 + ---- + a: [[2]] + """ + return Expr(lambda plx: plx.col(*columns).max()) + + +def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: + """Sum all values horizontally across columns. + + Warning: + Unlike Polars, we support horizontal sum over numeric columns only. + + Arguments: + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [1, 2, 3], "b": [5, 10, None]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_sum_horizontal(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select(nw.sum_horizontal("a", "b")).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_sum_horizontal`: + + >>> agnostic_sum_horizontal(df_pd) + a + 0 6.0 + 1 12.0 + 2 3.0 + + >>> agnostic_sum_horizontal(df_pl) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 6 │ + │ 12 │ + │ 3 │ + └─────┘ + + >>> agnostic_sum_horizontal(df_pa) + pyarrow.Table + a: int64 + ---- + a: [[6,12,3]] + """ + if not exprs: + msg = "At least one expression must be passed to `sum_horizontal`" + raise ValueError(msg) + return Expr( + lambda plx: plx.sum_horizontal( + *[extract_compliant(plx, v) for v in flatten(exprs)] + ) + ) + + +def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: + """Get the minimum value horizontally across columns. + + Notes: + We support `min_horizontal` over numeric columns only. + + Arguments: + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "a": [1, 8, 3], + ... "b": [4, 5, None], + ... "c": ["x", "y", "z"], + ... } + + We define a dataframe-agnostic function that computes the horizontal min of "a" + and "b" columns: + + >>> def agnostic_min_horizontal(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select(nw.min_horizontal("a", "b")).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_min_horizontal`: + + >>> agnostic_min_horizontal(pd.DataFrame(data)) + a + 0 1.0 + 1 5.0 + 2 3.0 + + >>> agnostic_min_horizontal(pl.DataFrame(data)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 1 │ + │ 5 │ + │ 3 │ + └─────┘ + + >>> agnostic_min_horizontal(pa.table(data)) + pyarrow.Table + a: int64 + ---- + a: [[1,5,3]] + """ + if not exprs: + msg = "At least one expression must be passed to `min_horizontal`" + raise ValueError(msg) + return Expr( + lambda plx: plx.min_horizontal( + *[extract_compliant(plx, v) for v in flatten(exprs)] + ) + ) + + +def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: + """Get the maximum value horizontally across columns. + + Notes: + We support `max_horizontal` over numeric columns only. + + Arguments: + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "a": [1, 8, 3], + ... "b": [4, 5, None], + ... "c": ["x", "y", "z"], + ... } + + We define a dataframe-agnostic function that computes the horizontal max of "a" + and "b" columns: + + >>> def agnostic_max_horizontal(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select(nw.max_horizontal("a", "b")).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_max_horizontal`: + + >>> agnostic_max_horizontal(pd.DataFrame(data)) + a + 0 4.0 + 1 8.0 + 2 3.0 + + >>> agnostic_max_horizontal(pl.DataFrame(data)) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 4 │ + │ 8 │ + │ 3 │ + └─────┘ + + >>> agnostic_max_horizontal(pa.table(data)) + pyarrow.Table + a: int64 + ---- + a: [[4,8,3]] + """ + if not exprs: + msg = "At least one expression must be passed to `max_horizontal`" + raise ValueError(msg) + return Expr( + lambda plx: plx.max_horizontal( + *[extract_compliant(plx, v) for v in flatten(exprs)] + ) + ) + + +class When: + def __init__(self, *predicates: IntoExpr | Iterable[IntoExpr]) -> None: + self._predicates = flatten([predicates]) + if not self._predicates: + msg = "At least one predicate needs to be provided to `narwhals.when`." + raise TypeError(msg) + + def _extract_predicates(self, plx: Any) -> Any: + return [extract_compliant(plx, v) for v in self._predicates] + + def then(self, value: Any) -> Then: + return Then( + lambda plx: plx.when(*self._extract_predicates(plx)).then( + extract_compliant(plx, value) + ) + ) + + +class Then(Expr): + def otherwise(self, value: Any) -> Expr: + return Expr( + lambda plx: self._to_compliant_expr(plx).otherwise( + extract_compliant(plx, value) + ) + ) + + +def when(*predicates: IntoExpr | Iterable[IntoExpr]) -> When: + """Start a `when-then-otherwise` expression. + + Expression similar to an `if-else` statement in Python. Always initiated by a + `pl.when().then()`, and optionally followed by + chaining one or more `.when().then()` statements. + Chained when-then operations should be read as Python `if, elif, ... elif` + blocks, not as `if, if, ... if`, i.e. the first condition that evaluates to + `True` will be picked. + If none of the conditions are `True`, an optional + `.otherwise()` can be appended at the end. + If not appended, and none of the conditions are `True`, `None` will be returned. + + Arguments: + predicates: Condition(s) that must be met in order to apply the subsequent + statement. Accepts one or more boolean expressions, which are implicitly + combined with `&`. String input is parsed as a column name. + + Returns: + A "when" object, which `.then` can be called on. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [1, 2, 3], "b": [5, 10, 15]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_when_then_otherwise(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... nw.when(nw.col("a") < 3).then(5).otherwise(6).alias("a_when") + ... ).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_when_then_otherwise`: + + >>> agnostic_when_then_otherwise(df_pd) + a b a_when + 0 1 5 5 + 1 2 10 5 + 2 3 15 6 + + >>> agnostic_when_then_otherwise(df_pl) + shape: (3, 3) + ┌─────┬─────┬────────┐ + │ a ┆ b ┆ a_when │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i32 │ + ╞═════╪═════╪════════╡ + │ 1 ┆ 5 ┆ 5 │ + │ 2 ┆ 10 ┆ 5 │ + │ 3 ┆ 15 ┆ 6 │ + └─────┴─────┴────────┘ + + >>> agnostic_when_then_otherwise(df_pa) + pyarrow.Table + a: int64 + b: int64 + a_when: int64 + ---- + a: [[1,2,3]] + b: [[5,10,15]] + a_when: [[5,5,6]] + """ + return When(*predicates) + + +def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: + r"""Compute the bitwise AND horizontally across columns. + + Arguments: + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "a": [False, False, True, True, False, None], + ... "b": [False, True, True, None, None, None], + ... } + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow") + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_all_horizontal(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select("a", "b", all=nw.all_horizontal("a", "b")).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_all_horizontal`: + + >>> agnostic_all_horizontal(df_pd) + a b all + 0 False False False + 1 False True False + 2 True True True + 3 True + 4 False False + 5 + + >>> agnostic_all_horizontal(df_pl) + shape: (6, 3) + ┌───────┬───────┬───────┐ + │ a ┆ b ┆ all │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ false ┆ false ┆ false │ + │ false ┆ true ┆ false │ + │ true ┆ true ┆ true │ + │ true ┆ null ┆ null │ + │ false ┆ null ┆ false │ + │ null ┆ null ┆ null │ + └───────┴───────┴───────┘ + + >>> agnostic_all_horizontal(df_pa) + pyarrow.Table + a: bool + b: bool + all: bool + ---- + a: [[false,false,true,true,false,null]] + b: [[false,true,true,null,null,null]] + all: [[false,false,true,null,false,null]] + """ + if not exprs: + msg = "At least one expression must be passed to `all_horizontal`" + raise ValueError(msg) + return Expr( + lambda plx: plx.all_horizontal( + *[extract_compliant(plx, v) for v in flatten(exprs)] + ) + ) + + +def lit(value: Any, dtype: DType | type[DType] | None = None) -> Expr: + """Return an expression representing a literal value. + + Arguments: + value: The value to use as literal. + dtype: The data type of the literal value. If not provided, the data type will + be inferred. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [1, 2]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_lit(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns(nw.lit(3)).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_lit`: + + >>> agnostic_lit(df_pd) + a literal + 0 1 3 + 1 2 3 + + >>> agnostic_lit(df_pl) + shape: (2, 2) + ┌─────┬─────────┐ + │ a ┆ literal │ + │ --- ┆ --- │ + │ i64 ┆ i32 │ + ╞═════╪═════════╡ + │ 1 ┆ 3 │ + │ 2 ┆ 3 │ + └─────┴─────────┘ + + >>> agnostic_lit(df_pa) + pyarrow.Table + a: int64 + literal: int64 + ---- + a: [[1,2]] + literal: [[3,3]] + """ + if is_numpy_array(value): + msg = ( + "numpy arrays are not supported as literal values. " + "Consider using `with_columns` to create a new column from the array." + ) + raise ValueError(msg) + + if isinstance(value, (list, tuple)): + msg = f"Nested datatypes are not supported yet. Got {value}" + raise NotImplementedError(msg) + + return Expr(lambda plx: plx.lit(value, dtype)) + + +def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: + r"""Compute the bitwise OR horizontally across columns. + + Arguments: + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "a": [False, False, True, True, False, None], + ... "b": [False, True, True, None, None, None], + ... } + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow") + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_any_horizontal(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select("a", "b", any=nw.any_horizontal("a", "b")).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_any_horizontal`: + + >>> agnostic_any_horizontal(df_pd) + a b any + 0 False False False + 1 False True True + 2 True True True + 3 True True + 4 False + 5 + + >>> agnostic_any_horizontal(df_pl) + shape: (6, 3) + ┌───────┬───────┬───────┐ + │ a ┆ b ┆ any │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ false ┆ false ┆ false │ + │ false ┆ true ┆ true │ + │ true ┆ true ┆ true │ + │ true ┆ null ┆ true │ + │ false ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └───────┴───────┴───────┘ + + >>> agnostic_any_horizontal(df_pa) + pyarrow.Table + a: bool + b: bool + any: bool + ---- + a: [[false,false,true,true,false,null]] + b: [[false,true,true,null,null,null]] + any: [[false,true,true,true,null,null]] + """ + if not exprs: + msg = "At least one expression must be passed to `any_horizontal`" + raise ValueError(msg) + return Expr( + lambda plx: plx.any_horizontal( + *[extract_compliant(plx, v) for v in flatten(exprs)] + ) + ) + + +def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: + """Compute the mean of all values horizontally across columns. + + Arguments: + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "a": [1, 8, 3], + ... "b": [4, 5, None], + ... "c": ["x", "y", "z"], + ... } + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function that computes the horizontal mean of "a" + and "b" columns: + + >>> def agnostic_mean_horizontal(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select(nw.mean_horizontal("a", "b")).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_mean_horizontal`: + + >>> agnostic_mean_horizontal(df_pd) + a + 0 2.5 + 1 6.5 + 2 3.0 + + >>> agnostic_mean_horizontal(df_pl) + shape: (3, 1) + ┌─────┐ + │ a │ + │ --- │ + │ f64 │ + ╞═════╡ + │ 2.5 │ + │ 6.5 │ + │ 3.0 │ + └─────┘ + + >>> agnostic_mean_horizontal(df_pa) + pyarrow.Table + a: double + ---- + a: [[2.5,6.5,3]] + """ + if not exprs: + msg = "At least one expression must be passed to `mean_horizontal`" + raise ValueError(msg) + return Expr( + lambda plx: plx.mean_horizontal( + *[extract_compliant(plx, v) for v in flatten(exprs)] + ) + ) + + +def concat_str( + exprs: IntoExpr | Iterable[IntoExpr], + *more_exprs: IntoExpr, + separator: str = "", + ignore_nulls: bool = False, +) -> Expr: + r"""Horizontally concatenate columns into a single string column. + + Arguments: + exprs: Columns to concatenate into a single string column. Accepts expression + input. Strings are parsed as column names, other non-expression inputs are + parsed as literals. Non-`String` columns are cast to `String`. + *more_exprs: Additional columns to concatenate into a single string column, + specified as positional arguments. + separator: String that will be used to separate the values of each column. + ignore_nulls: Ignore null values (default is `False`). + If set to `False`, null values will be propagated and if the row contains any + null values, the output is null. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "a": [1, 2, 3], + ... "b": ["dogs", "cats", None], + ... "c": ["play", "swim", "walk"], + ... } + + We define a dataframe-agnostic function that computes the horizontal string + concatenation of different columns + + >>> def agnostic_concat_str(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select( + ... nw.concat_str( + ... [ + ... nw.col("a") * 2, + ... nw.col("b"), + ... nw.col("c"), + ... ], + ... separator=" ", + ... ).alias("full_sentence") + ... ).to_native() + + We can pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_concat_str`: + + >>> agnostic_concat_str(pd.DataFrame(data)) + full_sentence + 0 2 dogs play + 1 4 cats swim + 2 None + + >>> agnostic_concat_str(pl.DataFrame(data)) + shape: (3, 1) + ┌───────────────┐ + │ full_sentence │ + │ --- │ + │ str │ + ╞═══════════════╡ + │ 2 dogs play │ + │ 4 cats swim │ + │ null │ + └───────────────┘ + + >>> agnostic_concat_str(pa.table(data)) + pyarrow.Table + full_sentence: string + ---- + full_sentence: [["2 dogs play","4 cats swim",null]] + """ + return Expr( + lambda plx: plx.concat_str( + [extract_compliant(plx, v) for v in flatten([exprs])], + *[extract_compliant(plx, v) for v in more_exprs], + separator=separator, + ignore_nulls=ignore_nulls, + ) + ) diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index cb5d2006c..689ab53dd 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -18,9 +18,8 @@ from narwhals.dataframe import DataFrame as NwDataFrame from narwhals.dataframe import LazyFrame as NwLazyFrame from narwhals.expr import Expr as NwExpr -from narwhals.expr import Then as NwThen -from narwhals.expr import When as NwWhen -from narwhals.expr import when as nw_when +from narwhals.functions import Then as NwThen +from narwhals.functions import When as NwWhen from narwhals.functions import _from_dict_impl from narwhals.functions import _from_numpy_impl from narwhals.functions import _new_series_impl @@ -31,6 +30,7 @@ from narwhals.functions import from_arrow as nw_from_arrow from narwhals.functions import get_level from narwhals.functions import show_versions +from narwhals.functions import when as nw_when from narwhals.schema import Schema as NwSchema from narwhals.series import Series as NwSeries from narwhals.stable.v1 import dtypes From 576075fcbaa73fc3678f17b2b0e6abca5d351d8b Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 11 Jan 2025 09:08:44 +0000 Subject: [PATCH 09/10] completeness --- utils/generate_backend_completeness.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/utils/generate_backend_completeness.py b/utils/generate_backend_completeness.py index 1286f8a41..fccf54f84 100644 --- a/utils/generate_backend_completeness.py +++ b/utils/generate_backend_completeness.py @@ -28,7 +28,20 @@ class Backend(NamedTuple): type_: BackendType -MODULES = ["dataframe", "series", "expr", "expr_dt"] +MODULES = [ + "dataframe", + "series", + "expr", + "expr_dt", + "expr_cat", + "expr_str", + "expr_list", + "expr_name", + "series_dt", + "series_cat", + "series_str", + "series_list", +] BACKENDS = [ Backend(name="arrow", module="_arrow", type_=BackendType.EAGER), From 602e904cd748b7c56694e244408155f56be057b0 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 11 Jan 2025 09:14:31 +0000 Subject: [PATCH 10/10] completeness --- mkdocs.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mkdocs.yml b/mkdocs.yml index 9799b56d8..e0930ee6b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -29,8 +29,16 @@ nav: - Supported DataFrame methods: api-completeness/dataframe.md - Supported LazyFrame methods: api-completeness/lazyframe.md - Supported Expr methods: api-completeness/expr.md + - Supported Expr.cat methods: api-completeness/expr_cat.md - Supported Expr.dt methods: api-completeness/expr_dt.md + - Supported Expr.list methods: api-completeness/expr_list.md + - Supported Expr.name methods: api-completeness/expr_name.md + - Supported Expr.str methods: api-completeness/expr_str.md - Supported Series methods: api-completeness/series.md + - Supported Series.cat methods: api-completeness/series_cat.md + - Supported Series.dt methods: api-completeness/series_dt.md + - Supported Series.list methods: api-completeness/series_list.md + - Supported Series.str methods: api-completeness/series_str.md - API Reference: - api-reference/index.md - api-reference/narwhals.md