From 07402c67ce585abaaecb6ab25830a85e23d87efe Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 10 Jan 2025 15:15:46 +0000 Subject: [PATCH 1/2] chore: split namespaces out from expr and series (#1782) --- narwhals/expr.py | 2695 +-------------------------------------- narwhals/expr_cat.py | 66 + narwhals/expr_dt.py | 1415 ++++++++++++++++++++ narwhals/expr_list.py | 77 ++ narwhals/expr_name.py | 298 +++++ narwhals/expr_str.py | 891 +++++++++++++ narwhals/series.py | 2252 +------------------------------- narwhals/series_cat.py | 74 ++ narwhals/series_dt.py | 1280 +++++++++++++++++++ narwhals/series_list.py | 78 ++ narwhals/series_str.py | 866 +++++++++++++ 11 files changed, 5054 insertions(+), 4938 deletions(-) create mode 100644 narwhals/expr_cat.py create mode 100644 narwhals/expr_dt.py create mode 100644 narwhals/expr_list.py create mode 100644 narwhals/expr_name.py create mode 100644 narwhals/expr_str.py create mode 100644 narwhals/series_cat.py create mode 100644 narwhals/series_dt.py create mode 100644 narwhals/series_list.py create mode 100644 narwhals/series_str.py diff --git a/narwhals/expr.py b/narwhals/expr.py index 807a7f04b..ae450289d 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -3,15 +3,18 @@ from typing import TYPE_CHECKING from typing import Any from typing import Callable -from typing import Generic from typing import Iterable from typing import Literal from typing import Mapping from typing import Sequence -from typing import TypeVar from narwhals.dependencies import is_numpy_array from narwhals.dtypes import _validate_dtype +from narwhals.expr_cat import ExprCatNamespace +from narwhals.expr_dt import ExprDateTimeNamespace +from narwhals.expr_list import ExprListNamespace +from narwhals.expr_name import ExprNameNamespace +from narwhals.expr_str import ExprStringNamespace from narwhals.utils import _validate_rolling_arguments from narwhals.utils import flatten @@ -4232,2694 +4235,6 @@ def list(self: Self) -> ExprListNamespace[Self]: return ExprListNamespace(self) -ExprT = TypeVar("ExprT", bound=Expr) - - -class ExprCatNamespace(Generic[ExprT]): - def __init__(self: Self, expr: ExprT) -> None: - self._expr = expr - - def get_categories(self: Self) -> ExprT: - """Get unique categories from column. - - Returns: - A new expression. - - Examples: - Let's create some dataframes: - - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"fruits": ["apple", "mango", "mango"]} - >>> df_pd = pd.DataFrame(data, dtype="category") - >>> df_pl = pl.DataFrame(data, schema={"fruits": pl.Categorical}) - - We define a dataframe-agnostic function to get unique categories - from column 'fruits': - - >>> def agnostic_cat_get_categories(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... 
return df.select(nw.col("fruits").cat.get_categories()).to_native() - - We can then pass any supported library such as pandas or Polars to - `agnostic_cat_get_categories`: - - >>> agnostic_cat_get_categories(df_pd) - fruits - 0 apple - 1 mango - - >>> agnostic_cat_get_categories(df_pl) - shape: (2, 1) - ┌────────┐ - │ fruits │ - │ --- │ - │ str │ - ╞════════╡ - │ apple │ - │ mango │ - └────────┘ - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).cat.get_categories() - ) - - -class ExprStringNamespace(Generic[ExprT]): - def __init__(self: Self, expr: ExprT) -> None: - self._expr = expr - - def len_chars(self: Self) -> ExprT: - r"""Return the length of each string as the number of characters. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"words": ["foo", "Café", "345", "東京", None]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_str_len_chars(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... words_len=nw.col("words").str.len_chars() - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_str_len_chars`: - - >>> agnostic_str_len_chars(df_pd) - words words_len - 0 foo 3.0 - 1 Café 4.0 - 2 345 3.0 - 3 東京 2.0 - 4 None NaN - - >>> agnostic_str_len_chars(df_pl) - shape: (5, 2) - ┌───────┬───────────┐ - │ words ┆ words_len │ - │ --- ┆ --- │ - │ str ┆ u32 │ - ╞═══════╪═══════════╡ - │ foo ┆ 3 │ - │ Café ┆ 4 │ - │ 345 ┆ 3 │ - │ 東京 ┆ 2 │ - │ null ┆ null │ - └───────┴───────────┘ - - >>> agnostic_str_len_chars(df_pa) - pyarrow.Table - words: string - words_len: int32 - ---- - words: [["foo","Café","345","東京",null]] - words_len: [[3,4,3,2,null]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).str.len_chars() - ) - - def replace( - self, pattern: str, value: str, *, literal: bool = False, n: int = 1 - ) -> ExprT: - r"""Replace first matching regex/literal substring with a new string value. - - Arguments: - pattern: A valid regular expression pattern. - value: String that will replace the matched substring. - literal: Treat `pattern` as a literal string. - n: Number of matches to replace. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"foo": ["123abc", "abc abc123"]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_str_replace(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... df = df.with_columns(replaced=nw.col("foo").str.replace("abc", "")) - ... 
return df.to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_str_replace`: - - >>> agnostic_str_replace(df_pd) - foo replaced - 0 123abc 123 - 1 abc abc123 abc123 - - >>> agnostic_str_replace(df_pl) - shape: (2, 2) - ┌────────────┬──────────┐ - │ foo ┆ replaced │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞════════════╪══════════╡ - │ 123abc ┆ 123 │ - │ abc abc123 ┆ abc123 │ - └────────────┴──────────┘ - - >>> agnostic_str_replace(df_pa) - pyarrow.Table - foo: string - replaced: string - ---- - foo: [["123abc","abc abc123"]] - replaced: [["123"," abc123"]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).str.replace( - pattern, value, literal=literal, n=n - ) - ) - - def replace_all( - self: Self, pattern: str, value: str, *, literal: bool = False - ) -> ExprT: - r"""Replace all matching regex/literal substring with a new string value. - - Arguments: - pattern: A valid regular expression pattern. - value: String that will replace the matched substring. - literal: Treat `pattern` as a literal string. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"foo": ["123abc", "abc abc123"]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_str_replace_all(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... df = df.with_columns(replaced=nw.col("foo").str.replace_all("abc", "")) - ... return df.to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_str_replace_all`: - - >>> agnostic_str_replace_all(df_pd) - foo replaced - 0 123abc 123 - 1 abc abc123 123 - - >>> agnostic_str_replace_all(df_pl) - shape: (2, 2) - ┌────────────┬──────────┐ - │ foo ┆ replaced │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞════════════╪══════════╡ - │ 123abc ┆ 123 │ - │ abc abc123 ┆ 123 │ - └────────────┴──────────┘ - - >>> agnostic_str_replace_all(df_pa) - pyarrow.Table - foo: string - replaced: string - ---- - foo: [["123abc","abc abc123"]] - replaced: [["123"," 123"]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).str.replace_all( - pattern, value, literal=literal - ) - ) - - def strip_chars(self: Self, characters: str | None = None) -> ExprT: - r"""Remove leading and trailing characters. - - Arguments: - characters: The set of characters to be removed. All combinations of this - set of characters will be stripped from the start and end of the string. - If set to None (default), all leading and trailing whitespace is removed - instead. - - Returns: - A new expression. - - Examples: - >>> from typing import Any - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrame - >>> - >>> data = {"fruits": ["apple", "\nmango"]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_str_strip_chars(df_native: IntoFrame) -> dict[str, Any]: - ... df = nw.from_native(df_native) - ... df = df.with_columns(stripped=nw.col("fruits").str.strip_chars()) - ... 
return df.to_dict(as_series=False) - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_str_strip_chars`: - - >>> agnostic_str_strip_chars(df_pd) - {'fruits': ['apple', '\nmango'], 'stripped': ['apple', 'mango']} - - >>> agnostic_str_strip_chars(df_pl) - {'fruits': ['apple', '\nmango'], 'stripped': ['apple', 'mango']} - - >>> agnostic_str_strip_chars(df_pa) - {'fruits': ['apple', '\nmango'], 'stripped': ['apple', 'mango']} - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).str.strip_chars(characters) - ) - - def starts_with(self: Self, prefix: str) -> ExprT: - r"""Check if string values start with a substring. - - Arguments: - prefix: prefix substring - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"fruits": ["apple", "mango", None]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_str_starts_with(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... has_prefix=nw.col("fruits").str.starts_with("app") - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_str_starts_with`: - - >>> agnostic_str_starts_with(df_pd) - fruits has_prefix - 0 apple True - 1 mango False - 2 None None - - >>> agnostic_str_starts_with(df_pl) - shape: (3, 2) - ┌────────┬────────────┐ - │ fruits ┆ has_prefix │ - │ --- ┆ --- │ - │ str ┆ bool │ - ╞════════╪════════════╡ - │ apple ┆ true │ - │ mango ┆ false │ - │ null ┆ null │ - └────────┴────────────┘ - - >>> agnostic_str_starts_with(df_pa) - pyarrow.Table - fruits: string - has_prefix: bool - ---- - fruits: [["apple","mango",null]] - has_prefix: [[true,false,null]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).str.starts_with(prefix) - ) - - def ends_with(self: Self, suffix: str) -> ExprT: - r"""Check if string values end with a substring. - - Arguments: - suffix: suffix substring - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"fruits": ["apple", "mango", None]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_str_ends_with(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... has_suffix=nw.col("fruits").str.ends_with("ngo") - ... 
).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_str_ends_with`: - - >>> agnostic_str_ends_with(df_pd) - fruits has_suffix - 0 apple False - 1 mango True - 2 None None - - >>> agnostic_str_ends_with(df_pl) - shape: (3, 2) - ┌────────┬────────────┐ - │ fruits ┆ has_suffix │ - │ --- ┆ --- │ - │ str ┆ bool │ - ╞════════╪════════════╡ - │ apple ┆ false │ - │ mango ┆ true │ - │ null ┆ null │ - └────────┴────────────┘ - - >>> agnostic_str_ends_with(df_pa) - pyarrow.Table - fruits: string - has_suffix: bool - ---- - fruits: [["apple","mango",null]] - has_suffix: [[false,true,null]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).str.ends_with(suffix) - ) - - def contains(self: Self, pattern: str, *, literal: bool = False) -> ExprT: - r"""Check if string contains a substring that matches a pattern. - - Arguments: - pattern: A Character sequence or valid regular expression pattern. - literal: If True, treats the pattern as a literal string. - If False, assumes the pattern is a regular expression. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"pets": ["cat", "dog", "rabbit and parrot", "dove", None]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_str_contains(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... default_match=nw.col("pets").str.contains("parrot|Dove"), - ... case_insensitive_match=nw.col("pets").str.contains("(?i)parrot|Dove"), - ... literal_match=nw.col("pets").str.contains( - ... "parrot|Dove", literal=True - ... ), - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_str_contains`: - - >>> agnostic_str_contains(df_pd) - pets default_match case_insensitive_match literal_match - 0 cat False False False - 1 dog False False False - 2 rabbit and parrot True True False - 3 dove False True False - 4 None None None None - - >>> agnostic_str_contains(df_pl) - shape: (5, 4) - ┌───────────────────┬───────────────┬────────────────────────┬───────────────┐ - │ pets ┆ default_match ┆ case_insensitive_match ┆ literal_match │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ bool ┆ bool ┆ bool │ - ╞═══════════════════╪═══════════════╪════════════════════════╪═══════════════╡ - │ cat ┆ false ┆ false ┆ false │ - │ dog ┆ false ┆ false ┆ false │ - │ rabbit and parrot ┆ true ┆ true ┆ false │ - │ dove ┆ false ┆ true ┆ false │ - │ null ┆ null ┆ null ┆ null │ - └───────────────────┴───────────────┴────────────────────────┴───────────────┘ - - >>> agnostic_str_contains(df_pa) - pyarrow.Table - pets: string - default_match: bool - case_insensitive_match: bool - literal_match: bool - ---- - pets: [["cat","dog","rabbit and parrot","dove",null]] - default_match: [[false,false,true,false,null]] - case_insensitive_match: [[false,false,true,true,null]] - literal_match: [[false,false,false,false,null]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).str.contains( - pattern, literal=literal - ) - ) - - def slice(self: Self, offset: int, length: int | None = None) -> ExprT: - r"""Create subslices of the string values of an expression. - - Arguments: - offset: Start index. 
Negative indexing is supported. - length: Length of the slice. If set to `None` (default), the slice is taken to the - end of the string. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"s": ["pear", None, "papaya", "dragonfruit"]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_str_slice(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... s_sliced=nw.col("s").str.slice(4, length=3) - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_str_slice`: - - >>> agnostic_str_slice(df_pd) # doctest: +NORMALIZE_WHITESPACE - s s_sliced - 0 pear - 1 None None - 2 papaya ya - 3 dragonfruit onf - - >>> agnostic_str_slice(df_pl) - shape: (4, 2) - ┌─────────────┬──────────┐ - │ s ┆ s_sliced │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞═════════════╪══════════╡ - │ pear ┆ │ - │ null ┆ null │ - │ papaya ┆ ya │ - │ dragonfruit ┆ onf │ - └─────────────┴──────────┘ - - >>> agnostic_str_slice(df_pa) - pyarrow.Table - s: string - s_sliced: string - ---- - s: [["pear",null,"papaya","dragonfruit"]] - s_sliced: [["",null,"ya","onf"]] - - Using negative indexes: - - >>> def agnostic_str_slice_negative(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns(s_sliced=nw.col("s").str.slice(-3)).to_native() - - >>> agnostic_str_slice_negative(df_pd) - s s_sliced - 0 pear ear - 1 None None - 2 papaya aya - 3 dragonfruit uit - - >>> agnostic_str_slice_negative(df_pl) - shape: (4, 2) - ┌─────────────┬──────────┐ - │ s ┆ s_sliced │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞═════════════╪══════════╡ - │ pear ┆ ear │ - │ null ┆ null │ - │ papaya ┆ aya │ - │ dragonfruit ┆ uit │ - └─────────────┴──────────┘ - - >>> agnostic_str_slice_negative(df_pa) - pyarrow.Table - s: string - s_sliced: string - ---- - s: [["pear",null,"papaya","dragonfruit"]] - s_sliced: [["ear",null,"aya","uit"]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).str.slice( - offset=offset, length=length - ) - ) - - def head(self: Self, n: int = 5) -> ExprT: - r"""Take the first n elements of each string. - - Arguments: - n: Number of elements to take. Negative indexing is **not** supported. - - Returns: - A new expression. - - Notes: - If the length of the string has fewer than `n` characters, the full string is returned. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"lyrics": ["Atatata", "taata", "taatatata", "zukkyun"]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_str_head(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... lyrics_head=nw.col("lyrics").str.head() - ... 
).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_str_head`: - - >>> agnostic_str_head(df_pd) - lyrics lyrics_head - 0 Atatata Atata - 1 taata taata - 2 taatatata taata - 3 zukkyun zukky - - >>> agnostic_str_head(df_pl) - shape: (4, 2) - ┌───────────┬─────────────┐ - │ lyrics ┆ lyrics_head │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞═══════════╪═════════════╡ - │ Atatata ┆ Atata │ - │ taata ┆ taata │ - │ taatatata ┆ taata │ - │ zukkyun ┆ zukky │ - └───────────┴─────────────┘ - - >>> agnostic_str_head(df_pa) - pyarrow.Table - lyrics: string - lyrics_head: string - ---- - lyrics: [["Atatata","taata","taatatata","zukkyun"]] - lyrics_head: [["Atata","taata","taata","zukky"]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).str.slice(0, n) - ) - - def tail(self: Self, n: int = 5) -> ExprT: - r"""Take the last n elements of each string. - - Arguments: - n: Number of elements to take. Negative indexing is **not** supported. - - Returns: - A new expression. - - Notes: - If the length of the string has fewer than `n` characters, the full string is returned. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"lyrics": ["Atatata", "taata", "taatatata", "zukkyun"]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_str_tail(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... lyrics_tail=nw.col("lyrics").str.tail() - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_str_tail`: - - >>> agnostic_str_tail(df_pd) - lyrics lyrics_tail - 0 Atatata atata - 1 taata taata - 2 taatatata atata - 3 zukkyun kkyun - - >>> agnostic_str_tail(df_pl) - shape: (4, 2) - ┌───────────┬─────────────┐ - │ lyrics ┆ lyrics_tail │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞═══════════╪═════════════╡ - │ Atatata ┆ atata │ - │ taata ┆ taata │ - │ taatatata ┆ atata │ - │ zukkyun ┆ kkyun │ - └───────────┴─────────────┘ - - >>> agnostic_str_tail(df_pa) - pyarrow.Table - lyrics: string - lyrics_tail: string - ---- - lyrics: [["Atatata","taata","taatatata","zukkyun"]] - lyrics_tail: [["atata","taata","atata","kkyun"]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).str.slice( - offset=-n, length=None - ) - ) - - def to_datetime(self: Self, format: str | None = None) -> ExprT: # noqa: A002 - """Convert to Datetime dtype. - - Warning: - As different backends auto-infer format in different ways, if `format=None` - there is no guarantee that the result will be equal. - - Arguments: - format: Format to use for conversion. If set to None (default), the format is - inferred from the data. - - Returns: - A new expression. - - Notes: - pandas defaults to nanosecond time unit, Polars to microsecond. - Prior to pandas 2.0, nanoseconds were the only time unit supported - in pandas, with no ability to set any other one. The ability to - set the time unit in pandas, if the version permits, will arrive. 
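            One way to make the resulting time unit consistent across backends is to
            cast after parsing. A minimal sketch, assuming the `cast` method and the
            `nw.Datetime` dtype documented elsewhere in narwhals:

                nw.col("a").str.to_datetime(format="%Y-%m-%d").cast(nw.Datetime(time_unit="us"))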
- - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = ["2020-01-01", "2020-01-02"] - >>> df_pd = pd.DataFrame({"a": data}) - >>> df_pl = pl.DataFrame({"a": data}) - >>> df_pa = pa.table({"a": data}) - - We define a dataframe-agnostic function: - - >>> def agnostic_str_to_datetime(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select( - ... nw.col("a").str.to_datetime(format="%Y-%m-%d") - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_str_to_datetime`: - - >>> agnostic_str_to_datetime(df_pd) - a - 0 2020-01-01 - 1 2020-01-02 - - >>> agnostic_str_to_datetime(df_pl) - shape: (2, 1) - ┌─────────────────────┐ - │ a │ - │ --- │ - │ datetime[μs] │ - ╞═════════════════════╡ - │ 2020-01-01 00:00:00 │ - │ 2020-01-02 00:00:00 │ - └─────────────────────┘ - - >>> agnostic_str_to_datetime(df_pa) - pyarrow.Table - a: timestamp[us] - ---- - a: [[2020-01-01 00:00:00.000000,2020-01-02 00:00:00.000000]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).str.to_datetime(format=format) - ) - - def to_uppercase(self: Self) -> ExprT: - r"""Transform string to uppercase variant. - - Returns: - A new expression. - - Notes: - The PyArrow backend will convert 'ß' to 'ẞ' instead of 'SS'. - For more info see [the related issue](https://github.com/apache/arrow/issues/34599). - There may be other unicode-edge-case-related variations across implementations. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"fruits": ["apple", "mango", None]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_str_to_uppercase(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... upper_col=nw.col("fruits").str.to_uppercase() - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_str_to_uppercase`: - - >>> agnostic_str_to_uppercase(df_pd) - fruits upper_col - 0 apple APPLE - 1 mango MANGO - 2 None None - - >>> agnostic_str_to_uppercase(df_pl) - shape: (3, 2) - ┌────────┬───────────┐ - │ fruits ┆ upper_col │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞════════╪═══════════╡ - │ apple ┆ APPLE │ - │ mango ┆ MANGO │ - │ null ┆ null │ - └────────┴───────────┘ - - >>> agnostic_str_to_uppercase(df_pa) - pyarrow.Table - fruits: string - upper_col: string - ---- - fruits: [["apple","mango",null]] - upper_col: [["APPLE","MANGO",null]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).str.to_uppercase() - ) - - def to_lowercase(self: Self) -> ExprT: - r"""Transform string to lowercase variant. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"fruits": ["APPLE", "MANGO", None]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_str_to_lowercase(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... 
return df.with_columns( - ... lower_col=nw.col("fruits").str.to_lowercase() - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_str_to_lowercase`: - - >>> agnostic_str_to_lowercase(df_pd) - fruits lower_col - 0 APPLE apple - 1 MANGO mango - 2 None None - - >>> agnostic_str_to_lowercase(df_pl) - shape: (3, 2) - ┌────────┬───────────┐ - │ fruits ┆ lower_col │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞════════╪═══════════╡ - │ APPLE ┆ apple │ - │ MANGO ┆ mango │ - │ null ┆ null │ - └────────┴───────────┘ - - >>> agnostic_str_to_lowercase(df_pa) - pyarrow.Table - fruits: string - lower_col: string - ---- - fruits: [["APPLE","MANGO",null]] - lower_col: [["apple","mango",null]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).str.to_lowercase() - ) - - -class ExprDateTimeNamespace(Generic[ExprT]): - def __init__(self: Self, expr: ExprT) -> None: - self._expr = expr - - def date(self: Self) -> ExprT: - """Extract the date from underlying DateTime representation. - - Returns: - A new expression. - - Raises: - NotImplementedError: If pandas default backend is being used. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [datetime(2012, 1, 7, 10, 20), datetime(2023, 3, 10, 11, 32)]} - >>> df_pd = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow") - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a library agnostic function: - - >>> def agnostic_dt_date(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select(nw.col("a").dt.date()).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_date`: - - >>> agnostic_dt_date(df_pd) - a - 0 2012-01-07 - 1 2023-03-10 - - >>> agnostic_dt_date(df_pl) - shape: (2, 1) - ┌────────────┐ - │ a │ - │ --- │ - │ date │ - ╞════════════╡ - │ 2012-01-07 │ - │ 2023-03-10 │ - └────────────┘ - - >>> agnostic_dt_date(df_pa) - pyarrow.Table - a: date32[day] - ---- - a: [[2012-01-07,2023-03-10]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.date() - ) - - def year(self: Self) -> ExprT: - """Extract year from underlying DateTime representation. - - Returns the year number in the calendar date. - - Returns: - A new expression. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "datetime": [ - ... datetime(1978, 6, 1), - ... datetime(2024, 12, 13), - ... datetime(2065, 1, 1), - ... ] - ... } - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_year(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... nw.col("datetime").dt.year().alias("year") - ... 
).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_year`: - - >>> agnostic_dt_year(df_pd) - datetime year - 0 1978-06-01 1978 - 1 2024-12-13 2024 - 2 2065-01-01 2065 - - >>> agnostic_dt_year(df_pl) - shape: (3, 2) - ┌─────────────────────┬──────┐ - │ datetime ┆ year │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i32 │ - ╞═════════════════════╪══════╡ - │ 1978-06-01 00:00:00 ┆ 1978 │ - │ 2024-12-13 00:00:00 ┆ 2024 │ - │ 2065-01-01 00:00:00 ┆ 2065 │ - └─────────────────────┴──────┘ - - >>> agnostic_dt_year(df_pa) - pyarrow.Table - datetime: timestamp[us] - year: int64 - ---- - datetime: [[1978-06-01 00:00:00.000000,2024-12-13 00:00:00.000000,2065-01-01 00:00:00.000000]] - year: [[1978,2024,2065]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.year() - ) - - def month(self: Self) -> ExprT: - """Extract month from underlying DateTime representation. - - Returns the month number starting from 1. The return value ranges from 1 to 12. - - Returns: - A new expression. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "datetime": [ - ... datetime(1978, 6, 1), - ... datetime(2024, 12, 13), - ... datetime(2065, 1, 1), - ... ] - ... } - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_month(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... nw.col("datetime").dt.month().alias("month"), - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_month`: - - >>> agnostic_dt_month(df_pd) - datetime month - 0 1978-06-01 6 - 1 2024-12-13 12 - 2 2065-01-01 1 - - >>> agnostic_dt_month(df_pl) - shape: (3, 2) - ┌─────────────────────┬───────┐ - │ datetime ┆ month │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i8 │ - ╞═════════════════════╪═══════╡ - │ 1978-06-01 00:00:00 ┆ 6 │ - │ 2024-12-13 00:00:00 ┆ 12 │ - │ 2065-01-01 00:00:00 ┆ 1 │ - └─────────────────────┴───────┘ - - >>> agnostic_dt_month(df_pa) - pyarrow.Table - datetime: timestamp[us] - month: int64 - ---- - datetime: [[1978-06-01 00:00:00.000000,2024-12-13 00:00:00.000000,2065-01-01 00:00:00.000000]] - month: [[6,12,1]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.month() - ) - - def day(self: Self) -> ExprT: - """Extract day from underlying DateTime representation. - - Returns the day of month starting from 1. The return value ranges from 1 to 31. (The last day of month differs by months.) - - Returns: - A new expression. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "datetime": [ - ... datetime(1978, 6, 1), - ... datetime(2024, 12, 13), - ... datetime(2065, 1, 1), - ... ] - ... } - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_day(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... nw.col("datetime").dt.day().alias("day"), - ... 
).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_day`: - - >>> agnostic_dt_day(df_pd) - datetime day - 0 1978-06-01 1 - 1 2024-12-13 13 - 2 2065-01-01 1 - - >>> agnostic_dt_day(df_pl) - shape: (3, 2) - ┌─────────────────────┬─────┐ - │ datetime ┆ day │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i8 │ - ╞═════════════════════╪═════╡ - │ 1978-06-01 00:00:00 ┆ 1 │ - │ 2024-12-13 00:00:00 ┆ 13 │ - │ 2065-01-01 00:00:00 ┆ 1 │ - └─────────────────────┴─────┘ - - >>> agnostic_dt_day(df_pa) - pyarrow.Table - datetime: timestamp[us] - day: int64 - ---- - datetime: [[1978-06-01 00:00:00.000000,2024-12-13 00:00:00.000000,2065-01-01 00:00:00.000000]] - day: [[1,13,1]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.day() - ) - - def hour(self: Self) -> ExprT: - """Extract hour from underlying DateTime representation. - - Returns the hour number from 0 to 23. - - Returns: - A new expression. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "datetime": [ - ... datetime(1978, 1, 1, 1), - ... datetime(2024, 10, 13, 5), - ... datetime(2065, 1, 1, 10), - ... ] - ... } - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_hour(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... nw.col("datetime").dt.hour().alias("hour") - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_hour`: - - >>> agnostic_dt_hour(df_pd) - datetime hour - 0 1978-01-01 01:00:00 1 - 1 2024-10-13 05:00:00 5 - 2 2065-01-01 10:00:00 10 - - >>> agnostic_dt_hour(df_pl) - shape: (3, 2) - ┌─────────────────────┬──────┐ - │ datetime ┆ hour │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i8 │ - ╞═════════════════════╪══════╡ - │ 1978-01-01 01:00:00 ┆ 1 │ - │ 2024-10-13 05:00:00 ┆ 5 │ - │ 2065-01-01 10:00:00 ┆ 10 │ - └─────────────────────┴──────┘ - - >>> agnostic_dt_hour(df_pa) - pyarrow.Table - datetime: timestamp[us] - hour: int64 - ---- - datetime: [[1978-01-01 01:00:00.000000,2024-10-13 05:00:00.000000,2065-01-01 10:00:00.000000]] - hour: [[1,5,10]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.hour() - ) - - def minute(self: Self) -> ExprT: - """Extract minutes from underlying DateTime representation. - - Returns the minute number from 0 to 59. - - Returns: - A new expression. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "datetime": [ - ... datetime(1978, 1, 1, 1, 1), - ... datetime(2024, 10, 13, 5, 30), - ... datetime(2065, 1, 1, 10, 20), - ... ] - ... } - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_minute(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... nw.col("datetime").dt.minute().alias("minute"), - ... 
).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_minute`: - - >>> agnostic_dt_minute(df_pd) - datetime minute - 0 1978-01-01 01:01:00 1 - 1 2024-10-13 05:30:00 30 - 2 2065-01-01 10:20:00 20 - - >>> agnostic_dt_minute(df_pl) - shape: (3, 2) - ┌─────────────────────┬────────┐ - │ datetime ┆ minute │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i8 │ - ╞═════════════════════╪════════╡ - │ 1978-01-01 01:01:00 ┆ 1 │ - │ 2024-10-13 05:30:00 ┆ 30 │ - │ 2065-01-01 10:20:00 ┆ 20 │ - └─────────────────────┴────────┘ - - >>> agnostic_dt_minute(df_pa) - pyarrow.Table - datetime: timestamp[us] - minute: int64 - ---- - datetime: [[1978-01-01 01:01:00.000000,2024-10-13 05:30:00.000000,2065-01-01 10:20:00.000000]] - minute: [[1,30,20]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.minute() - ) - - def second(self: Self) -> ExprT: - """Extract seconds from underlying DateTime representation. - - Returns: - A new expression. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "datetime": [ - ... datetime(1978, 1, 1, 1, 1, 1), - ... datetime(2024, 10, 13, 5, 30, 14), - ... datetime(2065, 1, 1, 10, 20, 30), - ... ] - ... } - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_second(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... nw.col("datetime").dt.second().alias("second"), - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_second`: - - >>> agnostic_dt_second(df_pd) - datetime second - 0 1978-01-01 01:01:01 1 - 1 2024-10-13 05:30:14 14 - 2 2065-01-01 10:20:30 30 - - >>> agnostic_dt_second(df_pl) - shape: (3, 2) - ┌─────────────────────┬────────┐ - │ datetime ┆ second │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i8 │ - ╞═════════════════════╪════════╡ - │ 1978-01-01 01:01:01 ┆ 1 │ - │ 2024-10-13 05:30:14 ┆ 14 │ - │ 2065-01-01 10:20:30 ┆ 30 │ - └─────────────────────┴────────┘ - - >>> agnostic_dt_second(df_pa) - pyarrow.Table - datetime: timestamp[us] - second: int64 - ---- - datetime: [[1978-01-01 01:01:01.000000,2024-10-13 05:30:14.000000,2065-01-01 10:20:30.000000]] - second: [[1,14,30]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.second() - ) - - def millisecond(self: Self) -> ExprT: - """Extract milliseconds from underlying DateTime representation. - - Returns: - A new expression. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "datetime": [ - ... datetime(1978, 1, 1, 1, 1, 1, 0), - ... datetime(2024, 10, 13, 5, 30, 14, 505000), - ... datetime(2065, 1, 1, 10, 20, 30, 67000), - ... ] - ... } - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_millisecond(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... nw.col("datetime").dt.millisecond().alias("millisecond"), - ... 
).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_millisecond`: - - >>> agnostic_dt_millisecond(df_pd) - datetime millisecond - 0 1978-01-01 01:01:01.000 0 - 1 2024-10-13 05:30:14.505 505 - 2 2065-01-01 10:20:30.067 67 - - >>> agnostic_dt_millisecond(df_pl) - shape: (3, 2) - ┌─────────────────────────┬─────────────┐ - │ datetime ┆ millisecond │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i32 │ - ╞═════════════════════════╪═════════════╡ - │ 1978-01-01 01:01:01 ┆ 0 │ - │ 2024-10-13 05:30:14.505 ┆ 505 │ - │ 2065-01-01 10:20:30.067 ┆ 67 │ - └─────────────────────────┴─────────────┘ - - >>> agnostic_dt_millisecond(df_pa) - pyarrow.Table - datetime: timestamp[us] - millisecond: int64 - ---- - datetime: [[1978-01-01 01:01:01.000000,2024-10-13 05:30:14.505000,2065-01-01 10:20:30.067000]] - millisecond: [[0,505,67]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.millisecond() - ) - - def microsecond(self: Self) -> ExprT: - """Extract microseconds from underlying DateTime representation. - - Returns: - A new expression. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "datetime": [ - ... datetime(1978, 1, 1, 1, 1, 1, 0), - ... datetime(2024, 10, 13, 5, 30, 14, 505000), - ... datetime(2065, 1, 1, 10, 20, 30, 67000), - ... ] - ... } - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_microsecond(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... nw.col("datetime").dt.microsecond().alias("microsecond"), - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_microsecond`: - - >>> agnostic_dt_microsecond(df_pd) - datetime microsecond - 0 1978-01-01 01:01:01.000 0 - 1 2024-10-13 05:30:14.505 505000 - 2 2065-01-01 10:20:30.067 67000 - - >>> agnostic_dt_microsecond(df_pl) - shape: (3, 2) - ┌─────────────────────────┬─────────────┐ - │ datetime ┆ microsecond │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i32 │ - ╞═════════════════════════╪═════════════╡ - │ 1978-01-01 01:01:01 ┆ 0 │ - │ 2024-10-13 05:30:14.505 ┆ 505000 │ - │ 2065-01-01 10:20:30.067 ┆ 67000 │ - └─────────────────────────┴─────────────┘ - - >>> agnostic_dt_microsecond(df_pa) - pyarrow.Table - datetime: timestamp[us] - microsecond: int64 - ---- - datetime: [[1978-01-01 01:01:01.000000,2024-10-13 05:30:14.505000,2065-01-01 10:20:30.067000]] - microsecond: [[0,505000,67000]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.microsecond() - ) - - def nanosecond(self: Self) -> ExprT: - """Extract Nanoseconds from underlying DateTime representation. - - Returns: - A new expression. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "datetime": [ - ... datetime(1978, 1, 1, 1, 1, 1, 0), - ... datetime(2024, 10, 13, 5, 30, 14, 500000), - ... datetime(2065, 1, 1, 10, 20, 30, 60000), - ... ] - ... 
} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_nanosecond(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... nw.col("datetime").dt.nanosecond().alias("nanosecond"), - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_nanosecond`: - - >>> agnostic_dt_nanosecond(df_pd) - datetime nanosecond - 0 1978-01-01 01:01:01.000 0 - 1 2024-10-13 05:30:14.500 500000000 - 2 2065-01-01 10:20:30.060 60000000 - - >>> agnostic_dt_nanosecond(df_pl) - shape: (3, 2) - ┌─────────────────────────┬────────────┐ - │ datetime ┆ nanosecond │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i32 │ - ╞═════════════════════════╪════════════╡ - │ 1978-01-01 01:01:01 ┆ 0 │ - │ 2024-10-13 05:30:14.500 ┆ 500000000 │ - │ 2065-01-01 10:20:30.060 ┆ 60000000 │ - └─────────────────────────┴────────────┘ - - >>> agnostic_dt_nanosecond(df_pa) - pyarrow.Table - datetime: timestamp[us] - nanosecond: int64 - ---- - datetime: [[1978-01-01 01:01:01.000000,2024-10-13 05:30:14.500000,2065-01-01 10:20:30.060000]] - nanosecond: [[0,500000000,60000000]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.nanosecond() - ) - - def ordinal_day(self: Self) -> ExprT: - """Get ordinal day. - - Returns: - A new expression. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [datetime(2020, 1, 1), datetime(2020, 8, 3)]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_ordinal_day(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... a_ordinal_day=nw.col("a").dt.ordinal_day() - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_ordinal_day`: - - >>> agnostic_dt_ordinal_day(df_pd) - a a_ordinal_day - 0 2020-01-01 1 - 1 2020-08-03 216 - - >>> agnostic_dt_ordinal_day(df_pl) - shape: (2, 2) - ┌─────────────────────┬───────────────┐ - │ a ┆ a_ordinal_day │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i16 │ - ╞═════════════════════╪═══════════════╡ - │ 2020-01-01 00:00:00 ┆ 1 │ - │ 2020-08-03 00:00:00 ┆ 216 │ - └─────────────────────┴───────────────┘ - - >>> agnostic_dt_ordinal_day(df_pa) - pyarrow.Table - a: timestamp[us] - a_ordinal_day: int64 - ---- - a: [[2020-01-01 00:00:00.000000,2020-08-03 00:00:00.000000]] - a_ordinal_day: [[1,216]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.ordinal_day() - ) - - def weekday(self: Self) -> ExprT: - """Extract the week day from the underlying Date representation. 
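        For example, `datetime(2020, 8, 3)` falls on a Monday and therefore maps to 1,
        as shown in the examples below.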
- - Returns: - Returns the ISO weekday number where monday = 1 and sunday = 7 - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [datetime(2020, 1, 1), datetime(2020, 8, 3)]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_weekday(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns(a_weekday=nw.col("a").dt.weekday()).to_native() - - We can then pass either pandas, Polars, PyArrow, and other supported libraries to - `agnostic_dt_weekday`: - - >>> agnostic_dt_weekday(df_pd) - a a_weekday - 0 2020-01-01 3 - 1 2020-08-03 1 - - >>> agnostic_dt_weekday(df_pl) - shape: (2, 2) - ┌─────────────────────┬───────────┐ - │ a ┆ a_weekday │ - │ --- ┆ --- │ - │ datetime[μs] ┆ i8 │ - ╞═════════════════════╪═══════════╡ - │ 2020-01-01 00:00:00 ┆ 3 │ - │ 2020-08-03 00:00:00 ┆ 1 │ - └─────────────────────┴───────────┘ - - >>> agnostic_dt_weekday(df_pa) - pyarrow.Table - a: timestamp[us] - a_weekday: int64 - ---- - a: [[2020-01-01 00:00:00.000000,2020-08-03 00:00:00.000000]] - a_weekday: [[3,1]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.weekday() - ) - - def total_minutes(self: Self) -> ExprT: - """Get total minutes. - - Returns: - A new expression. - - Notes: - The function outputs the total minutes in the int dtype by default, - however, pandas may change the dtype to float when there are missing values, - consider using `fill_null()` and `cast` in this case. - - Examples: - >>> from datetime import timedelta - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [timedelta(minutes=10), timedelta(minutes=20, seconds=40)]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_total_minutes(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... a_total_minutes=nw.col("a").dt.total_minutes() - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_total_minutes`: - - >>> agnostic_dt_total_minutes(df_pd) - a a_total_minutes - 0 0 days 00:10:00 10 - 1 0 days 00:20:40 20 - - >>> agnostic_dt_total_minutes(df_pl) - shape: (2, 2) - ┌──────────────┬─────────────────┐ - │ a ┆ a_total_minutes │ - │ --- ┆ --- │ - │ duration[μs] ┆ i64 │ - ╞══════════════╪═════════════════╡ - │ 10m ┆ 10 │ - │ 20m 40s ┆ 20 │ - └──────────────┴─────────────────┘ - - >>> agnostic_dt_total_minutes(df_pa) - pyarrow.Table - a: duration[us] - a_total_minutes: int64 - ---- - a: [[600000000,1240000000]] - a_total_minutes: [[10,20]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.total_minutes() - ) - - def total_seconds(self: Self) -> ExprT: - """Get total seconds. - - Returns: - A new expression. - - Notes: - The function outputs the total seconds in the int dtype by default, - however, pandas may change the dtype to float when there are missing values, - consider using `fill_null()` and `cast` in this case. 
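            A minimal sketch of that workaround, assuming a fill value of 0 is
            acceptable for the application:

                nw.col("a").dt.total_seconds().fill_null(0).cast(nw.Int64)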
- - Examples: - >>> from datetime import timedelta - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [timedelta(seconds=10), timedelta(seconds=20, milliseconds=40)]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_total_seconds(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... a_total_seconds=nw.col("a").dt.total_seconds() - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_total_seconds`: - - >>> agnostic_dt_total_seconds(df_pd) - a a_total_seconds - 0 0 days 00:00:10 10 - 1 0 days 00:00:20.040000 20 - - >>> agnostic_dt_total_seconds(df_pl) - shape: (2, 2) - ┌──────────────┬─────────────────┐ - │ a ┆ a_total_seconds │ - │ --- ┆ --- │ - │ duration[μs] ┆ i64 │ - ╞══════════════╪═════════════════╡ - │ 10s ┆ 10 │ - │ 20s 40ms ┆ 20 │ - └──────────────┴─────────────────┘ - - >>> agnostic_dt_total_seconds(df_pa) - pyarrow.Table - a: duration[us] - a_total_seconds: int64 - ---- - a: [[10000000,20040000]] - a_total_seconds: [[10,20]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.total_seconds() - ) - - def total_milliseconds(self: Self) -> ExprT: - """Get total milliseconds. - - Returns: - A new expression. - - Notes: - The function outputs the total milliseconds in the int dtype by default, - however, pandas may change the dtype to float when there are missing values, - consider using `fill_null()` and `cast` in this case. - - Examples: - >>> from datetime import timedelta - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "a": [ - ... timedelta(milliseconds=10), - ... timedelta(milliseconds=20, microseconds=40), - ... ] - ... } - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_total_milliseconds(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... a_total_milliseconds=nw.col("a").dt.total_milliseconds() - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_total_milliseconds`: - - >>> agnostic_dt_total_milliseconds(df_pd) - a a_total_milliseconds - 0 0 days 00:00:00.010000 10 - 1 0 days 00:00:00.020040 20 - - >>> agnostic_dt_total_milliseconds(df_pl) - shape: (2, 2) - ┌──────────────┬──────────────────────┐ - │ a ┆ a_total_milliseconds │ - │ --- ┆ --- │ - │ duration[μs] ┆ i64 │ - ╞══════════════╪══════════════════════╡ - │ 10ms ┆ 10 │ - │ 20040µs ┆ 20 │ - └──────────────┴──────────────────────┘ - - >>> agnostic_dt_total_milliseconds(df_pa) - pyarrow.Table - a: duration[us] - a_total_milliseconds: int64 - ---- - a: [[10000,20040]] - a_total_milliseconds: [[10,20]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.total_milliseconds() - ) - - def total_microseconds(self: Self) -> ExprT: - """Get total microseconds. - - Returns: - A new expression. 
- - Notes: - The function outputs the total microseconds in the int dtype by default, - however, pandas may change the dtype to float when there are missing values, - consider using `fill_null()` and `cast` in this case. - - Examples: - >>> from datetime import timedelta - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "a": [ - ... timedelta(microseconds=10), - ... timedelta(milliseconds=1, microseconds=200), - ... ] - ... } - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_total_microseconds(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... a_total_microseconds=nw.col("a").dt.total_microseconds() - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_total_microseconds`: - - >>> agnostic_dt_total_microseconds(df_pd) - a a_total_microseconds - 0 0 days 00:00:00.000010 10 - 1 0 days 00:00:00.001200 1200 - - >>> agnostic_dt_total_microseconds(df_pl) - shape: (2, 2) - ┌──────────────┬──────────────────────┐ - │ a ┆ a_total_microseconds │ - │ --- ┆ --- │ - │ duration[μs] ┆ i64 │ - ╞══════════════╪══════════════════════╡ - │ 10µs ┆ 10 │ - │ 1200µs ┆ 1200 │ - └──────────────┴──────────────────────┘ - - >>> agnostic_dt_total_microseconds(df_pa) - pyarrow.Table - a: duration[us] - a_total_microseconds: int64 - ---- - a: [[10,1200]] - a_total_microseconds: [[10,1200]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.total_microseconds() - ) - - def total_nanoseconds(self: Self) -> ExprT: - """Get total nanoseconds. - - Returns: - A new expression. - - Notes: - The function outputs the total nanoseconds in the int dtype by default, - however, pandas may change the dtype to float when there are missing values, - consider using `fill_null()` and `cast` in this case. - - Examples: - >>> from datetime import timedelta - >>> import pandas as pd - >>> import polars as pl - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = ["2024-01-01 00:00:00.000000001", "2024-01-01 00:00:00.000000002"] - >>> df_pd = pd.DataFrame({"a": pd.to_datetime(data)}) - >>> df_pl = pl.DataFrame({"a": data}).with_columns( - ... pl.col("a").str.to_datetime(time_unit="ns") - ... ) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_total_nanoseconds(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... a_diff_total_nanoseconds=nw.col("a").diff().dt.total_nanoseconds() - ... 
).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_total_nanoseconds`: - - >>> agnostic_dt_total_nanoseconds(df_pd) - a a_diff_total_nanoseconds - 0 2024-01-01 00:00:00.000000001 NaN - 1 2024-01-01 00:00:00.000000002 1.0 - - >>> agnostic_dt_total_nanoseconds(df_pl) - shape: (2, 2) - ┌───────────────────────────────┬──────────────────────────┐ - │ a ┆ a_diff_total_nanoseconds │ - │ --- ┆ --- │ - │ datetime[ns] ┆ i64 │ - ╞═══════════════════════════════╪══════════════════════════╡ - │ 2024-01-01 00:00:00.000000001 ┆ null │ - │ 2024-01-01 00:00:00.000000002 ┆ 1 │ - └───────────────────────────────┴──────────────────────────┘ - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.total_nanoseconds() - ) - - def to_string(self: Self, format: str) -> ExprT: # noqa: A002 - """Convert a Date/Time/Datetime column into a String column with the given format. - - Arguments: - format: Format to format temporal column with. - - Returns: - A new expression. - - Notes: - Unfortunately, different libraries interpret format directives a bit - differently. - - - Chrono, the library used by Polars, uses `"%.f"` for fractional seconds, - whereas pandas and Python stdlib use `".%f"`. - - PyArrow interprets `"%S"` as "seconds, including fractional seconds" - whereas most other tools interpret it as "just seconds, as 2 digits". - - Therefore, we make the following adjustments: - - - for pandas-like libraries, we replace `"%S.%f"` with `"%S%.f"`. - - for PyArrow, we replace `"%S.%f"` with `"%S"`. - - Workarounds like these don't make us happy, and we try to avoid them as - much as possible, but here we feel like it's the best compromise. - - If you just want to format a date/datetime Series as a local datetime - string, and have it work as consistently as possible across libraries, - we suggest using: - - - `"%Y-%m-%dT%H:%M:%S%.f"` for datetimes - - `"%Y-%m-%d"` for dates - - though note that, even then, different tools may return a different number - of trailing zeros. Nonetheless, this is probably consistent enough for - most applications. - - If you have an application where this is not enough, please open an issue - and let us know. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "a": [ - ... datetime(2020, 3, 1), - ... datetime(2020, 4, 1), - ... datetime(2020, 5, 1), - ... ] - ... } - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_dt_to_string(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select( - ... nw.col("a").dt.to_string("%Y/%m/%d %H:%M:%S") - ... 
).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_to_string`: - - >>> agnostic_dt_to_string(df_pd) - a - 0 2020/03/01 00:00:00 - 1 2020/04/01 00:00:00 - 2 2020/05/01 00:00:00 - - >>> agnostic_dt_to_string(df_pl) - shape: (3, 1) - ┌─────────────────────┐ - │ a │ - │ --- │ - │ str │ - ╞═════════════════════╡ - │ 2020/03/01 00:00:00 │ - │ 2020/04/01 00:00:00 │ - │ 2020/05/01 00:00:00 │ - └─────────────────────┘ - - >>> agnostic_dt_to_string(df_pa) - pyarrow.Table - a: string - ---- - a: [["2020/03/01 00:00:00.000000","2020/04/01 00:00:00.000000","2020/05/01 00:00:00.000000"]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.to_string(format) - ) - - def replace_time_zone(self: Self, time_zone: str | None) -> ExprT: - """Replace time zone. - - Arguments: - time_zone: Target time zone. - - Returns: - A new expression. - - Examples: - >>> from datetime import datetime, timezone - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "a": [ - ... datetime(2024, 1, 1, tzinfo=timezone.utc), - ... datetime(2024, 1, 2, tzinfo=timezone.utc), - ... ] - ... } - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - Let's define a dataframe-agnostic function: - - >>> def agnostic_dt_replace_time_zone(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select( - ... nw.col("a").dt.replace_time_zone("Asia/Kathmandu") - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_replace_time_zone`: - - >>> agnostic_dt_replace_time_zone(df_pd) - a - 0 2024-01-01 00:00:00+05:45 - 1 2024-01-02 00:00:00+05:45 - - >>> agnostic_dt_replace_time_zone(df_pl) - shape: (2, 1) - ┌──────────────────────────────┐ - │ a │ - │ --- │ - │ datetime[μs, Asia/Kathmandu] │ - ╞══════════════════════════════╡ - │ 2024-01-01 00:00:00 +0545 │ - │ 2024-01-02 00:00:00 +0545 │ - └──────────────────────────────┘ - - >>> agnostic_dt_replace_time_zone(df_pa) - pyarrow.Table - a: timestamp[us, tz=Asia/Kathmandu] - ---- - a: [[2023-12-31 18:15:00.000000Z,2024-01-01 18:15:00.000000Z]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.replace_time_zone(time_zone) - ) - - def convert_time_zone(self: Self, time_zone: str) -> ExprT: - """Convert to a new time zone. - - If converting from a time-zone-naive column, then conversion happens - as if converting from UTC. - - Arguments: - time_zone: Target time zone. - - Returns: - A new expression. - - Examples: - >>> from datetime import datetime, timezone - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = { - ... "a": [ - ... datetime(2024, 1, 1, tzinfo=timezone.utc), - ... datetime(2024, 1, 2, tzinfo=timezone.utc), - ... ] - ... } - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - Let's define a dataframe-agnostic function: - - >>> def agnostic_dt_convert_time_zone(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.select( - ... nw.col("a").dt.convert_time_zone("Asia/Kathmandu") - ... 
).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_convert_time_zone`: - - >>> agnostic_dt_convert_time_zone(df_pd) - a - 0 2024-01-01 05:45:00+05:45 - 1 2024-01-02 05:45:00+05:45 - - >>> agnostic_dt_convert_time_zone(df_pl) - shape: (2, 1) - ┌──────────────────────────────┐ - │ a │ - │ --- │ - │ datetime[μs, Asia/Kathmandu] │ - ╞══════════════════════════════╡ - │ 2024-01-01 05:45:00 +0545 │ - │ 2024-01-02 05:45:00 +0545 │ - └──────────────────────────────┘ - - >>> agnostic_dt_convert_time_zone(df_pa) - pyarrow.Table - a: timestamp[us, tz=Asia/Kathmandu] - ---- - a: [[2024-01-01 00:00:00.000000Z,2024-01-02 00:00:00.000000Z]] - """ - if time_zone is None: - msg = "Target `time_zone` cannot be `None` in `convert_time_zone`. Please use `replace_time_zone(None)` if you want to remove the time zone." - raise TypeError(msg) - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.convert_time_zone(time_zone) - ) - - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ExprT: - """Return a timestamp in the given time unit. - - Arguments: - time_unit: {'ns', 'us', 'ms'} - Time unit. - - Returns: - A new expression. - - Examples: - >>> from datetime import date - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"date": [date(2001, 1, 1), None, date(2001, 1, 3)]} - >>> df_pd = pd.DataFrame(data, dtype="datetime64[ns]") - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - Let's define a dataframe-agnostic function: - - >>> def agnostic_dt_timestamp(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... nw.col("date").dt.timestamp().alias("timestamp_us"), - ... nw.col("date").dt.timestamp("ms").alias("timestamp_ms"), - ... ).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_dt_timestamp`: - - >>> agnostic_dt_timestamp(df_pd) - date timestamp_us timestamp_ms - 0 2001-01-01 9.783072e+14 9.783072e+11 - 1 NaT NaN NaN - 2 2001-01-03 9.784800e+14 9.784800e+11 - - >>> agnostic_dt_timestamp(df_pl) - shape: (3, 3) - ┌────────────┬─────────────────┬──────────────┐ - │ date ┆ timestamp_us ┆ timestamp_ms │ - │ --- ┆ --- ┆ --- │ - │ date ┆ i64 ┆ i64 │ - ╞════════════╪═════════════════╪══════════════╡ - │ 2001-01-01 ┆ 978307200000000 ┆ 978307200000 │ - │ null ┆ null ┆ null │ - │ 2001-01-03 ┆ 978480000000000 ┆ 978480000000 │ - └────────────┴─────────────────┴──────────────┘ - - >>> agnostic_dt_timestamp(df_pa) - pyarrow.Table - date: date32[day] - timestamp_us: int64 - timestamp_ms: int64 - ---- - date: [[2001-01-01,null,2001-01-03]] - timestamp_us: [[978307200000000,null,978480000000000]] - timestamp_ms: [[978307200000,null,978480000000]] - """ - if time_unit not in {"ns", "us", "ms"}: - msg = ( - "invalid `time_unit`" - f"\n\nExpected one of {{'ns', 'us', 'ms'}}, got {time_unit!r}." - ) - raise ValueError(msg) - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).dt.timestamp(time_unit) - ) - - -class ExprNameNamespace(Generic[ExprT]): - def __init__(self: Self, expr: ExprT) -> None: - self._expr = expr - - def keep(self: Self) -> ExprT: - r"""Keep the original root name of the expression. - - Returns: - A new expression. - - Notes: - This will undo any previous renaming operations on the expression. 
- Due to implementation constraints, this method can only be called as the last - expression in a chain. Only one name operation per expression will work. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrame - >>> - >>> data = {"foo": [1, 2], "BAR": [4, 5]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_name_keep(df_native: IntoFrame) -> list[str]: - ... df = nw.from_native(df_native) - ... return df.select(nw.col("foo").alias("alias_for_foo").name.keep()).columns - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_name_keep`: - - >>> agnostic_name_keep(df_pd) - ['foo'] - - >>> agnostic_name_keep(df_pl) - ['foo'] - - >>> agnostic_name_keep(df_pa) - ['foo'] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).name.keep() - ) - - def map(self: Self, function: Callable[[str], str]) -> ExprT: - r"""Rename the output of an expression by mapping a function over the root name. - - Arguments: - function: Function that maps a root name to a new name. - - Returns: - A new expression. - - Notes: - This will undo any previous renaming operations on the expression. - Due to implementation constraints, this method can only be called as the last - expression in a chain. Only one name operation per expression will work. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrame - >>> - >>> data = {"foo": [1, 2], "BAR": [4, 5]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> renaming_func = lambda s: s[::-1] # reverse column name - >>> def agnostic_name_map(df_native: IntoFrame) -> list[str]: - ... df = nw.from_native(df_native) - ... return df.select(nw.col("foo", "BAR").name.map(renaming_func)).columns - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_name_map`: - - >>> agnostic_name_map(df_pd) - ['oof', 'RAB'] - - >>> agnostic_name_map(df_pl) - ['oof', 'RAB'] - - >>> agnostic_name_map(df_pa) - ['oof', 'RAB'] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).name.map(function) - ) - - def prefix(self: Self, prefix: str) -> ExprT: - r"""Add a prefix to the root column name of the expression. - - Arguments: - prefix: Prefix to add to the root column name. - - Returns: - A new expression. - - Notes: - This will undo any previous renaming operations on the expression. - Due to implementation constraints, this method can only be called as the last - expression in a chain. Only one name operation per expression will work. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrame - >>> - >>> data = {"foo": [1, 2], "BAR": [4, 5]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_name_prefix(df_native: IntoFrame, prefix: str) -> list[str]: - ... df = nw.from_native(df_native) - ... 
return df.select(nw.col("foo", "BAR").name.prefix(prefix)).columns - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_name_prefix`: - - >>> agnostic_name_prefix(df_pd, "with_prefix_") - ['with_prefix_foo', 'with_prefix_BAR'] - - >>> agnostic_name_prefix(df_pl, "with_prefix_") - ['with_prefix_foo', 'with_prefix_BAR'] - - >>> agnostic_name_prefix(df_pa, "with_prefix_") - ['with_prefix_foo', 'with_prefix_BAR'] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).name.prefix(prefix) - ) - - def suffix(self: Self, suffix: str) -> ExprT: - r"""Add a suffix to the root column name of the expression. - - Arguments: - suffix: Suffix to add to the root column name. - - Returns: - A new expression. - - Notes: - This will undo any previous renaming operations on the expression. - Due to implementation constraints, this method can only be called as the last - expression in a chain. Only one name operation per expression will work. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrame - >>> - >>> data = {"foo": [1, 2], "BAR": [4, 5]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_name_suffix(df_native: IntoFrame, suffix: str) -> list[str]: - ... df = nw.from_native(df_native) - ... return df.select(nw.col("foo", "BAR").name.suffix(suffix)).columns - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_name_suffix`: - - >>> agnostic_name_suffix(df_pd, "_with_suffix") - ['foo_with_suffix', 'BAR_with_suffix'] - - >>> agnostic_name_suffix(df_pl, "_with_suffix") - ['foo_with_suffix', 'BAR_with_suffix'] - - >>> agnostic_name_suffix(df_pa, "_with_suffix") - ['foo_with_suffix', 'BAR_with_suffix'] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).name.suffix(suffix) - ) - - def to_lowercase(self: Self) -> ExprT: - r"""Make the root column name lowercase. - - Returns: - A new expression. - - Notes: - This will undo any previous renaming operations on the expression. - Due to implementation constraints, this method can only be called as the last - expression in a chain. Only one name operation per expression will work. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrame - >>> - >>> data = {"foo": [1, 2], "BAR": [4, 5]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_name_to_lowercase(df_native: IntoFrame) -> list[str]: - ... df = nw.from_native(df_native) - ... return df.select(nw.col("foo", "BAR").name.to_lowercase()).columns - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_name_to_lowercase`: - - >>> agnostic_name_to_lowercase(df_pd) - ['foo', 'bar'] - - >>> agnostic_name_to_lowercase(df_pl) - ['foo', 'bar'] - - >>> agnostic_name_to_lowercase(df_pa) - ['foo', 'bar'] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).name.to_lowercase() - ) - - def to_uppercase(self: Self) -> ExprT: - r"""Make the root column name uppercase. - - Returns: - A new expression. - - Notes: - This will undo any previous renaming operations on the expression. 
- Due to implementation constraints, this method can only be called as the last - expression in a chain. Only one name operation per expression will work. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrame - >>> - >>> data = {"foo": [1, 2], "BAR": [4, 5]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a dataframe-agnostic function: - - >>> def agnostic_name_to_uppercase(df_native: IntoFrame) -> list[str]: - ... df = nw.from_native(df_native) - ... return df.select(nw.col("foo", "BAR").name.to_uppercase()).columns - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_name_to_uppercase`: - - >>> agnostic_name_to_uppercase(df_pd) - ['FOO', 'BAR'] - - >>> agnostic_name_to_uppercase(df_pl) - ['FOO', 'BAR'] - - >>> agnostic_name_to_uppercase(df_pa) - ['FOO', 'BAR'] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).name.to_uppercase() - ) - - -class ExprListNamespace(Generic[ExprT]): - def __init__(self: Self, expr: ExprT) -> None: - self._expr = expr - - def len(self: Self) -> ExprT: - """Return the number of elements in each list. - - Null values count towards the total. - - Returns: - A new expression. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> - >>> data = {"a": [[1, 2], [3, 4, None], None, []]} - - Let's define a dataframe-agnostic function: - - >>> def agnostic_list_len(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns(a_len=nw.col("a").list.len()).to_native() - - We can then pass pandas / PyArrow / Polars / any other supported library: - - >>> agnostic_list_len( - ... pd.DataFrame(data).astype({"a": pd.ArrowDtype(pa.list_(pa.int64()))}) - ... ) # doctest: +SKIP - a a_len - 0 [1. 2.] 2 - 1 [ 3. 4. nan] 3 - 2 - 3 [] 0 - - >>> agnostic_list_len(pl.DataFrame(data)) - shape: (4, 2) - ┌──────────────┬───────┐ - │ a ┆ a_len │ - │ --- ┆ --- │ - │ list[i64] ┆ u32 │ - ╞══════════════╪═══════╡ - │ [1, 2] ┆ 2 │ - │ [3, 4, null] ┆ 3 │ - │ null ┆ null │ - │ [] ┆ 0 │ - └──────────────┴───────┘ - - >>> agnostic_list_len(pa.table(data)) - pyarrow.Table - a: list - child 0, item: int64 - a_len: uint32 - ---- - a: [[[1,2],[3,4,null],null,[]]] - a_len: [[2,3,null,0]] - """ - return self._expr.__class__( - lambda plx: self._expr._to_compliant_expr(plx).list.len() - ) - - def col(*names: str | Iterable[str]) -> Expr: """Creates an expression that references one or more columns by their name(s). diff --git a/narwhals/expr_cat.py b/narwhals/expr_cat.py new file mode 100644 index 000000000..ada8e3a45 --- /dev/null +++ b/narwhals/expr_cat.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Generic +from typing import TypeVar + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals.expr import Expr + +ExprT = TypeVar("ExprT", bound="Expr") + + +class ExprCatNamespace(Generic[ExprT]): + def __init__(self: Self, expr: ExprT) -> None: + self._expr = expr + + def get_categories(self: Self) -> ExprT: + """Get unique categories from column. + + Returns: + A new expression. 
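+
+        Notes:
+            The order in which the categories are returned may differ between
+            backends, so it is safest not to rely on it.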
+ + Examples: + Let's create some dataframes: + + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"fruits": ["apple", "mango", "mango"]} + >>> df_pd = pd.DataFrame(data, dtype="category") + >>> df_pl = pl.DataFrame(data, schema={"fruits": pl.Categorical}) + + We define a dataframe-agnostic function to get unique categories + from column 'fruits': + + >>> def agnostic_cat_get_categories(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select(nw.col("fruits").cat.get_categories()).to_native() + + We can then pass any supported library such as pandas or Polars to + `agnostic_cat_get_categories`: + + >>> agnostic_cat_get_categories(df_pd) + fruits + 0 apple + 1 mango + + >>> agnostic_cat_get_categories(df_pl) + shape: (2, 1) + ┌────────┐ + │ fruits │ + │ --- │ + │ str │ + ╞════════╡ + │ apple │ + │ mango │ + └────────┘ + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).cat.get_categories() + ) diff --git a/narwhals/expr_dt.py b/narwhals/expr_dt.py new file mode 100644 index 000000000..d0676dd9b --- /dev/null +++ b/narwhals/expr_dt.py @@ -0,0 +1,1415 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Generic +from typing import Literal +from typing import TypeVar + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals.expr import Expr + +ExprT = TypeVar("ExprT", bound="Expr") + + +class ExprDateTimeNamespace(Generic[ExprT]): + def __init__(self: Self, expr: ExprT) -> None: + self._expr = expr + + def date(self: Self) -> ExprT: + """Extract the date from underlying DateTime representation. + + Returns: + A new expression. + + Raises: + NotImplementedError: If pandas default backend is being used. + + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [datetime(2012, 1, 7, 10, 20), datetime(2023, 3, 10, 11, 32)]} + >>> df_pd = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow") + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a library agnostic function: + + >>> def agnostic_dt_date(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select(nw.col("a").dt.date()).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_date`: + + >>> agnostic_dt_date(df_pd) + a + 0 2012-01-07 + 1 2023-03-10 + + >>> agnostic_dt_date(df_pl) + shape: (2, 1) + ┌────────────┐ + │ a │ + │ --- │ + │ date │ + ╞════════════╡ + │ 2012-01-07 │ + │ 2023-03-10 │ + └────────────┘ + + >>> agnostic_dt_date(df_pa) + pyarrow.Table + a: date32[day] + ---- + a: [[2012-01-07,2023-03-10]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.date() + ) + + def year(self: Self) -> ExprT: + """Extract year from underlying DateTime representation. + + Returns the year number in the calendar date. + + Returns: + A new expression. + + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "datetime": [ + ... datetime(1978, 6, 1), + ... datetime(2024, 12, 13), + ... 
datetime(2065, 1, 1), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_dt_year(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... nw.col("datetime").dt.year().alias("year") + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_year`: + + >>> agnostic_dt_year(df_pd) + datetime year + 0 1978-06-01 1978 + 1 2024-12-13 2024 + 2 2065-01-01 2065 + + >>> agnostic_dt_year(df_pl) + shape: (3, 2) + ┌─────────────────────┬──────┐ + │ datetime ┆ year │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i32 │ + ╞═════════════════════╪══════╡ + │ 1978-06-01 00:00:00 ┆ 1978 │ + │ 2024-12-13 00:00:00 ┆ 2024 │ + │ 2065-01-01 00:00:00 ┆ 2065 │ + └─────────────────────┴──────┘ + + >>> agnostic_dt_year(df_pa) + pyarrow.Table + datetime: timestamp[us] + year: int64 + ---- + datetime: [[1978-06-01 00:00:00.000000,2024-12-13 00:00:00.000000,2065-01-01 00:00:00.000000]] + year: [[1978,2024,2065]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.year() + ) + + def month(self: Self) -> ExprT: + """Extract month from underlying DateTime representation. + + Returns the month number starting from 1. The return value ranges from 1 to 12. + + Returns: + A new expression. + + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "datetime": [ + ... datetime(1978, 6, 1), + ... datetime(2024, 12, 13), + ... datetime(2065, 1, 1), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_dt_month(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... nw.col("datetime").dt.month().alias("month"), + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_month`: + + >>> agnostic_dt_month(df_pd) + datetime month + 0 1978-06-01 6 + 1 2024-12-13 12 + 2 2065-01-01 1 + + >>> agnostic_dt_month(df_pl) + shape: (3, 2) + ┌─────────────────────┬───────┐ + │ datetime ┆ month │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i8 │ + ╞═════════════════════╪═══════╡ + │ 1978-06-01 00:00:00 ┆ 6 │ + │ 2024-12-13 00:00:00 ┆ 12 │ + │ 2065-01-01 00:00:00 ┆ 1 │ + └─────────────────────┴───────┘ + + >>> agnostic_dt_month(df_pa) + pyarrow.Table + datetime: timestamp[us] + month: int64 + ---- + datetime: [[1978-06-01 00:00:00.000000,2024-12-13 00:00:00.000000,2065-01-01 00:00:00.000000]] + month: [[6,12,1]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.month() + ) + + def day(self: Self) -> ExprT: + """Extract day from underlying DateTime representation. + + Returns the day of month starting from 1. The return value ranges from 1 to 31. (The last day of month differs by months.) + + Returns: + A new expression. + + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "datetime": [ + ... datetime(1978, 6, 1), + ... datetime(2024, 12, 13), + ... datetime(2065, 1, 1), + ... ] + ... 
} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_dt_day(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... nw.col("datetime").dt.day().alias("day"), + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_day`: + + >>> agnostic_dt_day(df_pd) + datetime day + 0 1978-06-01 1 + 1 2024-12-13 13 + 2 2065-01-01 1 + + >>> agnostic_dt_day(df_pl) + shape: (3, 2) + ┌─────────────────────┬─────┐ + │ datetime ┆ day │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i8 │ + ╞═════════════════════╪═════╡ + │ 1978-06-01 00:00:00 ┆ 1 │ + │ 2024-12-13 00:00:00 ┆ 13 │ + │ 2065-01-01 00:00:00 ┆ 1 │ + └─────────────────────┴─────┘ + + >>> agnostic_dt_day(df_pa) + pyarrow.Table + datetime: timestamp[us] + day: int64 + ---- + datetime: [[1978-06-01 00:00:00.000000,2024-12-13 00:00:00.000000,2065-01-01 00:00:00.000000]] + day: [[1,13,1]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.day() + ) + + def hour(self: Self) -> ExprT: + """Extract hour from underlying DateTime representation. + + Returns the hour number from 0 to 23. + + Returns: + A new expression. + + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "datetime": [ + ... datetime(1978, 1, 1, 1), + ... datetime(2024, 10, 13, 5), + ... datetime(2065, 1, 1, 10), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_dt_hour(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... nw.col("datetime").dt.hour().alias("hour") + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_hour`: + + >>> agnostic_dt_hour(df_pd) + datetime hour + 0 1978-01-01 01:00:00 1 + 1 2024-10-13 05:00:00 5 + 2 2065-01-01 10:00:00 10 + + >>> agnostic_dt_hour(df_pl) + shape: (3, 2) + ┌─────────────────────┬──────┐ + │ datetime ┆ hour │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i8 │ + ╞═════════════════════╪══════╡ + │ 1978-01-01 01:00:00 ┆ 1 │ + │ 2024-10-13 05:00:00 ┆ 5 │ + │ 2065-01-01 10:00:00 ┆ 10 │ + └─────────────────────┴──────┘ + + >>> agnostic_dt_hour(df_pa) + pyarrow.Table + datetime: timestamp[us] + hour: int64 + ---- + datetime: [[1978-01-01 01:00:00.000000,2024-10-13 05:00:00.000000,2065-01-01 10:00:00.000000]] + hour: [[1,5,10]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.hour() + ) + + def minute(self: Self) -> ExprT: + """Extract minutes from underlying DateTime representation. + + Returns the minute number from 0 to 59. + + Returns: + A new expression. + + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "datetime": [ + ... datetime(1978, 1, 1, 1, 1), + ... datetime(2024, 10, 13, 5, 30), + ... datetime(2065, 1, 1, 10, 20), + ... ] + ... 
} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_dt_minute(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... nw.col("datetime").dt.minute().alias("minute"), + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_minute`: + + >>> agnostic_dt_minute(df_pd) + datetime minute + 0 1978-01-01 01:01:00 1 + 1 2024-10-13 05:30:00 30 + 2 2065-01-01 10:20:00 20 + + >>> agnostic_dt_minute(df_pl) + shape: (3, 2) + ┌─────────────────────┬────────┐ + │ datetime ┆ minute │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i8 │ + ╞═════════════════════╪════════╡ + │ 1978-01-01 01:01:00 ┆ 1 │ + │ 2024-10-13 05:30:00 ┆ 30 │ + │ 2065-01-01 10:20:00 ┆ 20 │ + └─────────────────────┴────────┘ + + >>> agnostic_dt_minute(df_pa) + pyarrow.Table + datetime: timestamp[us] + minute: int64 + ---- + datetime: [[1978-01-01 01:01:00.000000,2024-10-13 05:30:00.000000,2065-01-01 10:20:00.000000]] + minute: [[1,30,20]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.minute() + ) + + def second(self: Self) -> ExprT: + """Extract seconds from underlying DateTime representation. + + Returns: + A new expression. + + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "datetime": [ + ... datetime(1978, 1, 1, 1, 1, 1), + ... datetime(2024, 10, 13, 5, 30, 14), + ... datetime(2065, 1, 1, 10, 20, 30), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_dt_second(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... nw.col("datetime").dt.second().alias("second"), + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_second`: + + >>> agnostic_dt_second(df_pd) + datetime second + 0 1978-01-01 01:01:01 1 + 1 2024-10-13 05:30:14 14 + 2 2065-01-01 10:20:30 30 + + >>> agnostic_dt_second(df_pl) + shape: (3, 2) + ┌─────────────────────┬────────┐ + │ datetime ┆ second │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i8 │ + ╞═════════════════════╪════════╡ + │ 1978-01-01 01:01:01 ┆ 1 │ + │ 2024-10-13 05:30:14 ┆ 14 │ + │ 2065-01-01 10:20:30 ┆ 30 │ + └─────────────────────┴────────┘ + + >>> agnostic_dt_second(df_pa) + pyarrow.Table + datetime: timestamp[us] + second: int64 + ---- + datetime: [[1978-01-01 01:01:01.000000,2024-10-13 05:30:14.000000,2065-01-01 10:20:30.000000]] + second: [[1,14,30]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.second() + ) + + def millisecond(self: Self) -> ExprT: + """Extract milliseconds from underlying DateTime representation. + + Returns: + A new expression. + + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "datetime": [ + ... datetime(1978, 1, 1, 1, 1, 1, 0), + ... datetime(2024, 10, 13, 5, 30, 14, 505000), + ... datetime(2065, 1, 1, 10, 20, 30, 67000), + ... ] + ... 
} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_dt_millisecond(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... nw.col("datetime").dt.millisecond().alias("millisecond"), + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_millisecond`: + + >>> agnostic_dt_millisecond(df_pd) + datetime millisecond + 0 1978-01-01 01:01:01.000 0 + 1 2024-10-13 05:30:14.505 505 + 2 2065-01-01 10:20:30.067 67 + + >>> agnostic_dt_millisecond(df_pl) + shape: (3, 2) + ┌─────────────────────────┬─────────────┐ + │ datetime ┆ millisecond │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i32 │ + ╞═════════════════════════╪═════════════╡ + │ 1978-01-01 01:01:01 ┆ 0 │ + │ 2024-10-13 05:30:14.505 ┆ 505 │ + │ 2065-01-01 10:20:30.067 ┆ 67 │ + └─────────────────────────┴─────────────┘ + + >>> agnostic_dt_millisecond(df_pa) + pyarrow.Table + datetime: timestamp[us] + millisecond: int64 + ---- + datetime: [[1978-01-01 01:01:01.000000,2024-10-13 05:30:14.505000,2065-01-01 10:20:30.067000]] + millisecond: [[0,505,67]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.millisecond() + ) + + def microsecond(self: Self) -> ExprT: + """Extract microseconds from underlying DateTime representation. + + Returns: + A new expression. + + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "datetime": [ + ... datetime(1978, 1, 1, 1, 1, 1, 0), + ... datetime(2024, 10, 13, 5, 30, 14, 505000), + ... datetime(2065, 1, 1, 10, 20, 30, 67000), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_dt_microsecond(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... nw.col("datetime").dt.microsecond().alias("microsecond"), + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_microsecond`: + + >>> agnostic_dt_microsecond(df_pd) + datetime microsecond + 0 1978-01-01 01:01:01.000 0 + 1 2024-10-13 05:30:14.505 505000 + 2 2065-01-01 10:20:30.067 67000 + + >>> agnostic_dt_microsecond(df_pl) + shape: (3, 2) + ┌─────────────────────────┬─────────────┐ + │ datetime ┆ microsecond │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i32 │ + ╞═════════════════════════╪═════════════╡ + │ 1978-01-01 01:01:01 ┆ 0 │ + │ 2024-10-13 05:30:14.505 ┆ 505000 │ + │ 2065-01-01 10:20:30.067 ┆ 67000 │ + └─────────────────────────┴─────────────┘ + + >>> agnostic_dt_microsecond(df_pa) + pyarrow.Table + datetime: timestamp[us] + microsecond: int64 + ---- + datetime: [[1978-01-01 01:01:01.000000,2024-10-13 05:30:14.505000,2065-01-01 10:20:30.067000]] + microsecond: [[0,505000,67000]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.microsecond() + ) + + def nanosecond(self: Self) -> ExprT: + """Extract Nanoseconds from underlying DateTime representation. + + Returns: + A new expression. 
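+
+        Notes:
+            The result is the sub-second part of the timestamp, expressed in
+            nanoseconds: in the example below, 500000 microseconds are reported
+            as 500000000 nanoseconds.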
+ + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "datetime": [ + ... datetime(1978, 1, 1, 1, 1, 1, 0), + ... datetime(2024, 10, 13, 5, 30, 14, 500000), + ... datetime(2065, 1, 1, 10, 20, 30, 60000), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_dt_nanosecond(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... nw.col("datetime").dt.nanosecond().alias("nanosecond"), + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_nanosecond`: + + >>> agnostic_dt_nanosecond(df_pd) + datetime nanosecond + 0 1978-01-01 01:01:01.000 0 + 1 2024-10-13 05:30:14.500 500000000 + 2 2065-01-01 10:20:30.060 60000000 + + >>> agnostic_dt_nanosecond(df_pl) + shape: (3, 2) + ┌─────────────────────────┬────────────┐ + │ datetime ┆ nanosecond │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i32 │ + ╞═════════════════════════╪════════════╡ + │ 1978-01-01 01:01:01 ┆ 0 │ + │ 2024-10-13 05:30:14.500 ┆ 500000000 │ + │ 2065-01-01 10:20:30.060 ┆ 60000000 │ + └─────────────────────────┴────────────┘ + + >>> agnostic_dt_nanosecond(df_pa) + pyarrow.Table + datetime: timestamp[us] + nanosecond: int64 + ---- + datetime: [[1978-01-01 01:01:01.000000,2024-10-13 05:30:14.500000,2065-01-01 10:20:30.060000]] + nanosecond: [[0,500000000,60000000]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.nanosecond() + ) + + def ordinal_day(self: Self) -> ExprT: + """Get ordinal day. + + Returns: + A new expression. + + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [datetime(2020, 1, 1), datetime(2020, 8, 3)]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_dt_ordinal_day(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... a_ordinal_day=nw.col("a").dt.ordinal_day() + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_ordinal_day`: + + >>> agnostic_dt_ordinal_day(df_pd) + a a_ordinal_day + 0 2020-01-01 1 + 1 2020-08-03 216 + + >>> agnostic_dt_ordinal_day(df_pl) + shape: (2, 2) + ┌─────────────────────┬───────────────┐ + │ a ┆ a_ordinal_day │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i16 │ + ╞═════════════════════╪═══════════════╡ + │ 2020-01-01 00:00:00 ┆ 1 │ + │ 2020-08-03 00:00:00 ┆ 216 │ + └─────────────────────┴───────────────┘ + + >>> agnostic_dt_ordinal_day(df_pa) + pyarrow.Table + a: timestamp[us] + a_ordinal_day: int64 + ---- + a: [[2020-01-01 00:00:00.000000,2020-08-03 00:00:00.000000]] + a_ordinal_day: [[1,216]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.ordinal_day() + ) + + def weekday(self: Self) -> ExprT: + """Extract the week day from the underlying Date representation. 
+
+        Returns the ISO weekday number, where Monday = 1 and Sunday = 7.
+
+        Returns:
+            A new expression.
+
+        Examples:
+            >>> from datetime import datetime
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoFrameT
+            >>>
+            >>> data = {"a": [datetime(2020, 1, 1), datetime(2020, 8, 3)]}
+            >>> df_pd = pd.DataFrame(data)
+            >>> df_pl = pl.DataFrame(data)
+            >>> df_pa = pa.table(data)
+
+            We define a dataframe-agnostic function:
+
+            >>> def agnostic_dt_weekday(df_native: IntoFrameT) -> IntoFrameT:
+            ...     df = nw.from_native(df_native)
+            ...     return df.with_columns(a_weekday=nw.col("a").dt.weekday()).to_native()
+
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_dt_weekday`:
+
+            >>> agnostic_dt_weekday(df_pd)
+                       a  a_weekday
+            0 2020-01-01          3
+            1 2020-08-03          1
+
+            >>> agnostic_dt_weekday(df_pl)
+            shape: (2, 2)
+            ┌─────────────────────┬───────────┐
+            │ a                   ┆ a_weekday │
+            │ ---                 ┆ ---       │
+            │ datetime[μs]        ┆ i8        │
+            ╞═════════════════════╪═══════════╡
+            │ 2020-01-01 00:00:00 ┆ 3         │
+            │ 2020-08-03 00:00:00 ┆ 1         │
+            └─────────────────────┴───────────┘
+
+            >>> agnostic_dt_weekday(df_pa)
+            pyarrow.Table
+            a: timestamp[us]
+            a_weekday: int64
+            ----
+            a: [[2020-01-01 00:00:00.000000,2020-08-03 00:00:00.000000]]
+            a_weekday: [[3,1]]
+        """
+        return self._expr.__class__(
+            lambda plx: self._expr._to_compliant_expr(plx).dt.weekday()
+        )
+
+    def total_minutes(self: Self) -> ExprT:
+        """Get total minutes.
+
+        Returns:
+            A new expression.
+
+        Notes:
+            The function outputs the total minutes in the int dtype by default,
+            however, pandas may change the dtype to float when there are missing values,
+            consider using `fill_null()` and `cast` in this case.
+
+        Examples:
+            >>> from datetime import timedelta
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoFrameT
+            >>>
+            >>> data = {"a": [timedelta(minutes=10), timedelta(minutes=20, seconds=40)]}
+            >>> df_pd = pd.DataFrame(data)
+            >>> df_pl = pl.DataFrame(data)
+            >>> df_pa = pa.table(data)
+
+            We define a dataframe-agnostic function:
+
+            >>> def agnostic_dt_total_minutes(df_native: IntoFrameT) -> IntoFrameT:
+            ...     df = nw.from_native(df_native)
+            ...     return df.with_columns(
+            ...         a_total_minutes=nw.col("a").dt.total_minutes()
+            ...     ).to_native()
+
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_dt_total_minutes`:
+
+            >>> agnostic_dt_total_minutes(df_pd)
+                            a  a_total_minutes
+            0 0 days 00:10:00               10
+            1 0 days 00:20:40               20
+
+            >>> agnostic_dt_total_minutes(df_pl)
+            shape: (2, 2)
+            ┌──────────────┬─────────────────┐
+            │ a            ┆ a_total_minutes │
+            │ ---          ┆ ---             │
+            │ duration[μs] ┆ i64             │
+            ╞══════════════╪═════════════════╡
+            │ 10m          ┆ 10              │
+            │ 20m 40s      ┆ 20              │
+            └──────────────┴─────────────────┘
+
+            >>> agnostic_dt_total_minutes(df_pa)
+            pyarrow.Table
+            a: duration[us]
+            a_total_minutes: int64
+            ----
+            a: [[600000000,1240000000]]
+            a_total_minutes: [[10,20]]
+        """
+        return self._expr.__class__(
+            lambda plx: self._expr._to_compliant_expr(plx).dt.total_minutes()
+        )
+
+    def total_seconds(self: Self) -> ExprT:
+        """Get total seconds.
+
+        Returns:
+            A new expression.
+
+        Notes:
+            The function outputs the total seconds in the int dtype by default,
+            however, pandas may change the dtype to float when there are missing values,
+            consider using `fill_null()` and `cast` in this case.
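+            For example, a sketch of that workaround on the column "a" used
+            below: `nw.col("a").dt.total_seconds().fill_null(0).cast(nw.Int64)`.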
+ + Examples: + >>> from datetime import timedelta + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [timedelta(seconds=10), timedelta(seconds=20, milliseconds=40)]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_dt_total_seconds(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... a_total_seconds=nw.col("a").dt.total_seconds() + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_total_seconds`: + + >>> agnostic_dt_total_seconds(df_pd) + a a_total_seconds + 0 0 days 00:00:10 10 + 1 0 days 00:00:20.040000 20 + + >>> agnostic_dt_total_seconds(df_pl) + shape: (2, 2) + ┌──────────────┬─────────────────┐ + │ a ┆ a_total_seconds │ + │ --- ┆ --- │ + │ duration[μs] ┆ i64 │ + ╞══════════════╪═════════════════╡ + │ 10s ┆ 10 │ + │ 20s 40ms ┆ 20 │ + └──────────────┴─────────────────┘ + + >>> agnostic_dt_total_seconds(df_pa) + pyarrow.Table + a: duration[us] + a_total_seconds: int64 + ---- + a: [[10000000,20040000]] + a_total_seconds: [[10,20]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.total_seconds() + ) + + def total_milliseconds(self: Self) -> ExprT: + """Get total milliseconds. + + Returns: + A new expression. + + Notes: + The function outputs the total milliseconds in the int dtype by default, + however, pandas may change the dtype to float when there are missing values, + consider using `fill_null()` and `cast` in this case. + + Examples: + >>> from datetime import timedelta + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "a": [ + ... timedelta(milliseconds=10), + ... timedelta(milliseconds=20, microseconds=40), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_dt_total_milliseconds(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... a_total_milliseconds=nw.col("a").dt.total_milliseconds() + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_total_milliseconds`: + + >>> agnostic_dt_total_milliseconds(df_pd) + a a_total_milliseconds + 0 0 days 00:00:00.010000 10 + 1 0 days 00:00:00.020040 20 + + >>> agnostic_dt_total_milliseconds(df_pl) + shape: (2, 2) + ┌──────────────┬──────────────────────┐ + │ a ┆ a_total_milliseconds │ + │ --- ┆ --- │ + │ duration[μs] ┆ i64 │ + ╞══════════════╪══════════════════════╡ + │ 10ms ┆ 10 │ + │ 20040µs ┆ 20 │ + └──────────────┴──────────────────────┘ + + >>> agnostic_dt_total_milliseconds(df_pa) + pyarrow.Table + a: duration[us] + a_total_milliseconds: int64 + ---- + a: [[10000,20040]] + a_total_milliseconds: [[10,20]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.total_milliseconds() + ) + + def total_microseconds(self: Self) -> ExprT: + """Get total microseconds. + + Returns: + A new expression. 
+ + Notes: + The function outputs the total microseconds in the int dtype by default, + however, pandas may change the dtype to float when there are missing values, + consider using `fill_null()` and `cast` in this case. + + Examples: + >>> from datetime import timedelta + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "a": [ + ... timedelta(microseconds=10), + ... timedelta(milliseconds=1, microseconds=200), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_dt_total_microseconds(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... a_total_microseconds=nw.col("a").dt.total_microseconds() + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_total_microseconds`: + + >>> agnostic_dt_total_microseconds(df_pd) + a a_total_microseconds + 0 0 days 00:00:00.000010 10 + 1 0 days 00:00:00.001200 1200 + + >>> agnostic_dt_total_microseconds(df_pl) + shape: (2, 2) + ┌──────────────┬──────────────────────┐ + │ a ┆ a_total_microseconds │ + │ --- ┆ --- │ + │ duration[μs] ┆ i64 │ + ╞══════════════╪══════════════════════╡ + │ 10µs ┆ 10 │ + │ 1200µs ┆ 1200 │ + └──────────────┴──────────────────────┘ + + >>> agnostic_dt_total_microseconds(df_pa) + pyarrow.Table + a: duration[us] + a_total_microseconds: int64 + ---- + a: [[10,1200]] + a_total_microseconds: [[10,1200]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.total_microseconds() + ) + + def total_nanoseconds(self: Self) -> ExprT: + """Get total nanoseconds. + + Returns: + A new expression. + + Notes: + The function outputs the total nanoseconds in the int dtype by default, + however, pandas may change the dtype to float when there are missing values, + consider using `fill_null()` and `cast` in this case. + + Examples: + >>> from datetime import timedelta + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = ["2024-01-01 00:00:00.000000001", "2024-01-01 00:00:00.000000002"] + >>> df_pd = pd.DataFrame({"a": pd.to_datetime(data)}) + >>> df_pl = pl.DataFrame({"a": data}).with_columns( + ... pl.col("a").str.to_datetime(time_unit="ns") + ... ) + + We define a dataframe-agnostic function: + + >>> def agnostic_dt_total_nanoseconds(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... a_diff_total_nanoseconds=nw.col("a").diff().dt.total_nanoseconds() + ... 
).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_total_nanoseconds`: + + >>> agnostic_dt_total_nanoseconds(df_pd) + a a_diff_total_nanoseconds + 0 2024-01-01 00:00:00.000000001 NaN + 1 2024-01-01 00:00:00.000000002 1.0 + + >>> agnostic_dt_total_nanoseconds(df_pl) + shape: (2, 2) + ┌───────────────────────────────┬──────────────────────────┐ + │ a ┆ a_diff_total_nanoseconds │ + │ --- ┆ --- │ + │ datetime[ns] ┆ i64 │ + ╞═══════════════════════════════╪══════════════════════════╡ + │ 2024-01-01 00:00:00.000000001 ┆ null │ + │ 2024-01-01 00:00:00.000000002 ┆ 1 │ + └───────────────────────────────┴──────────────────────────┘ + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.total_nanoseconds() + ) + + def to_string(self: Self, format: str) -> ExprT: # noqa: A002 + """Convert a Date/Time/Datetime column into a String column with the given format. + + Arguments: + format: Format to format temporal column with. + + Returns: + A new expression. + + Notes: + Unfortunately, different libraries interpret format directives a bit + differently. + + - Chrono, the library used by Polars, uses `"%.f"` for fractional seconds, + whereas pandas and Python stdlib use `".%f"`. + - PyArrow interprets `"%S"` as "seconds, including fractional seconds" + whereas most other tools interpret it as "just seconds, as 2 digits". + + Therefore, we make the following adjustments: + + - for pandas-like libraries, we replace `"%S.%f"` with `"%S%.f"`. + - for PyArrow, we replace `"%S.%f"` with `"%S"`. + + Workarounds like these don't make us happy, and we try to avoid them as + much as possible, but here we feel like it's the best compromise. + + If you just want to format a date/datetime Series as a local datetime + string, and have it work as consistently as possible across libraries, + we suggest using: + + - `"%Y-%m-%dT%H:%M:%S%.f"` for datetimes + - `"%Y-%m-%d"` for dates + + though note that, even then, different tools may return a different number + of trailing zeros. Nonetheless, this is probably consistent enough for + most applications. + + If you have an application where this is not enough, please open an issue + and let us know. + + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "a": [ + ... datetime(2020, 3, 1), + ... datetime(2020, 4, 1), + ... datetime(2020, 5, 1), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_dt_to_string(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select( + ... nw.col("a").dt.to_string("%Y/%m/%d %H:%M:%S") + ... 
).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_to_string`: + + >>> agnostic_dt_to_string(df_pd) + a + 0 2020/03/01 00:00:00 + 1 2020/04/01 00:00:00 + 2 2020/05/01 00:00:00 + + >>> agnostic_dt_to_string(df_pl) + shape: (3, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ str │ + ╞═════════════════════╡ + │ 2020/03/01 00:00:00 │ + │ 2020/04/01 00:00:00 │ + │ 2020/05/01 00:00:00 │ + └─────────────────────┘ + + >>> agnostic_dt_to_string(df_pa) + pyarrow.Table + a: string + ---- + a: [["2020/03/01 00:00:00.000000","2020/04/01 00:00:00.000000","2020/05/01 00:00:00.000000"]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.to_string(format) + ) + + def replace_time_zone(self: Self, time_zone: str | None) -> ExprT: + """Replace time zone. + + Arguments: + time_zone: Target time zone. + + Returns: + A new expression. + + Examples: + >>> from datetime import datetime, timezone + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "a": [ + ... datetime(2024, 1, 1, tzinfo=timezone.utc), + ... datetime(2024, 1, 2, tzinfo=timezone.utc), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + Let's define a dataframe-agnostic function: + + >>> def agnostic_dt_replace_time_zone(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select( + ... nw.col("a").dt.replace_time_zone("Asia/Kathmandu") + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_replace_time_zone`: + + >>> agnostic_dt_replace_time_zone(df_pd) + a + 0 2024-01-01 00:00:00+05:45 + 1 2024-01-02 00:00:00+05:45 + + >>> agnostic_dt_replace_time_zone(df_pl) + shape: (2, 1) + ┌──────────────────────────────┐ + │ a │ + │ --- │ + │ datetime[μs, Asia/Kathmandu] │ + ╞══════════════════════════════╡ + │ 2024-01-01 00:00:00 +0545 │ + │ 2024-01-02 00:00:00 +0545 │ + └──────────────────────────────┘ + + >>> agnostic_dt_replace_time_zone(df_pa) + pyarrow.Table + a: timestamp[us, tz=Asia/Kathmandu] + ---- + a: [[2023-12-31 18:15:00.000000Z,2024-01-01 18:15:00.000000Z]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.replace_time_zone(time_zone) + ) + + def convert_time_zone(self: Self, time_zone: str) -> ExprT: + """Convert to a new time zone. + + If converting from a time-zone-naive column, then conversion happens + as if converting from UTC. + + Arguments: + time_zone: Target time zone. + + Returns: + A new expression. + + Examples: + >>> from datetime import datetime, timezone + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = { + ... "a": [ + ... datetime(2024, 1, 1, tzinfo=timezone.utc), + ... datetime(2024, 1, 2, tzinfo=timezone.utc), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + Let's define a dataframe-agnostic function: + + >>> def agnostic_dt_convert_time_zone(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select( + ... nw.col("a").dt.convert_time_zone("Asia/Kathmandu") + ... 
).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_convert_time_zone`: + + >>> agnostic_dt_convert_time_zone(df_pd) + a + 0 2024-01-01 05:45:00+05:45 + 1 2024-01-02 05:45:00+05:45 + + >>> agnostic_dt_convert_time_zone(df_pl) + shape: (2, 1) + ┌──────────────────────────────┐ + │ a │ + │ --- │ + │ datetime[μs, Asia/Kathmandu] │ + ╞══════════════════════════════╡ + │ 2024-01-01 05:45:00 +0545 │ + │ 2024-01-02 05:45:00 +0545 │ + └──────────────────────────────┘ + + >>> agnostic_dt_convert_time_zone(df_pa) + pyarrow.Table + a: timestamp[us, tz=Asia/Kathmandu] + ---- + a: [[2024-01-01 00:00:00.000000Z,2024-01-02 00:00:00.000000Z]] + """ + if time_zone is None: + msg = "Target `time_zone` cannot be `None` in `convert_time_zone`. Please use `replace_time_zone(None)` if you want to remove the time zone." + raise TypeError(msg) + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.convert_time_zone(time_zone) + ) + + def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ExprT: + """Return a timestamp in the given time unit. + + Arguments: + time_unit: {'ns', 'us', 'ms'} + Time unit. + + Returns: + A new expression. + + Examples: + >>> from datetime import date + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"date": [date(2001, 1, 1), None, date(2001, 1, 3)]} + >>> df_pd = pd.DataFrame(data, dtype="datetime64[ns]") + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + Let's define a dataframe-agnostic function: + + >>> def agnostic_dt_timestamp(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... nw.col("date").dt.timestamp().alias("timestamp_us"), + ... nw.col("date").dt.timestamp("ms").alias("timestamp_ms"), + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_timestamp`: + + >>> agnostic_dt_timestamp(df_pd) + date timestamp_us timestamp_ms + 0 2001-01-01 9.783072e+14 9.783072e+11 + 1 NaT NaN NaN + 2 2001-01-03 9.784800e+14 9.784800e+11 + + >>> agnostic_dt_timestamp(df_pl) + shape: (3, 3) + ┌────────────┬─────────────────┬──────────────┐ + │ date ┆ timestamp_us ┆ timestamp_ms │ + │ --- ┆ --- ┆ --- │ + │ date ┆ i64 ┆ i64 │ + ╞════════════╪═════════════════╪══════════════╡ + │ 2001-01-01 ┆ 978307200000000 ┆ 978307200000 │ + │ null ┆ null ┆ null │ + │ 2001-01-03 ┆ 978480000000000 ┆ 978480000000 │ + └────────────┴─────────────────┴──────────────┘ + + >>> agnostic_dt_timestamp(df_pa) + pyarrow.Table + date: date32[day] + timestamp_us: int64 + timestamp_ms: int64 + ---- + date: [[2001-01-01,null,2001-01-03]] + timestamp_us: [[978307200000000,null,978480000000000]] + timestamp_ms: [[978307200000,null,978480000000]] + """ + if time_unit not in {"ns", "us", "ms"}: + msg = ( + "invalid `time_unit`" + f"\n\nExpected one of {{'ns', 'us', 'ms'}}, got {time_unit!r}." 
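+                # keep this message in sync with the `Literal["ns", "us", "ms"]`
+                # annotation on `time_unit`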
+ ) + raise ValueError(msg) + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).dt.timestamp(time_unit) + ) diff --git a/narwhals/expr_list.py b/narwhals/expr_list.py new file mode 100644 index 000000000..17efeaf29 --- /dev/null +++ b/narwhals/expr_list.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Generic +from typing import TypeVar + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals.expr import Expr + +ExprT = TypeVar("ExprT", bound="Expr") + + +class ExprListNamespace(Generic[ExprT]): + def __init__(self: Self, expr: ExprT) -> None: + self._expr = expr + + def len(self: Self) -> ExprT: + """Return the number of elements in each list. + + Null values count towards the total. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [[1, 2], [3, 4, None], None, []]} + + Let's define a dataframe-agnostic function: + + >>> def agnostic_list_len(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns(a_len=nw.col("a").list.len()).to_native() + + We can then pass pandas / PyArrow / Polars / any other supported library: + + >>> agnostic_list_len( + ... pd.DataFrame(data).astype({"a": pd.ArrowDtype(pa.list_(pa.int64()))}) + ... ) # doctest: +SKIP + a a_len + 0 [1. 2.] 2 + 1 [ 3. 4. nan] 3 + 2 + 3 [] 0 + + >>> agnostic_list_len(pl.DataFrame(data)) + shape: (4, 2) + ┌──────────────┬───────┐ + │ a ┆ a_len │ + │ --- ┆ --- │ + │ list[i64] ┆ u32 │ + ╞══════════════╪═══════╡ + │ [1, 2] ┆ 2 │ + │ [3, 4, null] ┆ 3 │ + │ null ┆ null │ + │ [] ┆ 0 │ + └──────────────┴───────┘ + + >>> agnostic_list_len(pa.table(data)) + pyarrow.Table + a: list + child 0, item: int64 + a_len: uint32 + ---- + a: [[[1,2],[3,4,null],null,[]]] + a_len: [[2,3,null,0]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).list.len() + ) diff --git a/narwhals/expr_name.py b/narwhals/expr_name.py new file mode 100644 index 000000000..312a2bc9c --- /dev/null +++ b/narwhals/expr_name.py @@ -0,0 +1,298 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Callable +from typing import Generic +from typing import TypeVar + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals.expr import Expr + +ExprT = TypeVar("ExprT", bound="Expr") + + +class ExprNameNamespace(Generic[ExprT]): + def __init__(self: Self, expr: ExprT) -> None: + self._expr = expr + + def keep(self: Self) -> ExprT: + r"""Keep the original root name of the expression. + + Returns: + A new expression. + + Notes: + This will undo any previous renaming operations on the expression. + Due to implementation constraints, this method can only be called as the last + expression in a chain. Only one name operation per expression will work. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> + >>> data = {"foo": [1, 2], "BAR": [4, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_name_keep(df_native: IntoFrame) -> list[str]: + ... df = nw.from_native(df_native) + ... 
return df.select(nw.col("foo").alias("alias_for_foo").name.keep()).columns + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_keep`: + + >>> agnostic_name_keep(df_pd) + ['foo'] + + >>> agnostic_name_keep(df_pl) + ['foo'] + + >>> agnostic_name_keep(df_pa) + ['foo'] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).name.keep() + ) + + def map(self: Self, function: Callable[[str], str]) -> ExprT: + r"""Rename the output of an expression by mapping a function over the root name. + + Arguments: + function: Function that maps a root name to a new name. + + Returns: + A new expression. + + Notes: + This will undo any previous renaming operations on the expression. + Due to implementation constraints, this method can only be called as the last + expression in a chain. Only one name operation per expression will work. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> + >>> data = {"foo": [1, 2], "BAR": [4, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> renaming_func = lambda s: s[::-1] # reverse column name + >>> def agnostic_name_map(df_native: IntoFrame) -> list[str]: + ... df = nw.from_native(df_native) + ... return df.select(nw.col("foo", "BAR").name.map(renaming_func)).columns + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_map`: + + >>> agnostic_name_map(df_pd) + ['oof', 'RAB'] + + >>> agnostic_name_map(df_pl) + ['oof', 'RAB'] + + >>> agnostic_name_map(df_pa) + ['oof', 'RAB'] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).name.map(function) + ) + + def prefix(self: Self, prefix: str) -> ExprT: + r"""Add a prefix to the root column name of the expression. + + Arguments: + prefix: Prefix to add to the root column name. + + Returns: + A new expression. + + Notes: + This will undo any previous renaming operations on the expression. + Due to implementation constraints, this method can only be called as the last + expression in a chain. Only one name operation per expression will work. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> + >>> data = {"foo": [1, 2], "BAR": [4, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_name_prefix(df_native: IntoFrame, prefix: str) -> list[str]: + ... df = nw.from_native(df_native) + ... return df.select(nw.col("foo", "BAR").name.prefix(prefix)).columns + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_prefix`: + + >>> agnostic_name_prefix(df_pd, "with_prefix_") + ['with_prefix_foo', 'with_prefix_BAR'] + + >>> agnostic_name_prefix(df_pl, "with_prefix_") + ['with_prefix_foo', 'with_prefix_BAR'] + + >>> agnostic_name_prefix(df_pa, "with_prefix_") + ['with_prefix_foo', 'with_prefix_BAR'] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).name.prefix(prefix) + ) + + def suffix(self: Self, suffix: str) -> ExprT: + r"""Add a suffix to the root column name of the expression. + + Arguments: + suffix: Suffix to add to the root column name. 
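Since the notes in this namespace say only one `name` operation per expression will work, two renaming steps (say, adding a prefix and lowercasing) can be folded into a single `name.map` call instead of being chained; a minimal sketch, using pandas purely for illustration:

    import pandas as pd
    import narwhals as nw

    df = nw.from_native(pd.DataFrame({"foo": [1, 2], "BAR": [4, 5]}))
    # a single name.map applies both transformations at once, avoiding
    # a second chained name operation (which the notes above rule out)
    out = df.select(nw.col("foo", "BAR").name.map(lambda s: "with_prefix_" + s.lower()))
    print(out.columns)  # ['with_prefix_foo', 'with_prefix_bar']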
+ + Returns: + A new expression. + + Notes: + This will undo any previous renaming operations on the expression. + Due to implementation constraints, this method can only be called as the last + expression in a chain. Only one name operation per expression will work. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> + >>> data = {"foo": [1, 2], "BAR": [4, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_name_suffix(df_native: IntoFrame, suffix: str) -> list[str]: + ... df = nw.from_native(df_native) + ... return df.select(nw.col("foo", "BAR").name.suffix(suffix)).columns + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_suffix`: + + >>> agnostic_name_suffix(df_pd, "_with_suffix") + ['foo_with_suffix', 'BAR_with_suffix'] + + >>> agnostic_name_suffix(df_pl, "_with_suffix") + ['foo_with_suffix', 'BAR_with_suffix'] + + >>> agnostic_name_suffix(df_pa, "_with_suffix") + ['foo_with_suffix', 'BAR_with_suffix'] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).name.suffix(suffix) + ) + + def to_lowercase(self: Self) -> ExprT: + r"""Make the root column name lowercase. + + Returns: + A new expression. + + Notes: + This will undo any previous renaming operations on the expression. + Due to implementation constraints, this method can only be called as the last + expression in a chain. Only one name operation per expression will work. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> + >>> data = {"foo": [1, 2], "BAR": [4, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_name_to_lowercase(df_native: IntoFrame) -> list[str]: + ... df = nw.from_native(df_native) + ... return df.select(nw.col("foo", "BAR").name.to_lowercase()).columns + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_to_lowercase`: + + >>> agnostic_name_to_lowercase(df_pd) + ['foo', 'bar'] + + >>> agnostic_name_to_lowercase(df_pl) + ['foo', 'bar'] + + >>> agnostic_name_to_lowercase(df_pa) + ['foo', 'bar'] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).name.to_lowercase() + ) + + def to_uppercase(self: Self) -> ExprT: + r"""Make the root column name uppercase. + + Returns: + A new expression. + + Notes: + This will undo any previous renaming operations on the expression. + Due to implementation constraints, this method can only be called as the last + expression in a chain. Only one name operation per expression will work. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> + >>> data = {"foo": [1, 2], "BAR": [4, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_name_to_uppercase(df_native: IntoFrame) -> list[str]: + ... df = nw.from_native(df_native) + ... 
return df.select(nw.col("foo", "BAR").name.to_uppercase()).columns + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_to_uppercase`: + + >>> agnostic_name_to_uppercase(df_pd) + ['FOO', 'BAR'] + + >>> agnostic_name_to_uppercase(df_pl) + ['FOO', 'BAR'] + + >>> agnostic_name_to_uppercase(df_pa) + ['FOO', 'BAR'] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).name.to_uppercase() + ) diff --git a/narwhals/expr_str.py b/narwhals/expr_str.py new file mode 100644 index 000000000..91d355c66 --- /dev/null +++ b/narwhals/expr_str.py @@ -0,0 +1,891 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Generic +from typing import TypeVar + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals.expr import Expr + +ExprT = TypeVar("ExprT", bound="Expr") + + +class ExprStringNamespace(Generic[ExprT]): + def __init__(self: Self, expr: ExprT) -> None: + self._expr = expr + + def len_chars(self: Self) -> ExprT: + r"""Return the length of each string as the number of characters. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"words": ["foo", "Café", "345", "東京", None]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_str_len_chars(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... words_len=nw.col("words").str.len_chars() + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_len_chars`: + + >>> agnostic_str_len_chars(df_pd) + words words_len + 0 foo 3.0 + 1 Café 4.0 + 2 345 3.0 + 3 東京 2.0 + 4 None NaN + + >>> agnostic_str_len_chars(df_pl) + shape: (5, 2) + ┌───────┬───────────┐ + │ words ┆ words_len │ + │ --- ┆ --- │ + │ str ┆ u32 │ + ╞═══════╪═══════════╡ + │ foo ┆ 3 │ + │ Café ┆ 4 │ + │ 345 ┆ 3 │ + │ 東京 ┆ 2 │ + │ null ┆ null │ + └───────┴───────────┘ + + >>> agnostic_str_len_chars(df_pa) + pyarrow.Table + words: string + words_len: int32 + ---- + words: [["foo","Café","345","東京",null]] + words_len: [[3,4,3,2,null]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).str.len_chars() + ) + + def replace( + self, pattern: str, value: str, *, literal: bool = False, n: int = 1 + ) -> ExprT: + r"""Replace first matching regex/literal substring with a new string value. + + Arguments: + pattern: A valid regular expression pattern. + value: String that will replace the matched substring. + literal: Treat `pattern` as a literal string. + n: Number of matches to replace. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"foo": ["123abc", "abc abc123"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_str_replace(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... df = df.with_columns(replaced=nw.col("foo").str.replace("abc", "")) + ... 
return df.to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_replace`: + + >>> agnostic_str_replace(df_pd) + foo replaced + 0 123abc 123 + 1 abc abc123 abc123 + + >>> agnostic_str_replace(df_pl) + shape: (2, 2) + ┌────────────┬──────────┐ + │ foo ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞════════════╪══════════╡ + │ 123abc ┆ 123 │ + │ abc abc123 ┆ abc123 │ + └────────────┴──────────┘ + + >>> agnostic_str_replace(df_pa) + pyarrow.Table + foo: string + replaced: string + ---- + foo: [["123abc","abc abc123"]] + replaced: [["123"," abc123"]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).str.replace( + pattern, value, literal=literal, n=n + ) + ) + + def replace_all( + self: Self, pattern: str, value: str, *, literal: bool = False + ) -> ExprT: + r"""Replace all matching regex/literal substring with a new string value. + + Arguments: + pattern: A valid regular expression pattern. + value: String that will replace the matched substring. + literal: Treat `pattern` as a literal string. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"foo": ["123abc", "abc abc123"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_str_replace_all(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... df = df.with_columns(replaced=nw.col("foo").str.replace_all("abc", "")) + ... return df.to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_replace_all`: + + >>> agnostic_str_replace_all(df_pd) + foo replaced + 0 123abc 123 + 1 abc abc123 123 + + >>> agnostic_str_replace_all(df_pl) + shape: (2, 2) + ┌────────────┬──────────┐ + │ foo ┆ replaced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞════════════╪══════════╡ + │ 123abc ┆ 123 │ + │ abc abc123 ┆ 123 │ + └────────────┴──────────┘ + + >>> agnostic_str_replace_all(df_pa) + pyarrow.Table + foo: string + replaced: string + ---- + foo: [["123abc","abc abc123"]] + replaced: [["123"," 123"]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).str.replace_all( + pattern, value, literal=literal + ) + ) + + def strip_chars(self: Self, characters: str | None = None) -> ExprT: + r"""Remove leading and trailing characters. + + Arguments: + characters: The set of characters to be removed. All combinations of this + set of characters will be stripped from the start and end of the string. + If set to None (default), all leading and trailing whitespace is removed + instead. + + Returns: + A new expression. + + Examples: + >>> from typing import Any + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> + >>> data = {"fruits": ["apple", "\nmango"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_str_strip_chars(df_native: IntoFrame) -> dict[str, Any]: + ... df = nw.from_native(df_native) + ... df = df.with_columns(stripped=nw.col("fruits").str.strip_chars()) + ... 
return df.to_dict(as_series=False) + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_strip_chars`: + + >>> agnostic_str_strip_chars(df_pd) + {'fruits': ['apple', '\nmango'], 'stripped': ['apple', 'mango']} + + >>> agnostic_str_strip_chars(df_pl) + {'fruits': ['apple', '\nmango'], 'stripped': ['apple', 'mango']} + + >>> agnostic_str_strip_chars(df_pa) + {'fruits': ['apple', '\nmango'], 'stripped': ['apple', 'mango']} + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).str.strip_chars(characters) + ) + + def starts_with(self: Self, prefix: str) -> ExprT: + r"""Check if string values start with a substring. + + Arguments: + prefix: prefix substring + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"fruits": ["apple", "mango", None]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_str_starts_with(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... has_prefix=nw.col("fruits").str.starts_with("app") + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_starts_with`: + + >>> agnostic_str_starts_with(df_pd) + fruits has_prefix + 0 apple True + 1 mango False + 2 None None + + >>> agnostic_str_starts_with(df_pl) + shape: (3, 2) + ┌────────┬────────────┐ + │ fruits ┆ has_prefix │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞════════╪════════════╡ + │ apple ┆ true │ + │ mango ┆ false │ + │ null ┆ null │ + └────────┴────────────┘ + + >>> agnostic_str_starts_with(df_pa) + pyarrow.Table + fruits: string + has_prefix: bool + ---- + fruits: [["apple","mango",null]] + has_prefix: [[true,false,null]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).str.starts_with(prefix) + ) + + def ends_with(self: Self, suffix: str) -> ExprT: + r"""Check if string values end with a substring. + + Arguments: + suffix: suffix substring + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"fruits": ["apple", "mango", None]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_str_ends_with(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... has_suffix=nw.col("fruits").str.ends_with("ngo") + ... 
).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_ends_with`: + + >>> agnostic_str_ends_with(df_pd) + fruits has_suffix + 0 apple False + 1 mango True + 2 None None + + >>> agnostic_str_ends_with(df_pl) + shape: (3, 2) + ┌────────┬────────────┐ + │ fruits ┆ has_suffix │ + │ --- ┆ --- │ + │ str ┆ bool │ + ╞════════╪════════════╡ + │ apple ┆ false │ + │ mango ┆ true │ + │ null ┆ null │ + └────────┴────────────┘ + + >>> agnostic_str_ends_with(df_pa) + pyarrow.Table + fruits: string + has_suffix: bool + ---- + fruits: [["apple","mango",null]] + has_suffix: [[false,true,null]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).str.ends_with(suffix) + ) + + def contains(self: Self, pattern: str, *, literal: bool = False) -> ExprT: + r"""Check if string contains a substring that matches a pattern. + + Arguments: + pattern: A Character sequence or valid regular expression pattern. + literal: If True, treats the pattern as a literal string. + If False, assumes the pattern is a regular expression. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"pets": ["cat", "dog", "rabbit and parrot", "dove", None]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_str_contains(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... default_match=nw.col("pets").str.contains("parrot|Dove"), + ... case_insensitive_match=nw.col("pets").str.contains("(?i)parrot|Dove"), + ... literal_match=nw.col("pets").str.contains( + ... "parrot|Dove", literal=True + ... ), + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_contains`: + + >>> agnostic_str_contains(df_pd) + pets default_match case_insensitive_match literal_match + 0 cat False False False + 1 dog False False False + 2 rabbit and parrot True True False + 3 dove False True False + 4 None None None None + + >>> agnostic_str_contains(df_pl) + shape: (5, 4) + ┌───────────────────┬───────────────┬────────────────────────┬───────────────┐ + │ pets ┆ default_match ┆ case_insensitive_match ┆ literal_match │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ bool ┆ bool ┆ bool │ + ╞═══════════════════╪═══════════════╪════════════════════════╪═══════════════╡ + │ cat ┆ false ┆ false ┆ false │ + │ dog ┆ false ┆ false ┆ false │ + │ rabbit and parrot ┆ true ┆ true ┆ false │ + │ dove ┆ false ┆ true ┆ false │ + │ null ┆ null ┆ null ┆ null │ + └───────────────────┴───────────────┴────────────────────────┴───────────────┘ + + >>> agnostic_str_contains(df_pa) + pyarrow.Table + pets: string + default_match: bool + case_insensitive_match: bool + literal_match: bool + ---- + pets: [["cat","dog","rabbit and parrot","dove",null]] + default_match: [[false,false,true,false,null]] + case_insensitive_match: [[false,false,true,true,null]] + literal_match: [[false,false,false,false,null]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).str.contains( + pattern, literal=literal + ) + ) + + def slice(self: Self, offset: int, length: int | None = None) -> ExprT: + r"""Create subslices of the string values of an expression. + + Arguments: + offset: Start index. 
Negative indexing is supported. + length: Length of the slice. If set to `None` (default), the slice is taken to the + end of the string. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"s": ["pear", None, "papaya", "dragonfruit"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_str_slice(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... s_sliced=nw.col("s").str.slice(4, length=3) + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_slice`: + + >>> agnostic_str_slice(df_pd) # doctest: +NORMALIZE_WHITESPACE + s s_sliced + 0 pear + 1 None None + 2 papaya ya + 3 dragonfruit onf + + >>> agnostic_str_slice(df_pl) + shape: (4, 2) + ┌─────────────┬──────────┐ + │ s ┆ s_sliced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞═════════════╪══════════╡ + │ pear ┆ │ + │ null ┆ null │ + │ papaya ┆ ya │ + │ dragonfruit ┆ onf │ + └─────────────┴──────────┘ + + >>> agnostic_str_slice(df_pa) + pyarrow.Table + s: string + s_sliced: string + ---- + s: [["pear",null,"papaya","dragonfruit"]] + s_sliced: [["",null,"ya","onf"]] + + Using negative indexes: + + >>> def agnostic_str_slice_negative(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns(s_sliced=nw.col("s").str.slice(-3)).to_native() + + >>> agnostic_str_slice_negative(df_pd) + s s_sliced + 0 pear ear + 1 None None + 2 papaya aya + 3 dragonfruit uit + + >>> agnostic_str_slice_negative(df_pl) + shape: (4, 2) + ┌─────────────┬──────────┐ + │ s ┆ s_sliced │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞═════════════╪══════════╡ + │ pear ┆ ear │ + │ null ┆ null │ + │ papaya ┆ aya │ + │ dragonfruit ┆ uit │ + └─────────────┴──────────┘ + + >>> agnostic_str_slice_negative(df_pa) + pyarrow.Table + s: string + s_sliced: string + ---- + s: [["pear",null,"papaya","dragonfruit"]] + s_sliced: [["ear",null,"aya","uit"]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).str.slice( + offset=offset, length=length + ) + ) + + def head(self: Self, n: int = 5) -> ExprT: + r"""Take the first n elements of each string. + + Arguments: + n: Number of elements to take. Negative indexing is **not** supported. + + Returns: + A new expression. + + Notes: + If the length of the string has fewer than `n` characters, the full string is returned. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"lyrics": ["Atatata", "taata", "taatatata", "zukkyun"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_str_head(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... lyrics_head=nw.col("lyrics").str.head() + ... 
).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_head`: + + >>> agnostic_str_head(df_pd) + lyrics lyrics_head + 0 Atatata Atata + 1 taata taata + 2 taatatata taata + 3 zukkyun zukky + + >>> agnostic_str_head(df_pl) + shape: (4, 2) + ┌───────────┬─────────────┐ + │ lyrics ┆ lyrics_head │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞═══════════╪═════════════╡ + │ Atatata ┆ Atata │ + │ taata ┆ taata │ + │ taatatata ┆ taata │ + │ zukkyun ┆ zukky │ + └───────────┴─────────────┘ + + >>> agnostic_str_head(df_pa) + pyarrow.Table + lyrics: string + lyrics_head: string + ---- + lyrics: [["Atatata","taata","taatatata","zukkyun"]] + lyrics_head: [["Atata","taata","taata","zukky"]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).str.slice(0, n) + ) + + def tail(self: Self, n: int = 5) -> ExprT: + r"""Take the last n elements of each string. + + Arguments: + n: Number of elements to take. Negative indexing is **not** supported. + + Returns: + A new expression. + + Notes: + If the length of the string has fewer than `n` characters, the full string is returned. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"lyrics": ["Atatata", "taata", "taatatata", "zukkyun"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_str_tail(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... lyrics_tail=nw.col("lyrics").str.tail() + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_tail`: + + >>> agnostic_str_tail(df_pd) + lyrics lyrics_tail + 0 Atatata atata + 1 taata taata + 2 taatatata atata + 3 zukkyun kkyun + + >>> agnostic_str_tail(df_pl) + shape: (4, 2) + ┌───────────┬─────────────┐ + │ lyrics ┆ lyrics_tail │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞═══════════╪═════════════╡ + │ Atatata ┆ atata │ + │ taata ┆ taata │ + │ taatatata ┆ atata │ + │ zukkyun ┆ kkyun │ + └───────────┴─────────────┘ + + >>> agnostic_str_tail(df_pa) + pyarrow.Table + lyrics: string + lyrics_tail: string + ---- + lyrics: [["Atatata","taata","taatatata","zukkyun"]] + lyrics_tail: [["atata","taata","atata","kkyun"]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).str.slice( + offset=-n, length=None + ) + ) + + def to_datetime(self: Self, format: str | None = None) -> ExprT: # noqa: A002 + """Convert to Datetime dtype. + + Warning: + As different backends auto-infer format in different ways, if `format=None` + there is no guarantee that the result will be equal. + + Arguments: + format: Format to use for conversion. If set to None (default), the format is + inferred from the data. + + Returns: + A new expression. + + Notes: + pandas defaults to nanosecond time unit, Polars to microsecond. + Prior to pandas 2.0, nanoseconds were the only time unit supported + in pandas, with no ability to set any other one. The ability to + set the time unit in pandas, if the version permits, will arrive. 
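One way to neutralise the time-unit difference described in these notes is to cast the parsed column to an explicit unit afterwards; a minimal sketch, assuming `nw.Datetime` accepts a `time_unit` argument and a backend (for example pandas >= 2.0) that supports non-nanosecond units:

    import pandas as pd
    import narwhals as nw

    df = nw.from_native(pd.DataFrame({"a": ["2020-01-01", "2020-01-02"]}))
    # pandas parses to nanoseconds by default; casting pins microseconds,
    # the Polars default, so results agree across backends
    out = df.select(nw.col("a").str.to_datetime(format="%Y-%m-%d").cast(nw.Datetime("us")))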
+ + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = ["2020-01-01", "2020-01-02"] + >>> df_pd = pd.DataFrame({"a": data}) + >>> df_pl = pl.DataFrame({"a": data}) + >>> df_pa = pa.table({"a": data}) + + We define a dataframe-agnostic function: + + >>> def agnostic_str_to_datetime(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.select( + ... nw.col("a").str.to_datetime(format="%Y-%m-%d") + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_to_datetime`: + + >>> agnostic_str_to_datetime(df_pd) + a + 0 2020-01-01 + 1 2020-01-02 + + >>> agnostic_str_to_datetime(df_pl) + shape: (2, 1) + ┌─────────────────────┐ + │ a │ + │ --- │ + │ datetime[μs] │ + ╞═════════════════════╡ + │ 2020-01-01 00:00:00 │ + │ 2020-01-02 00:00:00 │ + └─────────────────────┘ + + >>> agnostic_str_to_datetime(df_pa) + pyarrow.Table + a: timestamp[us] + ---- + a: [[2020-01-01 00:00:00.000000,2020-01-02 00:00:00.000000]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).str.to_datetime(format=format) + ) + + def to_uppercase(self: Self) -> ExprT: + r"""Transform string to uppercase variant. + + Returns: + A new expression. + + Notes: + The PyArrow backend will convert 'ß' to 'ẞ' instead of 'SS'. + For more info see [the related issue](https://github.com/apache/arrow/issues/34599). + There may be other unicode-edge-case-related variations across implementations. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"fruits": ["apple", "mango", None]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_str_to_uppercase(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... upper_col=nw.col("fruits").str.to_uppercase() + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_to_uppercase`: + + >>> agnostic_str_to_uppercase(df_pd) + fruits upper_col + 0 apple APPLE + 1 mango MANGO + 2 None None + + >>> agnostic_str_to_uppercase(df_pl) + shape: (3, 2) + ┌────────┬───────────┐ + │ fruits ┆ upper_col │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞════════╪═══════════╡ + │ apple ┆ APPLE │ + │ mango ┆ MANGO │ + │ null ┆ null │ + └────────┴───────────┘ + + >>> agnostic_str_to_uppercase(df_pa) + pyarrow.Table + fruits: string + upper_col: string + ---- + fruits: [["apple","mango",null]] + upper_col: [["APPLE","MANGO",null]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).str.to_uppercase() + ) + + def to_lowercase(self: Self) -> ExprT: + r"""Transform string to lowercase variant. + + Returns: + A new expression. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"fruits": ["APPLE", "MANGO", None]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a dataframe-agnostic function: + + >>> def agnostic_str_to_lowercase(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... 
return df.with_columns( + ... lower_col=nw.col("fruits").str.to_lowercase() + ... ).to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_to_lowercase`: + + >>> agnostic_str_to_lowercase(df_pd) + fruits lower_col + 0 APPLE apple + 1 MANGO mango + 2 None None + + >>> agnostic_str_to_lowercase(df_pl) + shape: (3, 2) + ┌────────┬───────────┐ + │ fruits ┆ lower_col │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞════════╪═══════════╡ + │ APPLE ┆ apple │ + │ MANGO ┆ mango │ + │ null ┆ null │ + └────────┴───────────┘ + + >>> agnostic_str_to_lowercase(df_pa) + pyarrow.Table + fruits: string + lower_col: string + ---- + fruits: [["APPLE","MANGO",null]] + lower_col: [["apple","mango",null]] + """ + return self._expr.__class__( + lambda plx: self._expr._to_compliant_expr(plx).str.to_lowercase() + ) diff --git a/narwhals/series.py b/narwhals/series.py index 8385b43ad..46ed53abf 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -8,11 +8,14 @@ from typing import Literal from typing import Mapping from typing import Sequence -from typing import TypeVar from typing import overload from narwhals.dependencies import is_numpy_scalar from narwhals.dtypes import _validate_dtype +from narwhals.series_cat import SeriesCatNamespace +from narwhals.series_dt import SeriesDateTimeNamespace +from narwhals.series_list import SeriesListNamespace +from narwhals.series_str import SeriesStringNamespace from narwhals.typing import IntoSeriesT from narwhals.utils import _validate_rolling_arguments from narwhals.utils import generate_repr @@ -4851,2250 +4854,3 @@ def cat(self: Self) -> SeriesCatNamespace[Self]: @property def list(self: Self) -> SeriesListNamespace[Self]: return SeriesListNamespace(self) - - -SeriesT = TypeVar("SeriesT", bound=Series[Any]) - - -class SeriesCatNamespace(Generic[SeriesT]): - def __init__(self: Self, series: SeriesT) -> None: - self._narwhals_series = series - - def get_categories(self: Self) -> SeriesT: - """Get unique categories from column. - - Returns: - A new Series containing the unique categories. - - Examples: - Let's create some series: - - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = ["apple", "mango", "mango"] - >>> s_pd = pd.Series(data, dtype="category") - >>> s_pl = pl.Series(data, dtype=pl.Categorical) - >>> s_pa = pa.chunked_array([data]).dictionary_encode() - - We define a dataframe-agnostic function to get unique categories - from column 'fruits': - - >>> def agnostic_get_categories(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.cat.get_categories().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_get_categories`: - - >>> agnostic_get_categories(s_pd) - 0 apple - 1 mango - dtype: object - - >>> agnostic_get_categories(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [str] - [ - "apple" - "mango" - ] - - >>> agnostic_get_categories(s_pa) # doctest: +ELLIPSIS - - [ - [ - "apple", - "mango" - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.cat.get_categories() - ) - - -class SeriesStringNamespace(Generic[SeriesT]): - def __init__(self: Self, series: SeriesT) -> None: - self._narwhals_series = series - - def len_chars(self: Self) -> SeriesT: - r"""Return the length of each string as the number of characters. 
- - Returns: - A new Series containing the length of each string in characters. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = ["foo", "Café", "345", "東京", None] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a dataframe-agnostic function: - - >>> def agnostic_len_chars(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.str.len_chars().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_len_chars`: - - >>> agnostic_len_chars(s_pd) - 0 3.0 - 1 4.0 - 2 3.0 - 3 2.0 - 4 NaN - dtype: float64 - - >>> agnostic_len_chars(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (5,) - Series: '' [u32] - [ - 3 - 4 - 3 - 2 - null - ] - - >>> agnostic_len_chars(s_pa) # doctest: +ELLIPSIS - - [ - [ - 3, - 4, - 3, - 2, - null - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.str.len_chars() - ) - - def replace( - self: Self, pattern: str, value: str, *, literal: bool = False, n: int = 1 - ) -> SeriesT: - r"""Replace first matching regex/literal substring with a new string value. - - Arguments: - pattern: A valid regular expression pattern. - value: String that will replace the matched substring. - literal: Treat `pattern` as a literal string. - n: Number of matches to replace. - - Returns: - A new Series with the regex/literal pattern replaced with the specified value. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = ["123abc", "abc abc123"] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a dataframe-agnostic function: - - >>> def agnostic_replace(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... s = s.str.replace("abc", "") - ... return s.to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_replace`: - - >>> agnostic_replace(s_pd) - 0 123 - 1 abc123 - dtype: object - - >>> agnostic_replace(s_pl) # doctest:+NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [str] - [ - "123" - " abc123" - ] - - >>> agnostic_replace(s_pa) # doctest: +ELLIPSIS - - [ - [ - "123", - " abc123" - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.str.replace( - pattern, value, literal=literal, n=n - ) - ) - - def replace_all( - self: Self, pattern: str, value: str, *, literal: bool = False - ) -> SeriesT: - r"""Replace all matching regex/literal substring with a new string value. - - Arguments: - pattern: A valid regular expression pattern. - value: String that will replace the matched substring. - literal: Treat `pattern` as a literal string. - - Returns: - A new Series with all occurrences of pattern replaced with the specified value. 
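The contrast with `replace` above is only in how many matches are touched: `replace` substitutes the first `n` matches (one by default), while `replace_all` substitutes every match; a small sketch on a pandas Series:

    import pandas as pd
    import narwhals as nw

    s = nw.from_native(pd.Series(["abc abc abc"]), series_only=True)
    s.str.replace("abc", "x").to_native()      # first match only: "x abc abc"
    s.str.replace_all("abc", "x").to_native()  # every match: "x x x"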
- - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = ["123abc", "abc abc123"] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a dataframe-agnostic function: - - >>> def agnostic_replace_all(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... s = s.str.replace_all("abc", "") - ... return s.to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_replace_all`: - - >>> agnostic_replace_all(s_pd) - 0 123 - 1 123 - dtype: object - - >>> agnostic_replace_all(s_pl) # doctest:+NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [str] - [ - "123" - " 123" - ] - - >>> agnostic_replace_all(s_pa) # doctest: +ELLIPSIS - - [ - [ - "123", - " 123" - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.str.replace_all( - pattern, value, literal=literal - ) - ) - - def strip_chars(self: Self, characters: str | None = None) -> SeriesT: - r"""Remove leading and trailing characters. - - Arguments: - characters: The set of characters to be removed. All combinations of this set of characters will be stripped from the start and end of the string. If set to None (default), all leading and trailing whitespace is removed instead. - - Returns: - A new Series with leading and trailing characters removed. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = ["apple", "\nmango"] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a dataframe-agnostic function: - - >>> def agnostic_strip_chars(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... s = s.str.strip_chars() - ... return s.to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_strip_chars`: - - >>> agnostic_strip_chars(s_pd) - 0 apple - 1 mango - dtype: object - - >>> agnostic_strip_chars(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [str] - [ - "apple" - "mango" - ] - - >>> agnostic_strip_chars(s_pa) # doctest: +ELLIPSIS - - [ - [ - "apple", - "mango" - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.str.strip_chars(characters) - ) - - def starts_with(self: Self, prefix: str) -> SeriesT: - r"""Check if string values start with a substring. - - Arguments: - prefix: prefix substring - - Returns: - A new Series with boolean values indicating if each string starts with the prefix. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = ["apple", "mango", None] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a dataframe-agnostic function: - - >>> def agnostic_starts_with(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... 
return s.str.starts_with("app").to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_starts_with`: - - >>> agnostic_starts_with(s_pd) - 0 True - 1 False - 2 None - dtype: object - - >>> agnostic_starts_with(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (3,) - Series: '' [bool] - [ - true - false - null - ] - - >>> agnostic_starts_with(s_pa) # doctest: +ELLIPSIS - - [ - [ - true, - false, - null - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.str.starts_with(prefix) - ) - - def ends_with(self: Self, suffix: str) -> SeriesT: - r"""Check if string values end with a substring. - - Arguments: - suffix: suffix substring - - Returns: - A new Series with boolean values indicating if each string ends with the suffix. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = ["apple", "mango", None] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a dataframe-agnostic function: - - >>> def agnostic_ends_with(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.str.ends_with("ngo").to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_ends_with`: - - >>> agnostic_ends_with(s_pd) - 0 False - 1 True - 2 None - dtype: object - - >>> agnostic_ends_with(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (3,) - Series: '' [bool] - [ - false - true - null - ] - - >>> agnostic_ends_with(s_pa) # doctest: +ELLIPSIS - - [ - [ - false, - true, - null - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.str.ends_with(suffix) - ) - - def contains(self: Self, pattern: str, *, literal: bool = False) -> SeriesT: - r"""Check if string contains a substring that matches a pattern. - - Arguments: - pattern: A Character sequence or valid regular expression pattern. - literal: If True, treats the pattern as a literal string. - If False, assumes the pattern is a regular expression. - - Returns: - A new Series with boolean values indicating if each string contains the pattern. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = ["cat", "dog", "rabbit and parrot", "dove", None] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a dataframe-agnostic function: - - >>> def agnostic_contains(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... 
return s.str.contains("parrot|dove").to_native() - - We can then pass any supported library such as pandas, Polars, or PyArrow to `agnostic_contains`: - - >>> agnostic_contains(s_pd) - 0 False - 1 False - 2 True - 3 True - 4 None - dtype: object - - >>> agnostic_contains(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (5,) - Series: '' [bool] - [ - false - false - true - true - null - ] - - >>> agnostic_contains(s_pa) # doctest: +ELLIPSIS - - [ - [ - false, - false, - true, - true, - null - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.str.contains(pattern, literal=literal) - ) - - def slice(self: Self, offset: int, length: int | None = None) -> SeriesT: - r"""Create subslices of the string values of a Series. - - Arguments: - offset: Start index. Negative indexing is supported. - length: Length of the slice. If set to `None` (default), the slice is taken to the - end of the string. - - Returns: - A new Series containing subslices of each string. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = ["pear", None, "papaya", "dragonfruit"] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a dataframe-agnostic function: - - >>> def agnostic_slice(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.str.slice(4, length=3).to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_slice`: - - >>> agnostic_slice(s_pd) # doctest: +NORMALIZE_WHITESPACE - 0 - 1 None - 2 ya - 3 onf - dtype: object - - >>> agnostic_slice(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (4,) - Series: '' [str] - [ - "" - null - "ya" - "onf" - ] - - >>> agnostic_slice(s_pa) # doctest: +ELLIPSIS - - [ - [ - "", - null, - "ya", - "onf" - ] - ] - - Using negative indexes: - - >>> def agnostic_slice(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.str.slice(-3).to_native() - - >>> agnostic_slice(s_pd) # doctest: +NORMALIZE_WHITESPACE - 0 ear - 1 None - 2 aya - 3 uit - dtype: object - - >>> agnostic_slice(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (4,) - Series: '' [str] - [ - "ear" - null - "aya" - "uit" - ] - - >>> agnostic_slice(s_pa) # doctest: +ELLIPSIS - - [ - [ - "ear", - null, - "aya", - "uit" - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.str.slice( - offset=offset, length=length - ) - ) - - def head(self: Self, n: int = 5) -> SeriesT: - r"""Take the first n elements of each string. - - Arguments: - n: Number of elements to take. Negative indexing is supported (see note (1.)) - - Returns: - A new Series containing the first n characters of each string. - - Notes: - 1. When the `n` input is negative, `head` returns characters up to the n-th from the end of the string. - For example, if `n = -3`, then all characters except the last three are returned. - 2. If the length of the string has fewer than `n` characters, the full string is returned. 
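Following note (1.) above, a negative `n` drops characters from the end rather than taking them from the front; a quick sketch on a pandas Series:

    import pandas as pd
    import narwhals as nw

    s = nw.from_native(pd.Series(["dragonfruit"]), series_only=True)
    s.str.head(3).to_native()   # first three characters: "dra"
    s.str.head(-3).to_native()  # all but the last three: "dragonfr"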
- - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = ["Atatata", "taata", "taatatata", "zukkyun"] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a dataframe-agnostic function: - - >>> def agnostic_head(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.str.head().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_head`: - - >>> agnostic_head(s_pd) - 0 Atata - 1 taata - 2 taata - 3 zukky - dtype: object - - >>> agnostic_head(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (4,) - Series: '' [str] - [ - "Atata" - "taata" - "taata" - "zukky" - ] - - >>> agnostic_head(s_pa) # doctest: +ELLIPSIS - - [ - [ - "Atata", - "taata", - "taata", - "zukky" - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.str.slice(offset=0, length=n) - ) - - def tail(self: Self, n: int = 5) -> SeriesT: - r"""Take the last n elements of each string. - - Arguments: - n: Number of elements to take. Negative indexing is supported (see note (1.)) - - Returns: - A new Series containing the last n characters of each string. - - Notes: - 1. When the `n` input is negative, `tail` returns characters starting from the n-th from the beginning of - the string. For example, if `n = -3`, then all characters except the first three are returned. - 2. If the length of the string has fewer than `n` characters, the full string is returned. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = ["Atatata", "taata", "taatatata", "zukkyun"] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a dataframe-agnostic function: - - >>> def agnostic_tail(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.str.tail().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_tail`: - - >>> agnostic_tail(s_pd) - 0 atata - 1 taata - 2 atata - 3 kkyun - dtype: object - - >>> agnostic_tail(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (4,) - Series: '' [str] - [ - "atata" - "taata" - "atata" - "kkyun" - ] - - >>> agnostic_tail(s_pa) # doctest: +ELLIPSIS - - [ - [ - "atata", - "taata", - "atata", - "kkyun" - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.str.slice(offset=-n, length=None) - ) - - def to_uppercase(self) -> SeriesT: - r"""Transform string to uppercase variant. - - Returns: - A new Series with values converted to uppercase. - - Notes: - The PyArrow backend will convert 'ß' to 'ẞ' instead of 'SS'. - For more info see: https://github.com/apache/arrow/issues/34599 - There may be other unicode-edge-case-related variations across implementations. 
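The 'ß' caveat in these notes can be seen by comparing backends directly; a minimal sketch, with the expected outputs taken from the note (Python/pandas casing expands 'ß' to 'SS', while PyArrow maps it to 'ẞ'):

    import pandas as pd
    import pyarrow as pa
    import narwhals as nw

    data = ["straße"]
    # pandas follows Python's casing rules: "STRASSE"
    nw.from_native(pd.Series(data), series_only=True).str.to_uppercase().to_native()
    # PyArrow keeps it a single character: "STRAẞE"
    nw.from_native(pa.chunked_array([data]), series_only=True).str.to_uppercase().to_native()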
- - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = ["apple", "mango", None] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a dataframe-agnostic function: - - >>> def agnostic_to_uppercase(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.str.to_uppercase().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_to_uppercase`: - - >>> agnostic_to_uppercase(s_pd) - 0 APPLE - 1 MANGO - 2 None - dtype: object - - >>> agnostic_to_uppercase(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (3,) - Series: '' [str] - [ - "APPLE" - "MANGO" - null - ] - - >>> agnostic_to_uppercase(s_pa) # doctest: +ELLIPSIS - - [ - [ - "APPLE", - "MANGO", - null - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.str.to_uppercase() - ) - - def to_lowercase(self) -> SeriesT: - r"""Transform string to lowercase variant. - - Returns: - A new Series with values converted to lowercase. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = ["APPLE", "MANGO", None] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a dataframe-agnostic function: - - >>> def agnostic_to_lowercase(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.str.to_lowercase().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_to_lowercase`: - - >>> agnostic_to_lowercase(s_pd) - 0 apple - 1 mango - 2 None - dtype: object - - >>> agnostic_to_lowercase(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (3,) - Series: '' [str] - [ - "apple" - "mango" - null - ] - - >>> agnostic_to_lowercase(s_pa) # doctest: +ELLIPSIS - - [ - [ - "apple", - "mango", - null - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.str.to_lowercase() - ) - - def to_datetime(self: Self, format: str | None = None) -> SeriesT: # noqa: A002 - """Parse Series with strings to a Series with Datetime dtype. - - Notes: - pandas defaults to nanosecond time unit, Polars to microsecond. - Prior to pandas 2.0, nanoseconds were the only time unit supported - in pandas, with no ability to set any other one. The ability to - set the time unit in pandas, if the version permits, will arrive. - - Warning: - As different backends auto-infer format in different ways, if `format=None` - there is no guarantee that the result will be equal. - - Arguments: - format: Format to use for conversion. If set to None (default), the format is - inferred from the data. - - Returns: - A new Series with datetime dtype. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = ["2020-01-01", "2020-01-02"] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a dataframe-agnostic function: - - >>> def agnostic_to_datetime(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... 
return s.str.to_datetime(format="%Y-%m-%d").to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_to_datetime`: - - >>> agnostic_to_datetime(s_pd) - 0 2020-01-01 - 1 2020-01-02 - dtype: datetime64[ns] - - >>> agnostic_to_datetime(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [datetime[μs]] - [ - 2020-01-01 00:00:00 - 2020-01-02 00:00:00 - ] - - >>> agnostic_to_datetime(s_pa) # doctest: +ELLIPSIS - - [ - [ - 2020-01-01 00:00:00.000000, - 2020-01-02 00:00:00.000000 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.str.to_datetime(format=format) - ) - - -class SeriesDateTimeNamespace(Generic[SeriesT]): - def __init__(self: Self, series: SeriesT) -> None: - self._narwhals_series = series - - def date(self: Self) -> SeriesT: - """Get the date in a datetime series. - - Returns: - A new Series with the date portion of the datetime values. - - Raises: - NotImplementedError: If pandas default backend is being used. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> dates = [datetime(2012, 1, 7, 10, 20), datetime(2023, 3, 10, 11, 32)] - >>> s_pd = pd.Series(dates).convert_dtypes(dtype_backend="pyarrow") - >>> s_pl = pl.Series(dates) - >>> s_pa = pa.chunked_array([dates]) - - We define a library agnostic function: - - >>> def agnostic_date(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.date().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_date`: - - >>> agnostic_date(s_pd) - 0 2012-01-07 - 1 2023-03-10 - dtype: date32[day][pyarrow] - - >>> agnostic_date(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [date] - [ - 2012-01-07 - 2023-03-10 - ] - - >>> agnostic_date(s_pa) # doctest: +ELLIPSIS - - [ - [ - 2012-01-07, - 2023-03-10 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.date() - ) - - def year(self: Self) -> SeriesT: - """Get the year in a datetime series. - - Returns: - A new Series containing the year component of each datetime value. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> dates = [datetime(2012, 1, 7), datetime(2023, 3, 10)] - >>> s_pd = pd.Series(dates) - >>> s_pl = pl.Series(dates) - >>> s_pa = pa.chunked_array([dates]) - - We define a library agnostic function: - - >>> def agnostic_year(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.year().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_year`: - - >>> agnostic_year(s_pd) - 0 2012 - 1 2023 - dtype: int... - - >>> agnostic_year(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [i32] - [ - 2012 - 2023 - ] - - >>> agnostic_year(s_pa) # doctest: +ELLIPSIS - - [ - [ - 2012, - 2023 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.year() - ) - - def month(self: Self) -> SeriesT: - """Gets the month in a datetime series. 
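Taken together, the accessors in this namespace decompose a datetime Series one field at a time; a minimal sketch combining `year`, `month`, and the `day` accessor documented just below, on a pandas Series:

    from datetime import datetime
    import pandas as pd
    import narwhals as nw

    s = nw.from_native(pd.Series([datetime(2023, 8, 3, 9, 12)]), series_only=True)
    s.dt.year().to_native()   # Series containing 2023
    s.dt.month().to_native()  # Series containing 8
    s.dt.day().to_native()    # Series containing 3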
- - Returns: - A new Series containing the month component of each datetime value. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> dates = [datetime(2023, 2, 1), datetime(2023, 8, 3)] - >>> s_pd = pd.Series(dates) - >>> s_pl = pl.Series(dates) - >>> s_pa = pa.chunked_array([dates]) - - We define a library agnostic function: - - >>> def agnostic_month(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.month().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_month`: - - >>> agnostic_month(s_pd) - 0 2 - 1 8 - dtype: int... - >>> agnostic_month(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [i8] - [ - 2 - 8 - ] - - >>> agnostic_month(s_pa) # doctest: +ELLIPSIS - - [ - [ - 2, - 8 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.month() - ) - - def day(self: Self) -> SeriesT: - """Extracts the day in a datetime series. - - Returns: - A new Series containing the day component of each datetime value. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> dates = [datetime(2022, 1, 1), datetime(2022, 1, 5)] - >>> s_pd = pd.Series(dates) - >>> s_pl = pl.Series(dates) - >>> s_pa = pa.chunked_array([dates]) - - We define a library agnostic function: - - >>> def agnostic_day(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.day().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_day`: - - >>> agnostic_day(s_pd) - 0 1 - 1 5 - dtype: int... - - >>> agnostic_day(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [i8] - [ - 1 - 5 - ] - - >>> agnostic_day(s_pa) # doctest: +ELLIPSIS - - [ - [ - 1, - 5 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.day() - ) - - def hour(self: Self) -> SeriesT: - """Extracts the hour in a datetime series. - - Returns: - A new Series containing the hour component of each datetime value. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> dates = [datetime(2022, 1, 1, 5, 3), datetime(2022, 1, 5, 9, 12)] - >>> s_pd = pd.Series(dates) - >>> s_pl = pl.Series(dates) - >>> s_pa = pa.chunked_array([dates]) - - We define a library agnostic function: - - >>> def agnostic_hour(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.hour().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_hour`: - - >>> agnostic_hour(s_pd) - 0 5 - 1 9 - dtype: int... 
- - >>> agnostic_hour(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [i8] - [ - 5 - 9 - ] - - >>> agnostic_hour(s_pa) # doctest: +ELLIPSIS - - [ - [ - 5, - 9 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.hour() - ) - - def minute(self: Self) -> SeriesT: - """Extracts the minute in a datetime series. - - Returns: - A new Series containing the minute component of each datetime value. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> dates = [datetime(2022, 1, 1, 5, 3), datetime(2022, 1, 5, 9, 12)] - >>> s_pd = pd.Series(dates) - >>> s_pl = pl.Series(dates) - >>> s_pa = pa.chunked_array([dates]) - - We define a library agnostic function: - - >>> def agnostic_minute(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.minute().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_minute`: - - >>> agnostic_minute(s_pd) - 0 3 - 1 12 - dtype: int... - - >>> agnostic_minute(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [i8] - [ - 3 - 12 - ] - - >>> agnostic_minute(s_pa) # doctest: +ELLIPSIS - - [ - [ - 3, - 12 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.minute() - ) - - def second(self: Self) -> SeriesT: - """Extracts the seconds in a datetime series. - - Returns: - A new Series containing the second component of each datetime value. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> dates = [datetime(2022, 1, 1, 5, 3, 10), datetime(2022, 1, 5, 9, 12, 4)] - >>> s_pd = pd.Series(dates) - >>> s_pl = pl.Series(dates) - >>> s_pa = pa.chunked_array([dates]) - - We define a library agnostic function: - - >>> def agnostic_second(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.second().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_second`: - - >>> agnostic_second(s_pd) - 0 10 - 1 4 - dtype: int... - - >>> agnostic_second(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [i8] - [ - 10 - 4 - ] - - >>> agnostic_second(s_pa) # doctest: +ELLIPSIS - - [ - [ - 10, - 4 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.second() - ) - - def millisecond(self: Self) -> SeriesT: - """Extracts the milliseconds in a datetime series. - - Returns: - A new Series containing the millisecond component of each datetime value. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> dates = [ - ... datetime(2023, 5, 21, 12, 55, 10, 400000), - ... datetime(2023, 5, 21, 12, 55, 10, 600000), - ... datetime(2023, 5, 21, 12, 55, 10, 800000), - ... datetime(2023, 5, 21, 12, 55, 11, 0), - ... datetime(2023, 5, 21, 12, 55, 11, 200000), - ... 
] - >>> s_pd = pd.Series(dates) - >>> s_pl = pl.Series(dates) - >>> s_pa = pa.chunked_array([dates]) - - We define a library agnostic function: - - >>> def agnostic_millisecond(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.millisecond().alias("datetime").to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_millisecond`: - - >>> agnostic_millisecond(s_pd) - 0 400 - 1 600 - 2 800 - 3 0 - 4 200 - Name: datetime, dtype: int... - - >>> agnostic_millisecond(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (5,) - Series: 'datetime' [i32] - [ - 400 - 600 - 800 - 0 - 200 - ] - - >>> agnostic_millisecond(s_pa) # doctest: +ELLIPSIS - - [ - [ - 400, - 600, - 800, - 0, - 200 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.millisecond() - ) - - def microsecond(self: Self) -> SeriesT: - """Extracts the microseconds in a datetime series. - - Returns: - A new Series containing the microsecond component of each datetime value. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> dates = [ - ... datetime(2023, 5, 21, 12, 55, 10, 400000), - ... datetime(2023, 5, 21, 12, 55, 10, 600000), - ... datetime(2023, 5, 21, 12, 55, 10, 800000), - ... datetime(2023, 5, 21, 12, 55, 11, 0), - ... datetime(2023, 5, 21, 12, 55, 11, 200000), - ... ] - >>> s_pd = pd.Series(dates) - >>> s_pl = pl.Series(dates) - >>> s_pa = pa.chunked_array([dates]) - - We define a library agnostic function: - - >>> def agnostic_microsecond(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.microsecond().alias("datetime").to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_microsecond`: - - >>> agnostic_microsecond(s_pd) - 0 400000 - 1 600000 - 2 800000 - 3 0 - 4 200000 - Name: datetime, dtype: int... - - >>> agnostic_microsecond(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (5,) - Series: 'datetime' [i32] - [ - 400000 - 600000 - 800000 - 0 - 200000 - ] - - >>> agnostic_microsecond(s_pa) # doctest: +ELLIPSIS - - [ - [ - 400000, - 600000, - 800000, - 0, - 200000 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.microsecond() - ) - - def nanosecond(self: Self) -> SeriesT: - """Extract the nanoseconds in a date series. - - Returns: - A new Series containing the nanosecond component of each datetime value. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> dates = [ - ... datetime(2022, 1, 1, 5, 3, 10, 500000), - ... datetime(2022, 1, 5, 9, 12, 4, 60000), - ... ] - >>> s_pd = pd.Series(dates) - >>> s_pl = pl.Series(dates) - >>> s_pa = pa.chunked_array([dates]) - - We define a library agnostic function: - - >>> def agnostic_nanosecond(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.nanosecond().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_nanosecond`: - - >>> agnostic_nanosecond(s_pd) - 0 500000000 - 1 60000000 - dtype: int... 
- - >>> agnostic_nanosecond(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [i32] - [ - 500000000 - 60000000 - ] - - >>> agnostic_nanosecond(s_pa) # doctest: +ELLIPSIS - - [ - [ - 500000000, - 60000000 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.nanosecond() - ) - - def ordinal_day(self: Self) -> SeriesT: - """Get ordinal day. - - Returns: - A new Series containing the ordinal day (day of year) for each datetime value. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = [datetime(2020, 1, 1), datetime(2020, 8, 3)] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a library agnostic function: - - >>> def agnostic_ordinal_day(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.ordinal_day().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_ordinal_day`: - - >>> agnostic_ordinal_day(s_pd) - 0 1 - 1 216 - dtype: int32 - - >>> agnostic_ordinal_day(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [i16] - [ - 1 - 216 - ] - - - >>> agnostic_ordinal_day(s_pa) # doctest: +ELLIPSIS - - [ - [ - 1, - 216 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.ordinal_day() - ) - - def weekday(self: Self) -> SeriesT: - """Extract the week day in a datetime series. - - Returns: - A new Series containing the week day for each datetime value. - Returns the ISO weekday number where monday = 1 and sunday = 7 - - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - >>> data = [datetime(2020, 1, 1), datetime(2020, 8, 3)] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a library agnostic function: - - >>> def agnostic_weekday(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.weekday().to_native() - - We can then pass either pandas, Polars, PyArrow, and other supported libraries to `agnostic_weekday`: - - >>> agnostic_weekday(s_pd) - 0 3 - 1 1 - dtype: int32 - >>> agnostic_weekday(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [i8] - [ - 3 - 1 - ] - >>> agnostic_weekday(s_pa) # doctest: +ELLIPSIS - - [ - [ - 3, - 1 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.weekday() - ) - - def total_minutes(self: Self) -> SeriesT: - """Get total minutes. - - Notes: - The function outputs the total minutes in the int dtype by default, - however, pandas may change the dtype to float when there are missing values, - consider using `fill_null()` in this case. - - Returns: - A new Series containing the total number of minutes for each timedelta value. 
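The pandas caveat in the note above is easiest to see with an actual missing value. A minimal sketch, assuming `fill_null` accepts a `datetime.timedelta` scalar for duration columns:

```python
from datetime import timedelta

import pandas as pd
import narwhals as nw

s = nw.from_native(pd.Series([timedelta(minutes=10), None]), series_only=True)

# With a missing value present, pandas is expected to return float64 (10.0, NaN)...
print(s.dt.total_minutes().to_native().dtype)
# ...while filling the null first should keep the integer dtype.
print(s.fill_null(timedelta(0)).dt.total_minutes().to_native().dtype)
```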
- - Examples: - >>> from datetime import timedelta - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = [timedelta(minutes=10), timedelta(minutes=20, seconds=40)] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a library agnostic function: - - >>> def agnostic_total_minutes(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.total_minutes().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_total_minutes`: - - >>> agnostic_total_minutes(s_pd) - 0 10 - 1 20 - dtype: int... - - >>> agnostic_total_minutes(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [i64] - [ - 10 - 20 - ] - - >>> agnostic_total_minutes(s_pa) # doctest: +ELLIPSIS - - [ - [ - 10, - 20 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.total_minutes() - ) - - def total_seconds(self: Self) -> SeriesT: - """Get total seconds. - - Notes: - The function outputs the total seconds in the int dtype by default, - however, pandas may change the dtype to float when there are missing values, - consider using `fill_null()` in this case. - - Returns: - A new Series containing the total number of seconds for each timedelta value. - - Examples: - >>> from datetime import timedelta - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = [timedelta(seconds=10), timedelta(seconds=20, milliseconds=40)] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a library agnostic function: - - >>> def agnostic_total_seconds(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.total_seconds().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_total_seconds`: - - >>> agnostic_total_seconds(s_pd) - 0 10 - 1 20 - dtype: int... - - >>> agnostic_total_seconds(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [i64] - [ - 10 - 20 - ] - - >>> agnostic_total_seconds(s_pa) # doctest: +ELLIPSIS - - [ - [ - 10, - 20 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.total_seconds() - ) - - def total_milliseconds(self: Self) -> SeriesT: - """Get total milliseconds. - - Notes: - The function outputs the total milliseconds in the int dtype by default, - however, pandas may change the dtype to float when there are missing values, - consider using `fill_null()` in this case. - - Returns: - A new Series containing the total number of milliseconds for each timedelta value. - - Examples: - >>> from datetime import timedelta - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = [ - ... timedelta(milliseconds=10), - ... timedelta(milliseconds=20, microseconds=40), - ... ] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a library agnostic function: - - >>> def agnostic_total_milliseconds(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... 
return s.dt.total_milliseconds().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_total_milliseconds`: - - >>> agnostic_total_milliseconds(s_pd) - 0 10 - 1 20 - dtype: int... - - >>> agnostic_total_milliseconds(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [i64] - [ - 10 - 20 - ] - - >>> agnostic_total_milliseconds(s_pa) # doctest: +ELLIPSIS - - [ - [ - 10, - 20 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.total_milliseconds() - ) - - def total_microseconds(self: Self) -> SeriesT: - """Get total microseconds. - - Returns: - A new Series containing the total number of microseconds for each timedelta value. - - Notes: - The function outputs the total microseconds in the int dtype by default, - however, pandas may change the dtype to float when there are missing values, - consider using `fill_null()` in this case. - - Examples: - >>> from datetime import timedelta - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = [ - ... timedelta(microseconds=10), - ... timedelta(milliseconds=1, microseconds=200), - ... ] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a library agnostic function: - - >>> def agnostic_total_microseconds(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.total_microseconds().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_total_microseconds`: - - >>> agnostic_total_microseconds(s_pd) - 0 10 - 1 1200 - dtype: int... - - >>> agnostic_total_microseconds(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [i64] - [ - 10 - 1200 - ] - - >>> agnostic_total_microseconds(s_pa) # doctest: +ELLIPSIS - - [ - [ - 10, - 1200 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.total_microseconds() - ) - - def total_nanoseconds(self: Self) -> SeriesT: - """Get total nanoseconds. - - Notes: - The function outputs the total nanoseconds in the int dtype by default, - however, pandas may change the dtype to float when there are missing values, - consider using `fill_null()` in this case. - - Returns: - A new Series containing the total number of nanoseconds for each timedelta value. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = ["2024-01-01 00:00:00.000000001", "2024-01-01 00:00:00.000000002"] - >>> s_pd = pd.to_datetime(pd.Series(data)) - >>> s_pl = pl.Series(data).str.to_datetime(time_unit="ns") - - We define a library agnostic function: - - >>> def agnostic_total_nanoseconds(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... 
return s.diff().dt.total_nanoseconds().to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_total_nanoseconds`: - - >>> agnostic_total_nanoseconds(s_pd) - 0 NaN - 1 1.0 - dtype: float64 - - >>> agnostic_total_nanoseconds(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [i64] - [ - null - 1 - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.total_nanoseconds() - ) - - def to_string(self: Self, format: str) -> SeriesT: # noqa: A002 - """Convert a Date/Time/Datetime series into a String series with the given format. - - Arguments: - format: Format string for converting the datetime to string. - - Returns: - A new Series with the datetime values formatted as strings according to the specified format. - - Notes: - Unfortunately, different libraries interpret format directives a bit - differently. - - - Chrono, the library used by Polars, uses `"%.f"` for fractional seconds, - whereas pandas and Python stdlib use `".%f"`. - - PyArrow interprets `"%S"` as "seconds, including fractional seconds" - whereas most other tools interpret it as "just seconds, as 2 digits". - - Therefore, we make the following adjustments: - - - for pandas-like libraries, we replace `"%S.%f"` with `"%S%.f"`. - - for PyArrow, we replace `"%S.%f"` with `"%S"`. - - Workarounds like these don't make us happy, and we try to avoid them as - much as possible, but here we feel like it's the best compromise. - - If you just want to format a date/datetime Series as a local datetime - string, and have it work as consistently as possible across libraries, - we suggest using: - - - `"%Y-%m-%dT%H:%M:%S%.f"` for datetimes - - `"%Y-%m-%d"` for dates - - though note that, even then, different tools may return a different number - of trailing zeros. Nonetheless, this is probably consistent enough for - most applications. - - If you have an application where this is not enough, please open an issue - and let us know. - - Examples: - >>> from datetime import datetime - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = [ - ... datetime(2020, 3, 1), - ... datetime(2020, 4, 1), - ... datetime(2020, 5, 1), - ... ] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - We define a dataframe-agnostic function: - - >>> def agnostic_to_string(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.to_string("%Y/%m/%d").to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_to_string`: - - >>> agnostic_to_string(s_pd) - 0 2020/03/01 - 1 2020/04/01 - 2 2020/05/01 - dtype: object - - >>> agnostic_to_string(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (3,) - Series: '' [str] - [ - "2020/03/01" - "2020/04/01" - "2020/05/01" - ] - - >>> agnostic_to_string(s_pa) # doctest: +ELLIPSIS - - [ - [ - "2020/03/01", - "2020/04/01", - "2020/05/01" - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.to_string(format) - ) - - def replace_time_zone(self: Self, time_zone: str | None) -> SeriesT: - """Replace time zone. - - Arguments: - time_zone: Target time zone. - - Returns: - A new Series with the specified time zone. 
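Because this method is easy to confuse with `convert_time_zone` below, here is a side-by-side sketch on a single UTC value, consistent with the doctest outputs in both methods:

```python
from datetime import datetime, timezone

import polars as pl
import narwhals as nw

s = nw.from_native(
    pl.Series([datetime(2024, 1, 1, tzinfo=timezone.utc)]), series_only=True
)

# replace_time_zone: same wall-clock reading, new zone label (a different instant).
print(s.dt.replace_time_zone("Asia/Kathmandu").to_native())  # 00:00:00 +0545
# convert_time_zone: same instant, shifted wall-clock reading.
print(s.dt.convert_time_zone("Asia/Kathmandu").to_native())  # 05:45:00 +0545
```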
- - Examples: - >>> from datetime import datetime, timezone - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = [ - ... datetime(2024, 1, 1, tzinfo=timezone.utc), - ... datetime(2024, 1, 2, tzinfo=timezone.utc), - ... ] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - Let's define a dataframe-agnostic function: - - >>> def agnostic_replace_time_zone(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.replace_time_zone("Asia/Kathmandu").to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_replace_time_zone`: - - >>> agnostic_replace_time_zone(s_pd) - 0 2024-01-01 00:00:00+05:45 - 1 2024-01-02 00:00:00+05:45 - dtype: datetime64[ns, Asia/Kathmandu] - - >>> agnostic_replace_time_zone(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [datetime[μs, Asia/Kathmandu]] - [ - 2024-01-01 00:00:00 +0545 - 2024-01-02 00:00:00 +0545 - ] - - >>> agnostic_replace_time_zone(s_pa) - - [ - [ - 2023-12-31 18:15:00.000000Z, - 2024-01-01 18:15:00.000000Z - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.replace_time_zone(time_zone) - ) - - def convert_time_zone(self: Self, time_zone: str) -> SeriesT: - """Convert time zone. - - If converting from a time-zone-naive column, then conversion happens - as if converting from UTC. - - Arguments: - time_zone: Target time zone. - - Returns: - A new Series with the specified time zone. - - Examples: - >>> from datetime import datetime, timezone - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = [ - ... datetime(2024, 1, 1, tzinfo=timezone.utc), - ... datetime(2024, 1, 2, tzinfo=timezone.utc), - ... ] - >>> s_pd = pd.Series(data) - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - Let's define a dataframe-agnostic function: - - >>> def agnostic_convert_time_zone(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.convert_time_zone("Asia/Kathmandu").to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_convert_time_zone`: - - >>> agnostic_convert_time_zone(s_pd) - 0 2024-01-01 05:45:00+05:45 - 1 2024-01-02 05:45:00+05:45 - dtype: datetime64[ns, Asia/Kathmandu] - - >>> agnostic_convert_time_zone(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (2,) - Series: '' [datetime[μs, Asia/Kathmandu]] - [ - 2024-01-01 05:45:00 +0545 - 2024-01-02 05:45:00 +0545 - ] - - >>> agnostic_convert_time_zone(s_pa) - - [ - [ - 2024-01-01 00:00:00.000000Z, - 2024-01-02 00:00:00.000000Z - ] - ] - """ - if time_zone is None: - msg = "Target `time_zone` cannot be `None` in `convert_time_zone`. Please use `replace_time_zone(None)` if you want to remove the time zone." - raise TypeError(msg) - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.convert_time_zone(time_zone) - ) - - def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> SeriesT: - """Return a timestamp in the given time unit. - - Arguments: - time_unit: {'ns', 'us', 'ms'} - Time unit. - - Returns: - A new Series with timestamps in the specified time unit. 
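The figures in the example below can be sanity-checked with the standard library alone; a short sketch of how the time units relate:

```python
from datetime import datetime, timezone

ms = 978_307_200_000  # timestamp("ms") of 2001-01-01, per the example below

# Dividing by 1_000 recovers seconds, and the instant round-trips:
print(datetime.fromtimestamp(ms / 1_000, tz=timezone.utc))  # 2001-01-01 00:00:00+00:00
# With time_unit="us", the same instant is simply 1_000x larger:
print(ms * 1_000)  # 978307200000000
```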
- - Examples: - >>> from datetime import date - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = [date(2001, 1, 1), None, date(2001, 1, 3)] - >>> s_pd = pd.Series(data, dtype="datetime64[ns]") - >>> s_pl = pl.Series(data) - >>> s_pa = pa.chunked_array([data]) - - Let's define a dataframe-agnostic function: - - >>> def agnostic_timestamp(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.dt.timestamp("ms").to_native() - - We can then pass any supported library such as pandas, Polars, or - PyArrow to `agnostic_timestamp`: - - >>> agnostic_timestamp(s_pd) - 0 9.783072e+11 - 1 NaN - 2 9.784800e+11 - dtype: float64 - - >>> agnostic_timestamp(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (3,) - Series: '' [i64] - [ - 978307200000 - null - 978480000000 - ] - - >>> agnostic_timestamp(s_pa) - - [ - [ - 978307200000, - null, - 978480000000 - ] - ] - """ - if time_unit not in {"ns", "us", "ms"}: - msg = ( - "invalid `time_unit`" - f"\n\nExpected one of {{'ns', 'us', 'ms'}}, got {time_unit!r}." - ) - raise ValueError(msg) - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.dt.timestamp(time_unit) - ) - - -class SeriesListNamespace(Generic[SeriesT]): - def __init__(self: Self, series: SeriesT) -> None: - self._narwhals_series = series - - def len(self: Self) -> SeriesT: - """Return the number of elements in each list. - - Null values count towards the total. - - Returns: - A new series. - - Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import pyarrow as pa - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - - >>> data = [[1, 2], [3, 4, None], None, []] - - Let's define a dataframe-agnostic function: - - >>> def agnostic_list_len(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.list.len().to_native() - - We can then pass pandas / PyArrow / Polars / any other supported library: - - >>> agnostic_list_len( - ... pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - ... ) # doctest: +SKIP - 0 2 - 1 3 - 2 - 3 0 - dtype: int32[pyarrow] - - >>> agnostic_list_len(pl.Series(data)) # doctest: +NORMALIZE_WHITESPACE - shape: (4,) - Series: '' [u32] - [ - 2 - 3 - null - 0 - ] - - >>> agnostic_list_len(pa.chunked_array([data])) # doctest: +ELLIPSIS - - [ - [ - 2, - 3, - null, - 0 - ] - ] - """ - return self._narwhals_series._from_compliant_series( - self._narwhals_series._compliant_series.list.len() - ) diff --git a/narwhals/series_cat.py b/narwhals/series_cat.py new file mode 100644 index 000000000..73f899d13 --- /dev/null +++ b/narwhals/series_cat.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Any +from typing import Generic +from typing import TypeVar + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals.series import Series + +SeriesT = TypeVar("SeriesT", bound="Series[Any]") + + +class SeriesCatNamespace(Generic[SeriesT]): + def __init__(self: Self, series: SeriesT) -> None: + self._narwhals_series = series + + def get_categories(self: Self) -> SeriesT: + """Get unique categories from column. + + Returns: + A new Series containing the unique categories. 
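One hedge worth making explicit: the order in which categories come back is backend-defined (pandas, Polars, and PyArrow each track categories differently), so sorting first makes cross-backend comparisons deterministic. A small sketch:

```python
import pandas as pd
import narwhals as nw

s = nw.from_native(
    pd.Series(["mango", "apple", "mango"], dtype="category"), series_only=True
)

# Sorting makes the result order-independent across backends.
print(s.cat.get_categories().sort().to_list())  # ['apple', 'mango']
```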
+
+        Examples:
+            Let's create some series:
+
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
+            >>> data = ["apple", "mango", "mango"]
+            >>> s_pd = pd.Series(data, dtype="category")
+            >>> s_pl = pl.Series(data, dtype=pl.Categorical)
+            >>> s_pa = pa.chunked_array([data]).dictionary_encode()
+
+            We define a dataframe-agnostic function to get the unique
+            categories:
+
+            >>> def agnostic_get_categories(s_native: IntoSeriesT) -> IntoSeriesT:
+            ...     s = nw.from_native(s_native, series_only=True)
+            ...     return s.cat.get_categories().to_native()
+
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_get_categories`:
+
+            >>> agnostic_get_categories(s_pd)
+            0    apple
+            1    mango
+            dtype: object
+
+            >>> agnostic_get_categories(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (2,)
+            Series: '' [str]
+            [
+               "apple"
+               "mango"
+            ]
+
+            >>> agnostic_get_categories(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                "apple",
+                "mango"
+              ]
+            ]
+        """
+        return self._narwhals_series._from_compliant_series(
+            self._narwhals_series._compliant_series.cat.get_categories()
+        )
diff --git a/narwhals/series_dt.py b/narwhals/series_dt.py
new file mode 100644
index 000000000..5fea4ff5c
--- /dev/null
+++ b/narwhals/series_dt.py
@@ -0,0 +1,1280 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from typing import Any
+from typing import Generic
+from typing import Literal
+from typing import TypeVar
+
+if TYPE_CHECKING:
+    from typing_extensions import Self
+
+    from narwhals.series import Series
+
+SeriesT = TypeVar("SeriesT", bound="Series[Any]")
+
+
+class SeriesDateTimeNamespace(Generic[SeriesT]):
+    def __init__(self: Self, series: SeriesT) -> None:
+        self._narwhals_series = series
+
+    def date(self: Self) -> SeriesT:
+        """Get the date in a datetime series.
+
+        Returns:
+            A new Series with the date portion of the datetime values.
+
+        Raises:
+            NotImplementedError: If pandas default backend is being used.
+
+        Examples:
+            >>> from datetime import datetime
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
+            >>> dates = [datetime(2012, 1, 7, 10, 20), datetime(2023, 3, 10, 11, 32)]
+            >>> s_pd = pd.Series(dates).convert_dtypes(dtype_backend="pyarrow")
+            >>> s_pl = pl.Series(dates)
+            >>> s_pa = pa.chunked_array([dates])
+
+            We define a library agnostic function:
+
+            >>> def agnostic_date(s_native: IntoSeriesT) -> IntoSeriesT:
+            ...     s = nw.from_native(s_native, series_only=True)
+            ...     return s.dt.date().to_native()
+
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_date`:
+
+            >>> agnostic_date(s_pd)
+            0    2012-01-07
+            1    2023-03-10
+            dtype: date32[day][pyarrow]
+
+            >>> agnostic_date(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (2,)
+            Series: '' [date]
+            [
+               2012-01-07
+               2023-03-10
+            ]
+
+            >>> agnostic_date(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                2012-01-07,
+                2023-03-10
+              ]
+            ]
+        """
+        return self._narwhals_series._from_compliant_series(
+            self._narwhals_series._compliant_series.dt.date()
+        )
+
+    def year(self: Self) -> SeriesT:
+        """Get the year in a datetime series.
+
+        Returns:
+            A new Series containing the year component of each datetime value.
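A small illustrative sketch: the component getters below return plain integer Series, so they compose with ordinary comparisons and filtering:

```python
from datetime import datetime

import polars as pl
import narwhals as nw

s = nw.from_native(
    pl.Series([datetime(2012, 1, 7), datetime(2023, 3, 10)]), series_only=True
)

# Keep only the 2023 rows, using the year component as a boolean mask.
print(s.filter(s.dt.year() == 2023).to_native())
```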
+
+        Examples:
+            >>> from datetime import datetime
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
+            >>> dates = [datetime(2012, 1, 7), datetime(2023, 3, 10)]
+            >>> s_pd = pd.Series(dates)
+            >>> s_pl = pl.Series(dates)
+            >>> s_pa = pa.chunked_array([dates])
+
+            We define a library agnostic function:
+
+            >>> def agnostic_year(s_native: IntoSeriesT) -> IntoSeriesT:
+            ...     s = nw.from_native(s_native, series_only=True)
+            ...     return s.dt.year().to_native()
+
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_year`:
+
+            >>> agnostic_year(s_pd)
+            0    2012
+            1    2023
+            dtype: int...
+
+            >>> agnostic_year(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (2,)
+            Series: '' [i32]
+            [
+               2012
+               2023
+            ]
+
+            >>> agnostic_year(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                2012,
+                2023
+              ]
+            ]
+        """
+        return self._narwhals_series._from_compliant_series(
+            self._narwhals_series._compliant_series.dt.year()
+        )
+
+    def month(self: Self) -> SeriesT:
+        """Gets the month in a datetime series.
+
+        Returns:
+            A new Series containing the month component of each datetime value.
+
+        Examples:
+            >>> from datetime import datetime
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
+            >>> dates = [datetime(2023, 2, 1), datetime(2023, 8, 3)]
+            >>> s_pd = pd.Series(dates)
+            >>> s_pl = pl.Series(dates)
+            >>> s_pa = pa.chunked_array([dates])
+
+            We define a library agnostic function:
+
+            >>> def agnostic_month(s_native: IntoSeriesT) -> IntoSeriesT:
+            ...     s = nw.from_native(s_native, series_only=True)
+            ...     return s.dt.month().to_native()
+
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_month`:
+
+            >>> agnostic_month(s_pd)
+            0    2
+            1    8
+            dtype: int...
+
+            >>> agnostic_month(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (2,)
+            Series: '' [i8]
+            [
+               2
+               8
+            ]
+
+            >>> agnostic_month(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                2,
+                8
+              ]
+            ]
+        """
+        return self._narwhals_series._from_compliant_series(
+            self._narwhals_series._compliant_series.dt.month()
+        )
+
+    def day(self: Self) -> SeriesT:
+        """Extracts the day in a datetime series.
+
+        Returns:
+            A new Series containing the day component of each datetime value.
+
+        Examples:
+            >>> from datetime import datetime
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
+            >>> dates = [datetime(2022, 1, 1), datetime(2022, 1, 5)]
+            >>> s_pd = pd.Series(dates)
+            >>> s_pl = pl.Series(dates)
+            >>> s_pa = pa.chunked_array([dates])
+
+            We define a library agnostic function:
+
+            >>> def agnostic_day(s_native: IntoSeriesT) -> IntoSeriesT:
+            ...     s = nw.from_native(s_native, series_only=True)
+            ...     return s.dt.day().to_native()
+
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_day`:
+
+            >>> agnostic_day(s_pd)
+            0    1
+            1    5
+            dtype: int...
+
+            >>> agnostic_day(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (2,)
+            Series: '' [i8]
+            [
+               1
+               5
+            ]
+
+            >>> agnostic_day(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                1,
+                5
+              ]
+            ]
+        """
+        return self._narwhals_series._from_compliant_series(
+            self._narwhals_series._compliant_series.dt.day()
+        )
+
+    def hour(self: Self) -> SeriesT:
+        """Extracts the hour in a datetime series.
+ + Returns: + A new Series containing the hour component of each datetime value. + + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + + >>> dates = [datetime(2022, 1, 1, 5, 3), datetime(2022, 1, 5, 9, 12)] + >>> s_pd = pd.Series(dates) + >>> s_pl = pl.Series(dates) + >>> s_pa = pa.chunked_array([dates]) + + We define a library agnostic function: + + >>> def agnostic_hour(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.dt.hour().to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_hour`: + + >>> agnostic_hour(s_pd) + 0 5 + 1 9 + dtype: int... + + >>> agnostic_hour(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (2,) + Series: '' [i8] + [ + 5 + 9 + ] + + >>> agnostic_hour(s_pa) # doctest: +ELLIPSIS + + [ + [ + 5, + 9 + ] + ] + """ + return self._narwhals_series._from_compliant_series( + self._narwhals_series._compliant_series.dt.hour() + ) + + def minute(self: Self) -> SeriesT: + """Extracts the minute in a datetime series. + + Returns: + A new Series containing the minute component of each datetime value. + + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + + >>> dates = [datetime(2022, 1, 1, 5, 3), datetime(2022, 1, 5, 9, 12)] + >>> s_pd = pd.Series(dates) + >>> s_pl = pl.Series(dates) + >>> s_pa = pa.chunked_array([dates]) + + We define a library agnostic function: + + >>> def agnostic_minute(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.dt.minute().to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_minute`: + + >>> agnostic_minute(s_pd) + 0 3 + 1 12 + dtype: int... + + >>> agnostic_minute(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (2,) + Series: '' [i8] + [ + 3 + 12 + ] + + >>> agnostic_minute(s_pa) # doctest: +ELLIPSIS + + [ + [ + 3, + 12 + ] + ] + """ + return self._narwhals_series._from_compliant_series( + self._narwhals_series._compliant_series.dt.minute() + ) + + def second(self: Self) -> SeriesT: + """Extracts the seconds in a datetime series. + + Returns: + A new Series containing the second component of each datetime value. + + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + + >>> dates = [datetime(2022, 1, 1, 5, 3, 10), datetime(2022, 1, 5, 9, 12, 4)] + >>> s_pd = pd.Series(dates) + >>> s_pl = pl.Series(dates) + >>> s_pa = pa.chunked_array([dates]) + + We define a library agnostic function: + + >>> def agnostic_second(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.dt.second().to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_second`: + + >>> agnostic_second(s_pd) + 0 10 + 1 4 + dtype: int... 
+ + >>> agnostic_second(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (2,) + Series: '' [i8] + [ + 10 + 4 + ] + + >>> agnostic_second(s_pa) # doctest: +ELLIPSIS + + [ + [ + 10, + 4 + ] + ] + """ + return self._narwhals_series._from_compliant_series( + self._narwhals_series._compliant_series.dt.second() + ) + + def millisecond(self: Self) -> SeriesT: + """Extracts the milliseconds in a datetime series. + + Returns: + A new Series containing the millisecond component of each datetime value. + + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + + >>> dates = [ + ... datetime(2023, 5, 21, 12, 55, 10, 400000), + ... datetime(2023, 5, 21, 12, 55, 10, 600000), + ... datetime(2023, 5, 21, 12, 55, 10, 800000), + ... datetime(2023, 5, 21, 12, 55, 11, 0), + ... datetime(2023, 5, 21, 12, 55, 11, 200000), + ... ] + >>> s_pd = pd.Series(dates) + >>> s_pl = pl.Series(dates) + >>> s_pa = pa.chunked_array([dates]) + + We define a library agnostic function: + + >>> def agnostic_millisecond(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.dt.millisecond().alias("datetime").to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_millisecond`: + + >>> agnostic_millisecond(s_pd) + 0 400 + 1 600 + 2 800 + 3 0 + 4 200 + Name: datetime, dtype: int... + + >>> agnostic_millisecond(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (5,) + Series: 'datetime' [i32] + [ + 400 + 600 + 800 + 0 + 200 + ] + + >>> agnostic_millisecond(s_pa) # doctest: +ELLIPSIS + + [ + [ + 400, + 600, + 800, + 0, + 200 + ] + ] + """ + return self._narwhals_series._from_compliant_series( + self._narwhals_series._compliant_series.dt.millisecond() + ) + + def microsecond(self: Self) -> SeriesT: + """Extracts the microseconds in a datetime series. + + Returns: + A new Series containing the microsecond component of each datetime value. + + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + + >>> dates = [ + ... datetime(2023, 5, 21, 12, 55, 10, 400000), + ... datetime(2023, 5, 21, 12, 55, 10, 600000), + ... datetime(2023, 5, 21, 12, 55, 10, 800000), + ... datetime(2023, 5, 21, 12, 55, 11, 0), + ... datetime(2023, 5, 21, 12, 55, 11, 200000), + ... ] + >>> s_pd = pd.Series(dates) + >>> s_pl = pl.Series(dates) + >>> s_pa = pa.chunked_array([dates]) + + We define a library agnostic function: + + >>> def agnostic_microsecond(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.dt.microsecond().alias("datetime").to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_microsecond`: + + >>> agnostic_microsecond(s_pd) + 0 400000 + 1 600000 + 2 800000 + 3 0 + 4 200000 + Name: datetime, dtype: int... 
+
+            >>> agnostic_microsecond(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (5,)
+            Series: 'datetime' [i32]
+            [
+               400000
+               600000
+               800000
+               0
+               200000
+            ]
+
+            >>> agnostic_microsecond(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                400000,
+                600000,
+                800000,
+                0,
+                200000
+              ]
+            ]
+        """
+        return self._narwhals_series._from_compliant_series(
+            self._narwhals_series._compliant_series.dt.microsecond()
+        )
+
+    def nanosecond(self: Self) -> SeriesT:
+        """Extract the nanoseconds in a datetime series.
+
+        Returns:
+            A new Series containing the nanosecond component of each datetime value.
+
+        Examples:
+            >>> from datetime import datetime
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
+            >>> dates = [
+            ...     datetime(2022, 1, 1, 5, 3, 10, 500000),
+            ...     datetime(2022, 1, 5, 9, 12, 4, 60000),
+            ... ]
+            >>> s_pd = pd.Series(dates)
+            >>> s_pl = pl.Series(dates)
+            >>> s_pa = pa.chunked_array([dates])
+
+            We define a library agnostic function:
+
+            >>> def agnostic_nanosecond(s_native: IntoSeriesT) -> IntoSeriesT:
+            ...     s = nw.from_native(s_native, series_only=True)
+            ...     return s.dt.nanosecond().to_native()
+
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_nanosecond`:
+
+            >>> agnostic_nanosecond(s_pd)
+            0    500000000
+            1     60000000
+            dtype: int...
+
+            >>> agnostic_nanosecond(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (2,)
+            Series: '' [i32]
+            [
+               500000000
+               60000000
+            ]
+
+            >>> agnostic_nanosecond(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                500000000,
+                60000000
+              ]
+            ]
+        """
+        return self._narwhals_series._from_compliant_series(
+            self._narwhals_series._compliant_series.dt.nanosecond()
+        )
+
+    def ordinal_day(self: Self) -> SeriesT:
+        """Get ordinal day.
+
+        Returns:
+            A new Series containing the ordinal day (day of year) for each datetime value.
+
+        Examples:
+            >>> from datetime import datetime
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
+            >>> data = [datetime(2020, 1, 1), datetime(2020, 8, 3)]
+            >>> s_pd = pd.Series(data)
+            >>> s_pl = pl.Series(data)
+            >>> s_pa = pa.chunked_array([data])
+
+            We define a library agnostic function:
+
+            >>> def agnostic_ordinal_day(s_native: IntoSeriesT) -> IntoSeriesT:
+            ...     s = nw.from_native(s_native, series_only=True)
+            ...     return s.dt.ordinal_day().to_native()
+
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_ordinal_day`:
+
+            >>> agnostic_ordinal_day(s_pd)
+            0      1
+            1    216
+            dtype: int32
+
+            >>> agnostic_ordinal_day(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (2,)
+            Series: '' [i16]
+            [
+               1
+               216
+            ]
+
+            >>> agnostic_ordinal_day(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                1,
+                216
+              ]
+            ]
+        """
+        return self._narwhals_series._from_compliant_series(
+            self._narwhals_series._compliant_series.dt.ordinal_day()
+        )
+
+    def weekday(self: Self) -> SeriesT:
+        """Extract the week day in a datetime series.
+
+        Returns:
+            A new Series containing the ISO weekday number for each datetime
+            value, where Monday = 1 and Sunday = 7.
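For orientation, this ISO convention differs from pandas' native `Series.dt.weekday`, which counts Monday as 0, so the two results are offset by one. A quick sketch:

```python
from datetime import datetime

import pandas as pd
import narwhals as nw

native = pd.Series([datetime(2020, 1, 1)])  # a Wednesday

print(native.dt.weekday[0])  # 2: pandas counts Monday as 0
print(nw.from_native(native, series_only=True).dt.weekday()[0])  # 3: ISO, Monday is 1
```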
+
+        Examples:
+            >>> from datetime import datetime
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
+            >>> data = [datetime(2020, 1, 1), datetime(2020, 8, 3)]
+            >>> s_pd = pd.Series(data)
+            >>> s_pl = pl.Series(data)
+            >>> s_pa = pa.chunked_array([data])
+
+            We define a library agnostic function:
+
+            >>> def agnostic_weekday(s_native: IntoSeriesT) -> IntoSeriesT:
+            ...     s = nw.from_native(s_native, series_only=True)
+            ...     return s.dt.weekday().to_native()
+
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_weekday`:
+
+            >>> agnostic_weekday(s_pd)
+            0    3
+            1    1
+            dtype: int32
+
+            >>> agnostic_weekday(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (2,)
+            Series: '' [i8]
+            [
+               3
+               1
+            ]
+
+            >>> agnostic_weekday(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                3,
+                1
+              ]
+            ]
+        """
+        return self._narwhals_series._from_compliant_series(
+            self._narwhals_series._compliant_series.dt.weekday()
+        )
+
+    def total_minutes(self: Self) -> SeriesT:
+        """Get total minutes.
+
+        Notes:
+            The function outputs the total minutes in the int dtype by default;
+            however, pandas may change the dtype to float when there are missing
+            values. Consider using `fill_null()` in this case.
+
+        Returns:
+            A new Series containing the total number of minutes for each timedelta value.
+
+        Examples:
+            >>> from datetime import timedelta
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
+            >>> data = [timedelta(minutes=10), timedelta(minutes=20, seconds=40)]
+            >>> s_pd = pd.Series(data)
+            >>> s_pl = pl.Series(data)
+            >>> s_pa = pa.chunked_array([data])
+
+            We define a library agnostic function:
+
+            >>> def agnostic_total_minutes(s_native: IntoSeriesT) -> IntoSeriesT:
+            ...     s = nw.from_native(s_native, series_only=True)
+            ...     return s.dt.total_minutes().to_native()
+
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_total_minutes`:
+
+            >>> agnostic_total_minutes(s_pd)
+            0    10
+            1    20
+            dtype: int...
+
+            >>> agnostic_total_minutes(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (2,)
+            Series: '' [i64]
+            [
+               10
+               20
+            ]
+
+            >>> agnostic_total_minutes(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                10,
+                20
+              ]
+            ]
+        """
+        return self._narwhals_series._from_compliant_series(
+            self._narwhals_series._compliant_series.dt.total_minutes()
+        )
+
+    def total_seconds(self: Self) -> SeriesT:
+        """Get total seconds.
+
+        Notes:
+            The function outputs the total seconds in the int dtype by default;
+            however, pandas may change the dtype to float when there are missing
+            values. Consider using `fill_null()` in this case.
+
+        Returns:
+            A new Series containing the total number of seconds for each timedelta value.
+
+        Examples:
+            >>> from datetime import timedelta
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
+            >>> data = [timedelta(seconds=10), timedelta(seconds=20, milliseconds=40)]
+            >>> s_pd = pd.Series(data)
+            >>> s_pl = pl.Series(data)
+            >>> s_pa = pa.chunked_array([data])
+
+            We define a library agnostic function:
+
+            >>> def agnostic_total_seconds(s_native: IntoSeriesT) -> IntoSeriesT:
+            ...     s = nw.from_native(s_native, series_only=True)
+            ...     return s.dt.total_seconds().to_native()
+
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_total_seconds`:
+
+            >>> agnostic_total_seconds(s_pd)
+            0    10
+            1    20
+            dtype: int...
+
+            >>> agnostic_total_seconds(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (2,)
+            Series: '' [i64]
+            [
+               10
+               20
+            ]
+
+            >>> agnostic_total_seconds(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                10,
+                20
+              ]
+            ]
+        """
+        return self._narwhals_series._from_compliant_series(
+            self._narwhals_series._compliant_series.dt.total_seconds()
+        )
+
+    def total_milliseconds(self: Self) -> SeriesT:
+        """Get total milliseconds.
+
+        Notes:
+            The function outputs the total milliseconds in the int dtype by default;
+            however, pandas may change the dtype to float when there are missing
+            values. Consider using `fill_null()` in this case.
+
+        Returns:
+            A new Series containing the total number of milliseconds for each timedelta value.
+
+        Examples:
+            >>> from datetime import timedelta
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
+            >>> data = [
+            ...     timedelta(milliseconds=10),
+            ...     timedelta(milliseconds=20, microseconds=40),
+            ... ]
+            >>> s_pd = pd.Series(data)
+            >>> s_pl = pl.Series(data)
+            >>> s_pa = pa.chunked_array([data])
+
+            We define a library agnostic function:
+
+            >>> def agnostic_total_milliseconds(s_native: IntoSeriesT) -> IntoSeriesT:
+            ...     s = nw.from_native(s_native, series_only=True)
+            ...     return s.dt.total_milliseconds().to_native()
+
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_total_milliseconds`:
+
+            >>> agnostic_total_milliseconds(s_pd)
+            0    10
+            1    20
+            dtype: int...
+
+            >>> agnostic_total_milliseconds(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (2,)
+            Series: '' [i64]
+            [
+               10
+               20
+            ]
+
+            >>> agnostic_total_milliseconds(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                10,
+                20
+              ]
+            ]
+        """
+        return self._narwhals_series._from_compliant_series(
+            self._narwhals_series._compliant_series.dt.total_milliseconds()
+        )
+
+    def total_microseconds(self: Self) -> SeriesT:
+        """Get total microseconds.
+
+        Returns:
+            A new Series containing the total number of microseconds for each timedelta value.
+
+        Notes:
+            The function outputs the total microseconds in the int dtype by default;
+            however, pandas may change the dtype to float when there are missing
+            values. Consider using `fill_null()` in this case.
+
+        Examples:
+            >>> from datetime import timedelta
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
+            >>> data = [
+            ...     timedelta(microseconds=10),
+            ...     timedelta(milliseconds=1, microseconds=200),
+            ... ]
+            >>> s_pd = pd.Series(data)
+            >>> s_pl = pl.Series(data)
+            >>> s_pa = pa.chunked_array([data])
+
+            We define a library agnostic function:
+
+            >>> def agnostic_total_microseconds(s_native: IntoSeriesT) -> IntoSeriesT:
+            ...     s = nw.from_native(s_native, series_only=True)
+            ...     return s.dt.total_microseconds().to_native()
+
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_total_microseconds`:
+
+            >>> agnostic_total_microseconds(s_pd)
+            0      10
+            1    1200
+            dtype: int...
+
+            >>> agnostic_total_microseconds(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (2,)
+            Series: '' [i64]
+            [
+               10
+               1200
+            ]
+
+            >>> agnostic_total_microseconds(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                10,
+                1200
+              ]
+            ]
+        """
+        return self._narwhals_series._from_compliant_series(
+            self._narwhals_series._compliant_series.dt.total_microseconds()
+        )
+
+    def total_nanoseconds(self: Self) -> SeriesT:
+        """Get total nanoseconds.
+
+        Notes:
+            The function outputs the total nanoseconds in the int dtype by default;
+            however, pandas may change the dtype to float when there are missing
+            values. Consider using `fill_null()` in this case.
+
+        Returns:
+            A new Series containing the total number of nanoseconds for each timedelta value.
+
+        Examples:
+            >>> from datetime import datetime
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
+            >>> data = ["2024-01-01 00:00:00.000000001", "2024-01-01 00:00:00.000000002"]
+            >>> s_pd = pd.to_datetime(pd.Series(data))
+            >>> s_pl = pl.Series(data).str.to_datetime(time_unit="ns")
+
+            We define a library agnostic function:
+
+            >>> def agnostic_total_nanoseconds(s_native: IntoSeriesT) -> IntoSeriesT:
+            ...     s = nw.from_native(s_native, series_only=True)
+            ...     return s.diff().dt.total_nanoseconds().to_native()
+
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_total_nanoseconds`:
+
+            >>> agnostic_total_nanoseconds(s_pd)
+            0    NaN
+            1    1.0
+            dtype: float64
+
+            >>> agnostic_total_nanoseconds(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (2,)
+            Series: '' [i64]
+            [
+               null
+               1
+            ]
+        """
+        return self._narwhals_series._from_compliant_series(
+            self._narwhals_series._compliant_series.dt.total_nanoseconds()
+        )
+
+    def to_string(self: Self, format: str) -> SeriesT:  # noqa: A002
+        """Convert a Date/Time/Datetime series into a String series with the given format.
+
+        Arguments:
+            format: Format string for converting the datetime to string.
+
+        Returns:
+            A new Series with the datetime values formatted as strings according to the specified format.
+
+        Notes:
+            Unfortunately, different libraries interpret format directives a bit
+            differently.
+
+            - Chrono, the library used by Polars, uses `"%.f"` for fractional seconds,
+              whereas pandas and Python stdlib use `".%f"`.
+            - PyArrow interprets `"%S"` as "seconds, including fractional seconds"
+              whereas most other tools interpret it as "just seconds, as 2 digits".
+
+            Therefore, we make the following adjustments:
+
+            - for pandas-like libraries, we replace `"%S.%f"` with `"%S%.f"`.
+            - for PyArrow, we replace `"%S.%f"` with `"%S"`.
+
+            Workarounds like these don't make us happy, and we try to avoid them as
+            much as possible, but here we feel like it's the best compromise.
+
+            If you just want to format a date/datetime Series as a local datetime
+            string, and have it work as consistently as possible across libraries,
+            we suggest using:
+
+            - `"%Y-%m-%dT%H:%M:%S%.f"` for datetimes
+            - `"%Y-%m-%d"` for dates
+
+            though note that, even then, different tools may return a different number
+            of trailing zeros. Nonetheless, this is probably consistent enough for
+            most applications.
+
+            If you have an application where this is not enough, please open an issue
+            and let us know.
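A short sketch of the recommendation above, using the suggested cross-library datetime format (output shown for the Polars backend; the exact trailing-zero count may vary by backend, as noted):

```python
from datetime import datetime

import polars as pl
import narwhals as nw

s = nw.from_native(pl.Series([datetime(2020, 3, 1, 12, 34, 56)]), series_only=True)

# narwhals translates "%.f" per backend, so one format string works everywhere.
print(s.dt.to_string("%Y-%m-%dT%H:%M:%S%.f").to_list())
# e.g. ['2020-03-01T12:34:56.000000']
```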
+ + Examples: + >>> from datetime import datetime + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + + >>> data = [ + ... datetime(2020, 3, 1), + ... datetime(2020, 4, 1), + ... datetime(2020, 5, 1), + ... ] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) + + We define a dataframe-agnostic function: + + >>> def agnostic_to_string(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.dt.to_string("%Y/%m/%d").to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_to_string`: + + >>> agnostic_to_string(s_pd) + 0 2020/03/01 + 1 2020/04/01 + 2 2020/05/01 + dtype: object + + >>> agnostic_to_string(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (3,) + Series: '' [str] + [ + "2020/03/01" + "2020/04/01" + "2020/05/01" + ] + + >>> agnostic_to_string(s_pa) # doctest: +ELLIPSIS + + [ + [ + "2020/03/01", + "2020/04/01", + "2020/05/01" + ] + ] + """ + return self._narwhals_series._from_compliant_series( + self._narwhals_series._compliant_series.dt.to_string(format) + ) + + def replace_time_zone(self: Self, time_zone: str | None) -> SeriesT: + """Replace time zone. + + Arguments: + time_zone: Target time zone. + + Returns: + A new Series with the specified time zone. + + Examples: + >>> from datetime import datetime, timezone + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + + >>> data = [ + ... datetime(2024, 1, 1, tzinfo=timezone.utc), + ... datetime(2024, 1, 2, tzinfo=timezone.utc), + ... ] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) + + Let's define a dataframe-agnostic function: + + >>> def agnostic_replace_time_zone(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.dt.replace_time_zone("Asia/Kathmandu").to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_replace_time_zone`: + + >>> agnostic_replace_time_zone(s_pd) + 0 2024-01-01 00:00:00+05:45 + 1 2024-01-02 00:00:00+05:45 + dtype: datetime64[ns, Asia/Kathmandu] + + >>> agnostic_replace_time_zone(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (2,) + Series: '' [datetime[μs, Asia/Kathmandu]] + [ + 2024-01-01 00:00:00 +0545 + 2024-01-02 00:00:00 +0545 + ] + + >>> agnostic_replace_time_zone(s_pa) + + [ + [ + 2023-12-31 18:15:00.000000Z, + 2024-01-01 18:15:00.000000Z + ] + ] + """ + return self._narwhals_series._from_compliant_series( + self._narwhals_series._compliant_series.dt.replace_time_zone(time_zone) + ) + + def convert_time_zone(self: Self, time_zone: str) -> SeriesT: + """Convert time zone. + + If converting from a time-zone-naive column, then conversion happens + as if converting from UTC. + + Arguments: + time_zone: Target time zone. + + Returns: + A new Series with the specified time zone. + + Examples: + >>> from datetime import datetime, timezone + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + + >>> data = [ + ... datetime(2024, 1, 1, tzinfo=timezone.utc), + ... datetime(2024, 1, 2, tzinfo=timezone.utc), + ... 
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])
+
+ Let's define a dataframe-agnostic function:
+
+ >>> def agnostic_convert_time_zone(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... return s.dt.convert_time_zone("Asia/Kathmandu").to_native()
+
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_convert_time_zone`:
+
+ >>> agnostic_convert_time_zone(s_pd)
+ 0 2024-01-01 05:45:00+05:45
+ 1 2024-01-02 05:45:00+05:45
+ dtype: datetime64[ns, Asia/Kathmandu]
+
+ >>> agnostic_convert_time_zone(s_pl) # doctest: +NORMALIZE_WHITESPACE
+ shape: (2,)
+ Series: '' [datetime[μs, Asia/Kathmandu]]
+ [
+ 2024-01-01 05:45:00 +0545
+ 2024-01-02 05:45:00 +0545
+ ]
+
+ >>> agnostic_convert_time_zone(s_pa)
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2024-01-01 00:00:00.000000Z,
+ 2024-01-02 00:00:00.000000Z
+ ]
+ ]
+ """
+ if time_zone is None:
+ msg = "Target `time_zone` cannot be `None` in `convert_time_zone`. Please use `replace_time_zone(None)` if you want to remove the time zone."
+ raise TypeError(msg)
+ return self._narwhals_series._from_compliant_series(
+ self._narwhals_series._compliant_series.dt.convert_time_zone(time_zone)
+ )
+
+ def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> SeriesT:
+ """Return a timestamp in the given time unit.
+
+ Arguments:
+ time_unit: {'ns', 'us', 'ms'}
+ Time unit.
+
+ Returns:
+ A new Series with timestamps in the specified time unit.
+
+ Examples:
+ >>> from datetime import date
+ >>> import pandas as pd
+ >>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeriesT
+
+ >>> data = [date(2001, 1, 1), None, date(2001, 1, 3)]
+ >>> s_pd = pd.Series(data, dtype="datetime64[ns]")
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])
+
+ Let's define a dataframe-agnostic function:
+
+ >>> def agnostic_timestamp(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... return s.dt.timestamp("ms").to_native()
+
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_timestamp`:
+
+ >>> agnostic_timestamp(s_pd)
+ 0 9.783072e+11
+ 1 NaN
+ 2 9.784800e+11
+ dtype: float64
+
+ >>> agnostic_timestamp(s_pl) # doctest: +NORMALIZE_WHITESPACE
+ shape: (3,)
+ Series: '' [i64]
+ [
+ 978307200000
+ null
+ 978480000000
+ ]
+
+ >>> agnostic_timestamp(s_pa)
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 978307200000,
+ null,
+ 978480000000
+ ]
+ ]
+ """
+ if time_unit not in {"ns", "us", "ms"}:
+ msg = (
+ "invalid `time_unit`"
+ f"\n\nExpected one of {{'ns', 'us', 'ms'}}, got {time_unit!r}."
+ )
+ raise ValueError(msg)
+ return self._narwhals_series._from_compliant_series(
+ self._narwhals_series._compliant_series.dt.timestamp(time_unit)
+ )
diff --git a/narwhals/series_list.py b/narwhals/series_list.py
new file mode 100644
index 000000000..19de071e8
--- /dev/null
+++ b/narwhals/series_list.py
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from typing import Any
+from typing import Generic
+from typing import TypeVar
+
+if TYPE_CHECKING:
+ from typing_extensions import Self
+
+ from narwhals.series import Series
+
+
+SeriesT = TypeVar("SeriesT", bound="Series[Any]")
+
+
+class SeriesListNamespace(Generic[SeriesT]):
+ def __init__(self: Self, series: SeriesT) -> None:
+ self._narwhals_series = series
+
+ def len(self: Self) -> SeriesT:
+ """Return the number of elements in each list.
+
+ Null values count towards the total.
+
+ Returns:
+ A new series.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeriesT
+
+ >>> data = [[1, 2], [3, 4, None], None, []]
+
+ Let's define a dataframe-agnostic function:
+
+ >>> def agnostic_list_len(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... return s.list.len().to_native()
+
+ We can then pass pandas / PyArrow / Polars / any other supported library:
+
+ >>> agnostic_list_len(
+ ... pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
+ ... ) # doctest: +SKIP
+ 0 2
+ 1 3
+ 2 <NA>
+ 3 0
+ dtype: int32[pyarrow]
+
+ >>> agnostic_list_len(pl.Series(data)) # doctest: +NORMALIZE_WHITESPACE
+ shape: (4,)
+ Series: '' [u32]
+ [
+ 2
+ 3
+ null
+ 0
+ ]
+
+ >>> agnostic_list_len(pa.chunked_array([data])) # doctest: +ELLIPSIS
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 3,
+ null,
+ 0
+ ]
+ ]
+ """
+ return self._narwhals_series._from_compliant_series(
+ self._narwhals_series._compliant_series.list.len()
+ )
diff --git a/narwhals/series_str.py b/narwhals/series_str.py
new file mode 100644
index 000000000..737bf09df
--- /dev/null
+++ b/narwhals/series_str.py
@@ -0,0 +1,866 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from typing import Any
+from typing import Generic
+from typing import TypeVar
+
+if TYPE_CHECKING:
+ from typing_extensions import Self
+
+ from narwhals.series import Series
+
+SeriesT = TypeVar("SeriesT", bound="Series[Any]")
+
+
+class SeriesStringNamespace(Generic[SeriesT]):
+ def __init__(self: Self, series: SeriesT) -> None:
+ self._narwhals_series = series
+
+ def len_chars(self: Self) -> SeriesT:
+ r"""Return the length of each string as the number of characters.
+
+ Returns:
+ A new Series containing the length of each string in characters.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeriesT
+
+ >>> data = ["foo", "Café", "345", "東京", None]
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])
+
+ We define a dataframe-agnostic function:
+
+ >>> def agnostic_len_chars(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... return s.str.len_chars().to_native()
+
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_len_chars`:
+
+ >>> agnostic_len_chars(s_pd)
+ 0 3.0
+ 1 4.0
+ 2 3.0
+ 3 2.0
+ 4 NaN
+ dtype: float64
+
+ >>> agnostic_len_chars(s_pl) # doctest: +NORMALIZE_WHITESPACE
+ shape: (5,)
+ Series: '' [u32]
+ [
+ 3
+ 4
+ 3
+ 2
+ null
+ ]
+
+ >>> agnostic_len_chars(s_pa) # doctest: +ELLIPSIS
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 3,
+ 4,
+ 3,
+ 2,
+ null
+ ]
+ ]
+ """
+ return self._narwhals_series._from_compliant_series(
+ self._narwhals_series._compliant_series.str.len_chars()
+ )
+
+ def replace(
+ self: Self, pattern: str, value: str, *, literal: bool = False, n: int = 1
+ ) -> SeriesT:
+ r"""Replace first matching regex/literal substring with a new string value.
+
+ Arguments:
+ pattern: A valid regular expression pattern.
+ value: String that will replace the matched substring.
+ literal: Treat `pattern` as a literal string.
+ n: Number of matches to replace.
+
+ Returns:
+ A new Series with the regex/literal pattern replaced with the specified value.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeriesT
+
+ >>> data = ["123abc", "abc abc123"]
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])
+
+ We define a dataframe-agnostic function:
+
+ >>> def agnostic_replace(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... s = s.str.replace("abc", "")
+ ... return s.to_native()
+
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_replace`:
+
+ >>> agnostic_replace(s_pd)
+ 0 123
+ 1 abc123
+ dtype: object
+
+ >>> agnostic_replace(s_pl) # doctest:+NORMALIZE_WHITESPACE
+ shape: (2,)
+ Series: '' [str]
+ [
+ "123"
+ " abc123"
+ ]
+
+ >>> agnostic_replace(s_pa) # doctest: +ELLIPSIS
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ "123",
+ " abc123"
+ ]
+ ]
+ """
+ return self._narwhals_series._from_compliant_series(
+ self._narwhals_series._compliant_series.str.replace(
+ pattern, value, literal=literal, n=n
+ )
+ )
+
+ def replace_all(
+ self: Self, pattern: str, value: str, *, literal: bool = False
+ ) -> SeriesT:
+ r"""Replace all matching regex/literal substrings with a new string value.
+
+ Arguments:
+ pattern: A valid regular expression pattern.
+ value: String that will replace the matched substring.
+ literal: Treat `pattern` as a literal string.
+
+ Returns:
+ A new Series with all occurrences of pattern replaced with the specified value.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeriesT
+
+ >>> data = ["123abc", "abc abc123"]
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])
+
+ We define a dataframe-agnostic function:
+
+ >>> def agnostic_replace_all(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... s = s.str.replace_all("abc", "")
+ ... return s.to_native()
+
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_replace_all`:
+
+ >>> agnostic_replace_all(s_pd)
+ 0 123
+ 1 123
+ dtype: object
+
+ >>> agnostic_replace_all(s_pl) # doctest:+NORMALIZE_WHITESPACE
+ shape: (2,)
+ Series: '' [str]
+ [
+ "123"
+ " 123"
+ ]
+
+ >>> agnostic_replace_all(s_pa) # doctest: +ELLIPSIS
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ "123",
+ " 123"
+ ]
+ ]
+ """
+ return self._narwhals_series._from_compliant_series(
+ self._narwhals_series._compliant_series.str.replace_all(
+ pattern, value, literal=literal
+ )
+ )
+
+ def strip_chars(self: Self, characters: str | None = None) -> SeriesT:
+ r"""Remove leading and trailing characters.
+
+ Arguments:
+ characters: The set of characters to be removed. All combinations of this set of characters will be stripped from the start and end of the string. If set to None (default), all leading and trailing whitespace is removed instead.
+
+ Returns:
+ A new Series with leading and trailing characters removed.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeriesT
+
+ >>> data = ["apple", "\nmango"]
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])
+
+ We define a dataframe-agnostic function:
+
+ >>> def agnostic_strip_chars(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... s = s.str.strip_chars()
+ ... return s.to_native()
+
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_strip_chars`:
+
+ >>> agnostic_strip_chars(s_pd)
+ 0 apple
+ 1 mango
+ dtype: object
+
+ >>> agnostic_strip_chars(s_pl) # doctest: +NORMALIZE_WHITESPACE
+ shape: (2,)
+ Series: '' [str]
+ [
+ "apple"
+ "mango"
+ ]
+
+ >>> agnostic_strip_chars(s_pa) # doctest: +ELLIPSIS
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ "apple",
+ "mango"
+ ]
+ ]
+ """
+ return self._narwhals_series._from_compliant_series(
+ self._narwhals_series._compliant_series.str.strip_chars(characters)
+ )
+
+ def starts_with(self: Self, prefix: str) -> SeriesT:
+ r"""Check if string values start with a substring.
+
+ Arguments:
+ prefix: Prefix substring.
+
+ Returns:
+ A new Series with boolean values indicating if each string starts with the prefix.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeriesT
+
+ >>> data = ["apple", "mango", None]
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])
+
+ We define a dataframe-agnostic function:
+
+ >>> def agnostic_starts_with(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... return s.str.starts_with("app").to_native()
+
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_starts_with`:
+
+ >>> agnostic_starts_with(s_pd)
+ 0 True
+ 1 False
+ 2 None
+ dtype: object
+
+ >>> agnostic_starts_with(s_pl) # doctest: +NORMALIZE_WHITESPACE
+ shape: (3,)
+ Series: '' [bool]
+ [
+ true
+ false
+ null
+ ]
+
+ >>> agnostic_starts_with(s_pa) # doctest: +ELLIPSIS
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ true,
+ false,
+ null
+ ]
+ ]
+ """
+ return self._narwhals_series._from_compliant_series(
+ self._narwhals_series._compliant_series.str.starts_with(prefix)
+ )
+
+ def ends_with(self: Self, suffix: str) -> SeriesT:
+ r"""Check if string values end with a substring.
+
+ Arguments:
+ suffix: Suffix substring.
+
+ Returns:
+ A new Series with boolean values indicating if each string ends with the suffix.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeriesT
+
+ >>> data = ["apple", "mango", None]
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])
+
+ We define a dataframe-agnostic function:
+
+ >>> def agnostic_ends_with(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... return s.str.ends_with("ngo").to_native()
+
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_ends_with`:
+
+ >>> agnostic_ends_with(s_pd)
+ 0 False
+ 1 True
+ 2 None
+ dtype: object
+
+ >>> agnostic_ends_with(s_pl) # doctest: +NORMALIZE_WHITESPACE
+ shape: (3,)
+ Series: '' [bool]
+ [
+ false
+ true
+ null
+ ]
+
+ >>> agnostic_ends_with(s_pa) # doctest: +ELLIPSIS
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ false,
+ true,
+ null
+ ]
+ ]
+ """
+ return self._narwhals_series._from_compliant_series(
+ self._narwhals_series._compliant_series.str.ends_with(suffix)
+ )
+
+ def contains(self: Self, pattern: str, *, literal: bool = False) -> SeriesT:
+ r"""Check if string contains a substring that matches a pattern.
+
+ Arguments:
+ pattern: A character sequence or valid regular expression pattern.
+ literal: If True, treats the pattern as a literal string.
+ If False, assumes the pattern is a regular expression.
+
+ Returns:
+ A new Series with boolean values indicating if each string contains the pattern.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeriesT
+
+ >>> data = ["cat", "dog", "rabbit and parrot", "dove", None]
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])
+
+ We define a dataframe-agnostic function:
+
+ >>> def agnostic_contains(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... return s.str.contains("parrot|dove").to_native()
+
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_contains`:
+
+ >>> agnostic_contains(s_pd)
+ 0 False
+ 1 False
+ 2 True
+ 3 True
+ 4 None
+ dtype: object
+
+ >>> agnostic_contains(s_pl) # doctest: +NORMALIZE_WHITESPACE
+ shape: (5,)
+ Series: '' [bool]
+ [
+ false
+ false
+ true
+ true
+ null
+ ]
+
+ >>> agnostic_contains(s_pa) # doctest: +ELLIPSIS
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ false,
+ false,
+ true,
+ true,
+ null
+ ]
+ ]
+ """
+ return self._narwhals_series._from_compliant_series(
+ self._narwhals_series._compliant_series.str.contains(pattern, literal=literal)
+ )
+
+ def slice(self: Self, offset: int, length: int | None = None) -> SeriesT:
+ r"""Create subslices of the string values of a Series.
+
+ Arguments:
+ offset: Start index. Negative indexing is supported.
+ length: Length of the slice. If set to `None` (default), the slice is taken to the
+ end of the string.
+
+ Returns:
+ A new Series containing subslices of each string.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeriesT
+
+ >>> data = ["pear", None, "papaya", "dragonfruit"]
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])
+
+ We define a dataframe-agnostic function:
+
+ >>> def agnostic_slice(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... return s.str.slice(4, length=3).to_native()
+
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_slice`:
+
+ >>> agnostic_slice(s_pd) # doctest: +NORMALIZE_WHITESPACE
+ 0
+ 1 None
+ 2 ya
+ 3 onf
+ dtype: object
+
+ >>> agnostic_slice(s_pl) # doctest: +NORMALIZE_WHITESPACE
+ shape: (4,)
+ Series: '' [str]
+ [
+ ""
+ null
+ "ya"
+ "onf"
+ ]
+
+ >>> agnostic_slice(s_pa) # doctest: +ELLIPSIS
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ "",
+ null,
+ "ya",
+ "onf"
+ ]
+ ]
+
+ Using negative indexes:
+
+ >>> def agnostic_slice(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... return s.str.slice(-3).to_native()
+
+ >>> agnostic_slice(s_pd) # doctest: +NORMALIZE_WHITESPACE
+ 0 ear
+ 1 None
+ 2 aya
+ 3 uit
+ dtype: object
+
+ >>> agnostic_slice(s_pl) # doctest: +NORMALIZE_WHITESPACE
+ shape: (4,)
+ Series: '' [str]
+ [
+ "ear"
+ null
+ "aya"
+ "uit"
+ ]
+
+ >>> agnostic_slice(s_pa) # doctest: +ELLIPSIS
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ "ear",
+ null,
+ "aya",
+ "uit"
+ ]
+ ]
+ """
+ return self._narwhals_series._from_compliant_series(
+ self._narwhals_series._compliant_series.str.slice(
+ offset=offset, length=length
+ )
+ )
+
+ def head(self: Self, n: int = 5) -> SeriesT:
+ r"""Take the first n elements of each string.
+
+ Arguments:
+ n: Number of elements to take. Negative indexing is supported (see note (1.)).
+
+ Returns:
+ A new Series containing the first n characters of each string.
+
+ Notes:
+ 1. When the `n` input is negative, `head` returns characters up to the n-th from the end of the string.
+ For example, if `n = -3`, then all characters except the last three are returned.
+ 2. If the string has fewer than `n` characters, the full string is returned.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeriesT
+
+ >>> data = ["Atatata", "taata", "taatatata", "zukkyun"]
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])
+
+ We define a dataframe-agnostic function:
+
+ >>> def agnostic_head(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... return s.str.head().to_native()
+
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_head`:
+
+ >>> agnostic_head(s_pd)
+ 0 Atata
+ 1 taata
+ 2 taata
+ 3 zukky
+ dtype: object
+
+ >>> agnostic_head(s_pl) # doctest: +NORMALIZE_WHITESPACE
+ shape: (4,)
+ Series: '' [str]
+ [
+ "Atata"
+ "taata"
+ "taata"
+ "zukky"
+ ]
+
+ >>> agnostic_head(s_pa) # doctest: +ELLIPSIS
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ "Atata",
+ "taata",
+ "taata",
+ "zukky"
+ ]
+ ]
+ """
+ return self._narwhals_series._from_compliant_series(
+ self._narwhals_series._compliant_series.str.slice(offset=0, length=n)
+ )
+
+ def tail(self: Self, n: int = 5) -> SeriesT:
+ r"""Take the last n elements of each string.
+
+ Arguments:
+ n: Number of elements to take. Negative indexing is supported (see note (1.)).
+
+ Returns:
+ A new Series containing the last n characters of each string.
+
+ Notes:
+ 1. When the `n` input is negative, `tail` returns characters starting from the n-th from the beginning of
+ the string. For example, if `n = -3`, then all characters except the first three are returned.
+ 2. If the string has fewer than `n` characters, the full string is returned.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeriesT
+
+ >>> data = ["Atatata", "taata", "taatatata", "zukkyun"]
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])
+
+ We define a dataframe-agnostic function:
+
+ >>> def agnostic_tail(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... return s.str.tail().to_native()
+
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_tail`:
+
+ >>> agnostic_tail(s_pd)
+ 0 atata
+ 1 taata
+ 2 atata
+ 3 kkyun
+ dtype: object
+
+ >>> agnostic_tail(s_pl) # doctest: +NORMALIZE_WHITESPACE
+ shape: (4,)
+ Series: '' [str]
+ [
+ "atata"
+ "taata"
+ "atata"
+ "kkyun"
+ ]
+
+ >>> agnostic_tail(s_pa) # doctest: +ELLIPSIS
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ "atata",
+ "taata",
+ "atata",
+ "kkyun"
+ ]
+ ]
+ """
+ return self._narwhals_series._from_compliant_series(
+ self._narwhals_series._compliant_series.str.slice(offset=-n, length=None)
+ )
+
+ def to_uppercase(self: Self) -> SeriesT:
+ r"""Transform string to uppercase variant.
+
+ Returns:
+ A new Series with values converted to uppercase.
+
+ Notes:
+ The PyArrow backend will convert 'ß' to 'ẞ' instead of 'SS'.
+ For more info see: https://github.com/apache/arrow/issues/34599
+ There may be other unicode-edge-case-related variations across implementations.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeriesT
+
+ >>> data = ["apple", "mango", None]
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])
+
+ We define a dataframe-agnostic function:
+
+ >>> def agnostic_to_uppercase(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... return s.str.to_uppercase().to_native()
+
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_to_uppercase`:
+
+ >>> agnostic_to_uppercase(s_pd)
+ 0 APPLE
+ 1 MANGO
+ 2 None
+ dtype: object
+
+ >>> agnostic_to_uppercase(s_pl) # doctest: +NORMALIZE_WHITESPACE
+ shape: (3,)
+ Series: '' [str]
+ [
+ "APPLE"
+ "MANGO"
+ null
+ ]
+
+ >>> agnostic_to_uppercase(s_pa) # doctest: +ELLIPSIS
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ "APPLE",
+ "MANGO",
+ null
+ ]
+ ]
+ """
+ return self._narwhals_series._from_compliant_series(
+ self._narwhals_series._compliant_series.str.to_uppercase()
+ )
+
+ def to_lowercase(self: Self) -> SeriesT:
+ r"""Transform string to lowercase variant.
+
+ Returns:
+ A new Series with values converted to lowercase.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeriesT
+
+ >>> data = ["APPLE", "MANGO", None]
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])
+
+ We define a dataframe-agnostic function:
+
+ >>> def agnostic_to_lowercase(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... return s.str.to_lowercase().to_native()
+
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_to_lowercase`:
+
+ >>> agnostic_to_lowercase(s_pd)
+ 0 apple
+ 1 mango
+ 2 None
+ dtype: object
+
+ >>> agnostic_to_lowercase(s_pl) # doctest: +NORMALIZE_WHITESPACE
+ shape: (3,)
+ Series: '' [str]
+ [
+ "apple"
+ "mango"
+ null
+ ]
+
+ >>> agnostic_to_lowercase(s_pa) # doctest: +ELLIPSIS
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ "apple",
+ "mango",
+ null
+ ]
+ ]
+ """
+ return self._narwhals_series._from_compliant_series(
+ self._narwhals_series._compliant_series.str.to_lowercase()
+ )
+
+ def to_datetime(self: Self, format: str | None = None) -> SeriesT: # noqa: A002
+ """Parse Series with strings to a Series with Datetime dtype.
+
+ Notes:
+ pandas defaults to nanosecond time unit, Polars to microsecond.
+ Prior to pandas 2.0, nanoseconds were the only time unit supported
+ in pandas, with no ability to set any other one. Support for setting
+ the time unit in pandas, where the installed version permits it, is planned.
+
+ Warning:
+ Different backends auto-infer formats in different ways, so if `format=None`
+ there is no guarantee that results will match across libraries.
+
+ Arguments:
+ format: Format to use for conversion. If set to None (default), the format is
+ inferred from the data.
+
+ Returns:
+ A new Series with datetime dtype.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeriesT
+
+ >>> data = ["2020-01-01", "2020-01-02"]
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])
+
+ We define a dataframe-agnostic function:
+
+ >>> def agnostic_to_datetime(s_native: IntoSeriesT) -> IntoSeriesT:
+ ... s = nw.from_native(s_native, series_only=True)
+ ... return s.str.to_datetime(format="%Y-%m-%d").to_native()
+
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_to_datetime`:
+
+ >>> agnostic_to_datetime(s_pd)
+ 0 2020-01-01
+ 1 2020-01-02
+ dtype: datetime64[ns]
+
+ >>> agnostic_to_datetime(s_pl) # doctest: +NORMALIZE_WHITESPACE
+ shape: (2,)
+ Series: '' [datetime[μs]]
+ [
+ 2020-01-01 00:00:00
+ 2020-01-02 00:00:00
+ ]
+
+ >>> agnostic_to_datetime(s_pa) # doctest: +ELLIPSIS
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2020-01-01 00:00:00.000000,
+ 2020-01-02 00:00:00.000000
+ ]
+ ]
+ """
+ return self._narwhals_series._from_compliant_series(
+ self._narwhals_series._compliant_series.str.to_datetime(format=format)
+ )

From 50b3a40d97a4a904b5ee2d4c39d1f625255f94dc Mon Sep 17 00:00:00 2001
From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com>
Date: Fri, 10 Jan 2025 17:08:40 +0100
Subject: [PATCH 2/2] feat: add `SparkLikeStrNamespace` methods (#1781)

* feat: SparkLikeStrNamespace

* pyproject

---------

Co-authored-by: Marco Edward Gorelli
---
 narwhals/_spark_like/expr.py | 125 ++++++++++++++++++
 pyproject.toml | 1 +
 tests/conftest.py | 21 +--
 tests/expr_and_series/str/contains_test.py | 16 +--
 tests/expr_and_series/str/head_test.py | 7 +-
 tests/expr_and_series/str/len_chars_test.py | 2 +-
 tests/expr_and_series/str/replace_test.py | 4 +-
 tests/expr_and_series/str/slice_test.py | 4 -
 .../str/starts_with_ends_with_test.py | 12 +-
 tests/expr_and_series/str/strip_chars_test.py | 3 -
 tests/expr_and_series/str/tail_test.py | 6 +-
 .../str/to_uppercase_to_lowercase_test.py | 6 -
 tests/utils.py | 4 +-
 13 files changed, 145 insertions(+), 66 deletions(-)

diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py
index 32139cf01..353261c21 100644
--- a/narwhals/_spark_like/expr.py
+++ b/narwhals/_spark_like/expr.py
@@ -480,3 +480,128 @@ def skew(self) -> Self:
 from pyspark.sql import functions as F # noqa: N812
 
 return self._from_call(F.skewness, "skew", returns_scalar=True)
+
+ @property
+ def str(self: Self) -> SparkLikeExprStringNamespace:
+ return SparkLikeExprStringNamespace(self)
+
+
+class SparkLikeExprStringNamespace:
+ def __init__(self: Self, expr: SparkLikeExpr) -> None:
+ self._compliant_expr = expr
+
+ def len_chars(self: Self) -> SparkLikeExpr:
+ from pyspark.sql import functions as F # noqa: N812
+
+ return self._compliant_expr._from_call(
+ F.char_length,
+ "len",
+ returns_scalar=self._compliant_expr._returns_scalar,
+ )
+
+ def replace_all(
+ self: Self, pattern: str, value: str, *, literal: bool = False
+ ) -> SparkLikeExpr:
+ from pyspark.sql import functions as F # noqa: N812
+
+ def func(_input: Column, pattern: str, value: str, *, literal: bool) -> Column:
+ replace_all_func = F.replace if literal else F.regexp_replace
+ return replace_all_func(_input, F.lit(pattern), F.lit(value))
+
+ return self._compliant_expr._from_call(
+ func,
+ "replace",
+ pattern=pattern,
+ value=value,
+ literal=literal,
+ returns_scalar=self._compliant_expr._returns_scalar,
+ )
+
+ def strip_chars(self: Self, characters: str | None) -> SparkLikeExpr:
+ import string
+
+ from pyspark.sql import functions as F # noqa: N812
+
+ def func(_input: Column, characters: str | None) -> Column:
+ # Default to stripping whitespace, as Polars does, when no characters are given.
+ to_remove = characters if characters is not None else string.whitespace
+ return F.btrim(_input, F.lit(to_remove))
+
+ return self._compliant_expr._from_call(
+ func,
+ "strip",
+ characters=characters,
+ returns_scalar=self._compliant_expr._returns_scalar,
+ )
+
+ def starts_with(self: Self, prefix: str) -> SparkLikeExpr:
+ from pyspark.sql import functions as F # noqa: N812
+
+ return self._compliant_expr._from_call(
+ lambda _input, prefix: F.startswith(_input, F.lit(prefix)),
+ "starts_with",
+ prefix=prefix,
+ returns_scalar=self._compliant_expr._returns_scalar,
+ )
+
+ def ends_with(self: Self, suffix: str) -> SparkLikeExpr:
+ from pyspark.sql import functions as F # noqa: N812
+
+ return self._compliant_expr._from_call(
+ lambda _input, suffix: F.endswith(_input, F.lit(suffix)),
+ "ends_with",
+ suffix=suffix,
+ returns_scalar=self._compliant_expr._returns_scalar,
+ )
+
+ def contains(self: Self, pattern: str, *, literal: bool) -> SparkLikeExpr:
+ from pyspark.sql import functions as F # noqa: N812
+
+ def func(_input: Column, pattern: str, *, literal: bool) -> Column:
+ # `F.contains` checks for a literal substring, `F.regexp` for a regex match.
+ contains_func = F.contains if literal else F.regexp
+ return contains_func(_input, F.lit(pattern))
+
+ return self._compliant_expr._from_call(
+ func,
+ "contains",
+ pattern=pattern,
+ literal=literal,
+ returns_scalar=self._compliant_expr._returns_scalar,
+ )
+
+ def slice(self: Self, offset: int, length: int | None = None) -> SparkLikeExpr:
+ from pyspark.sql import functions as F # noqa: N812
+
+ # From the docs: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.substring.html
+ # The position is not zero-based, but a 1-based index.
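+ # For a negative offset, the 1-based start position therefore resolves to
+ # char_length + offset + 1: e.g. offset=-3 starts at the third character
+ # from the end of the string.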
+ def func(_input: Column, offset: int, length: int | None) -> Column:
+ col_length = F.char_length(_input)
+
+ _offset = col_length + F.lit(offset + 1) if offset < 0 else F.lit(offset + 1)
+ _length = F.lit(length) if length is not None else col_length
+ return _input.substr(_offset, _length)
+
+ return self._compliant_expr._from_call(
+ func,
+ "slice",
+ offset=offset,
+ length=length,
+ returns_scalar=self._compliant_expr._returns_scalar,
+ )
+
+ def to_uppercase(self: Self) -> SparkLikeExpr:
+ from pyspark.sql import functions as F # noqa: N812
+
+ return self._compliant_expr._from_call(
+ F.upper,
+ "to_uppercase",
+ returns_scalar=self._compliant_expr._returns_scalar,
+ )
+
+ def to_lowercase(self: Self) -> SparkLikeExpr:
+ from pyspark.sql import functions as F # noqa: N812
+
+ return self._compliant_expr._from_call(
+ F.lower,
+ "to_lowercase",
+ returns_scalar=self._compliant_expr._returns_scalar,
+ )
diff --git a/pyproject.toml b/pyproject.toml
index 91770923e..37dd36965 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -167,6 +167,7 @@ filterwarnings = [
 'ignore: unclosed <socket.socket',
+ 'ignore:.*is_datetime64tz_dtype is deprecated and will be removed in a future version.*:DeprecationWarning',
 ]
diff --git a/tests/conftest.py b/tests/conftest.py
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ def pyspark_lazy_constructor() -> Callable[[Any], IntoFrame]: # pragma: no cover
 register(session.stop)
 
 def _constructor(obj: Any) -> IntoFrame:
- with warnings.catch_warnings():
- warnings.filterwarnings(
- "ignore",
- r".*is_datetime64tz_dtype is deprecated and will be removed in a future version.*",
- module="pyspark",
- category=DeprecationWarning,
- )
- pd_df = pd.DataFrame(obj).replace({float("nan"): None}).reset_index()
- return ( # type: ignore[no-any-return]
- session.createDataFrame(pd_df)
- .repartition(2)
- .orderBy("index")
- .drop("index")
- )
+ pd_df = pd.DataFrame(obj).replace({float("nan"): None}).reset_index()
+ return ( # type: ignore[no-any-return]
+ session.createDataFrame(pd_df)
+ .repartition(2)
+ .orderBy("index")
+ .drop("index")
+ )
 
 return _constructor
diff --git a/tests/expr_and_series/str/contains_test.py b/tests/expr_and_series/str/contains_test.py
index c1024d53a..06c6913aa 100644
--- a/tests/expr_and_series/str/contains_test.py
+++ b/tests/expr_and_series/str/contains_test.py
@@ -13,7 +13,7 @@ def test_contains_case_insensitive(
 constructor: Constructor, request: pytest.FixtureRequest
 ) -> None:
- if "cudf" in str(constructor) or "pyspark" in str(constructor):
+ if "cudf" in str(constructor):
 request.applymarker(pytest.mark.xfail)
 
 df = nw.from_native(constructor(data))
@@ -40,12 +40,7 @@ def test_contains_series_case_insensitive(
 assert_equal_data(result, expected)
 
 
-def test_contains_case_sensitive(
- request: pytest.FixtureRequest, constructor: Constructor
-) -> None:
- if "pyspark" in str(constructor):
- request.applymarker(pytest.mark.xfail)
-
+def test_contains_case_sensitive(constructor: Constructor) -> None:
 df = nw.from_native(constructor(data))
 result = df.select(nw.col("pets").str.contains("parrot|Dove").alias("default_match"))
 expected = {
@@ -63,12 +58,7 @@ def test_contains_series_case_sensitive(constructor_eager: ConstructorEager) ->
 assert_equal_data(result, expected)
 
 
-def test_contains_literal(
- request: pytest.FixtureRequest, constructor: Constructor
-) -> None:
- if "pyspark" in str(constructor):
- request.applymarker(pytest.mark.xfail)
-
+def test_contains_literal(constructor: Constructor) -> None:
 df = nw.from_native(constructor(data))
 result = df.select(
 nw.col("pets").str.contains("Parrot|dove").alias("default_match"),
diff --git a/tests/expr_and_series/str/head_test.py b/tests/expr_and_series/str/head_test.py
index 97fbbc6f3..cf6cbd758 100644
--- a/tests/expr_and_series/str/head_test.py
+++ b/tests/expr_and_series/str/head_test.py @@ -1,7 +1,5 @@ from __future__ import annotations -import pytest - import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -10,10 +8,7 @@ data = {"a": ["foo", "bars"]} -def test_str_head(request: pytest.FixtureRequest, constructor: Constructor) -> None: - if "pyspark" in str(constructor): - request.applymarker(pytest.mark.xfail) - +def test_str_head(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select(nw.col("a").str.head(3)) expected = { diff --git a/tests/expr_and_series/str/len_chars_test.py b/tests/expr_and_series/str/len_chars_test.py index 812f193b2..1a318801a 100644 --- a/tests/expr_and_series/str/len_chars_test.py +++ b/tests/expr_and_series/str/len_chars_test.py @@ -11,7 +11,7 @@ def test_str_len_chars(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + if "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.col("a").str.len_chars()) diff --git a/tests/expr_and_series/str/replace_test.py b/tests/expr_and_series/str/replace_test.py index 53904be73..4d6da63de 100644 --- a/tests/expr_and_series/str/replace_test.py +++ b/tests/expr_and_series/str/replace_test.py @@ -123,9 +123,7 @@ def test_str_replace_all_expr( literal: bool, # noqa: FBT001 expected: dict[str, list[str]], ) -> None: - if ("pyspark" in str(constructor)) or ( - "duckdb" in str(constructor) and literal is False - ): + if "duckdb" in str(constructor) and literal is False: request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select( diff --git a/tests/expr_and_series/str/slice_test.py b/tests/expr_and_series/str/slice_test.py index 6f9b4dc4f..48936a797 100644 --- a/tests/expr_and_series/str/slice_test.py +++ b/tests/expr_and_series/str/slice_test.py @@ -17,15 +17,11 @@ [(1, 2, {"a": ["da", "df"]}), (-2, None, {"a": ["as", "as"]})], ) def test_str_slice( - request: pytest.FixtureRequest, constructor: Constructor, offset: int, length: int | None, expected: Any, ) -> None: - if "pyspark" in str(constructor): - request.applymarker(pytest.mark.xfail) - df = nw.from_native(constructor(data)) result_frame = df.select(nw.col("a").str.slice(offset, length)) assert_equal_data(result_frame, expected) diff --git a/tests/expr_and_series/str/starts_with_ends_with_test.py b/tests/expr_and_series/str/starts_with_ends_with_test.py index dac70c288..0b11a7537 100644 --- a/tests/expr_and_series/str/starts_with_ends_with_test.py +++ b/tests/expr_and_series/str/starts_with_ends_with_test.py @@ -1,7 +1,5 @@ from __future__ import annotations -import pytest - import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -13,10 +11,7 @@ data = {"a": ["fdas", "edfas"]} -def test_ends_with(request: pytest.FixtureRequest, constructor: Constructor) -> None: - if "pyspark" in str(constructor): - request.applymarker(pytest.mark.xfail) - +def test_ends_with(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select(nw.col("a").str.ends_with("das")) expected = { @@ -34,10 +29,7 @@ def test_ends_with_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_starts_with(request: pytest.FixtureRequest, constructor: Constructor) -> None: - if "pyspark" in str(constructor): - 
request.applymarker(pytest.mark.xfail) - +def test_starts_with(constructor: Constructor) -> None: df = nw.from_native(constructor(data)).lazy() result = df.select(nw.col("a").str.starts_with("fda")) expected = { diff --git a/tests/expr_and_series/str/strip_chars_test.py b/tests/expr_and_series/str/strip_chars_test.py index f369bbbf9..785103caa 100644 --- a/tests/expr_and_series/str/strip_chars_test.py +++ b/tests/expr_and_series/str/strip_chars_test.py @@ -20,13 +20,10 @@ ], ) def test_str_strip_chars( - request: pytest.FixtureRequest, constructor: Constructor, characters: str | None, expected: Any, ) -> None: - if "pyspark" in str(constructor): - request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result_frame = df.select(nw.col("a").str.strip_chars(characters)) assert_equal_data(result_frame, expected) diff --git a/tests/expr_and_series/str/tail_test.py b/tests/expr_and_series/str/tail_test.py index cdb2c024e..e2543de0a 100644 --- a/tests/expr_and_series/str/tail_test.py +++ b/tests/expr_and_series/str/tail_test.py @@ -1,7 +1,5 @@ from __future__ import annotations -import pytest - import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -10,9 +8,7 @@ data = {"a": ["foo", "bars"]} -def test_str_tail(request: pytest.FixtureRequest, constructor: Constructor) -> None: - if "pyspark" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_str_tail(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) expected = {"a": ["foo", "ars"]} diff --git a/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py b/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py index 087e26a0e..61566b564 100644 --- a/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py +++ b/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py @@ -30,9 +30,6 @@ def test_str_to_uppercase( expected: dict[str, list[str]], request: pytest.FixtureRequest, ) -> None: - if "pyspark" in str(constructor): - request.applymarker(pytest.mark.xfail) - if any("ß" in s for value in data.values() for s in value) & ( constructor.__name__ in ( @@ -113,13 +110,10 @@ def test_str_to_uppercase_series( ], ) def test_str_to_lowercase( - request: pytest.FixtureRequest, constructor: Constructor, data: dict[str, list[str]], expected: dict[str, list[str]], ) -> None: - if "pyspark" in str(constructor): - request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result_frame = df.select(nw.col("a").str.to_lowercase()) assert_equal_data(result_frame, expected) diff --git a/tests/utils.py b/tests/utils.py index 2d41d6782..ca727bac0 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -65,7 +65,9 @@ def _sort_dict_by_key( data_dict: dict[str, list[Any]], key: str ) -> dict[str, list[Any]]: # pragma: no cover sort_list = data_dict[key] - sorted_indices = sorted(range(len(sort_list)), key=lambda i: sort_list[i]) + sorted_indices = sorted( + range(len(sort_list)), key=lambda i: (sort_list[i] is None, sort_list[i]) + ) return {key: [value[i] for i in sorted_indices] for key, value in data_dict.items()}
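
A minimal sketch of the None-last sort key used in `_sort_dict_by_key` above (the dict contents and column name here are made up for illustration):

    # Sorting a column that contains None with a plain key would raise a
    # TypeError as soon as None is compared against a number. Keying on a
    # (is_none, value) tuple sorts all non-None values first and pushes the
    # None entries to the end, without ever comparing None to a number.
    data = {"a": [3, None, 1], "b": ["x", "y", "z"]}
    sort_list = data["a"]
    sorted_indices = sorted(
        range(len(sort_list)), key=lambda i: (sort_list[i] is None, sort_list[i])
    )
    assert sorted_indices == [2, 0, 1]  # values [1, 3, None], None last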