From 757fdedeb8c0d44ed9bd2601b80c0573cbe79345 Mon Sep 17 00:00:00 2001 From: raisadz <34237447+raisadz@users.noreply.github.com> Date: Sat, 7 Sep 2024 10:55:35 +0100 Subject: [PATCH] add `by` argument to join_asof --- narwhals/_arrow/dataframe.py | 3 + narwhals/_dask/dataframe.py | 6 + narwhals/_pandas_like/dataframe.py | 6 + narwhals/dataframe.py | 222 ++++++++++++++++++++++++++++- tests/frame/join_test.py | 59 ++++++++ 5 files changed, 292 insertions(+), 4 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index f01ada158..960d833a5 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -322,6 +322,9 @@ def join_asof( left_on: str | None = None, right_on: str | None = None, on: str | None = None, + by_left: str | list[str] | None = None, + by_right: str | list[str] | None = None, + by: str | list[str] | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: msg = "join_asof is not yet supported on PyArrow tables" diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 8f11ccaad..5ef8c5a9d 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -305,6 +305,9 @@ def join_asof( left_on: str | None = None, right_on: str | None = None, on: str | None = None, + by_left: str | list[str] | None = None, + by_right: str | list[str] | None = None, + by: str | list[str] | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: plx = self.__native_namespace__() @@ -315,6 +318,9 @@ def join_asof( left_on=left_on, right_on=right_on, on=on, + left_by=by_left, + right_by=by_right, + by=by, direction=strategy, suffixes=("", "_right"), ), diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 9750cd9d4..3040adda0 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -516,6 +516,9 @@ def join_asof( left_on: str | None = None, right_on: str | 
None = None, on: str | None = None, + by_left: str | list[str] | None = None, + by_right: str | list[str] | None = None, + by: str | list[str] | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: plx = self.__native_namespace__() @@ -526,6 +529,9 @@ def join_asof( left_on=left_on, right_on=right_on, on=on, + left_by=by_left, + right_by=by_right, + by=by, direction=strategy, suffixes=("", "_right"), ), diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 440856eb4..165b65981 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -221,6 +221,9 @@ def join_asof( left_on: str | None = None, right_on: str | None = None, on: str | None = None, + by_left: str | list[str] | None = None, + by_right: str | list[str] | None = None, + by: str | list[str] | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: _supported_strategies = ("backward", "forward", "nearest") @@ -232,12 +235,30 @@ def join_asof( if left_on is not None and right_on is not None and on is not None: msg = "Either (`left_on` and `right_on`) or `on` keys should be specified." raise ValueError(msg) + if by_left is not None and by_right is not None and by is not None: + msg = "Can not specify `by_left`, `by_right`, and `by` keys at the same time." + raise ValueError(msg) + if by_left is not None and by_right is None and by is None: + msg = "`by_right` can not be None if `by_left` is specified." + raise ValueError(msg) + if by_left is None and by_right is not None and by is None: + msg = "`by_left` can not be None if `by_right` is specified." + raise ValueError(msg) + if ( + (by_left is None and by_right is not None) + or (by_left is not None and by_right is None) + ) and by is not None: + msg = "Either (`by_left` and `by_right_`) or `by` keys should be specified." 
+ raise ValueError(msg) if left_on is not None and right_on is not None: return self._from_compliant_dataframe( self._compliant_frame.join_asof( self._extract_compliant(other), left_on=left_on, right_on=right_on, + by_left=by_left, + by_right=by_right, + by=by, strategy=strategy, ) ) @@ -246,6 +267,9 @@ def join_asof( self._compliant_frame.join_asof( self._extract_compliant(other), on=on, + by_left=by_left, + by_right=by_right, + by=by, strategy=strategy, ) ) @@ -1885,6 +1909,9 @@ def join_asof( left_on: str | None = None, right_on: str | None = None, on: str | None = None, + by_left: str | list[str] | None = None, + by_right: str | list[str] | None = None, + by: str | list[str] | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: """ @@ -1903,6 +1930,12 @@ def join_asof( on: Join column of both DataFrames. If set, left_on and right_on should be None. + by_left: Columns of the left DataFrame to join on before doing the asof join. Must be given together with `by_right`. + + by_right: Columns of the right DataFrame to join on before doing the asof join. Must be given together with `by_left`. + + by: Columns (present in both DataFrames) to join on before doing the asof join. Cannot be combined with `by_left` or `by_right`. + strategy: Join strategy. The default is "backward". * *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key. @@ -1946,7 +1979,9 @@ def join_asof( >>> @nw.narwhalify ... def join_asof_datetime(df, other_any, strategy): ... return df.join_asof(other_any, on="datetime", strategy=strategy) - >>> # We can now pass either pandas or Polars to the function: + + We can now pass either pandas or Polars to the function: + >>> join_asof_datetime(population_pd, gdp_pd, strategy="backward") datetime population gdp 0 2016-03-01 82.19 4164 @@ -1964,9 +1999,93 @@ def join_asof( │ 2018-08-01 00:00:00 ┆ 82.66 ┆ 4566 │ │ 2019-01-01 00:00:00 ┆ 83.12 ┆ 4696 │ └─────────────────────┴────────────┴──────┘ + + Here is a real-world time-series example that uses the `by` argument.
+ + >>> from datetime import datetime + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data_quotes = { + ... "datetime": [ + ... datetime(2016, 5, 25, 13, 30, 0, 23), + ... datetime(2016, 5, 25, 13, 30, 0, 23), + ... datetime(2016, 5, 25, 13, 30, 0, 30), + ... datetime(2016, 5, 25, 13, 30, 0, 41), + ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... datetime(2016, 5, 25, 13, 30, 0, 49), + ... datetime(2016, 5, 25, 13, 30, 0, 72), + ... datetime(2016, 5, 25, 13, 30, 0, 75), + ... ], + ... "ticker": [ + ... "GOOG", + ... "MSFT", + ... "MSFT", + ... "MSFT", + ... "GOOG", + ... "AAPL", + ... "GOOG", + ... "MSFT", + ... ], + ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], + ... } + >>> data_trades = { + ... "datetime": [ + ... datetime(2016, 5, 25, 13, 30, 0, 23), + ... datetime(2016, 5, 25, 13, 30, 0, 38), + ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... ], + ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + ... "price": [51.95, 51.95, 720.77, 720.92, 98.0], + ... "quantity": [75, 155, 100, 100, 100], + ... } + >>> quotes_pd = pd.DataFrame(data_quotes) + >>> trades_pd = pd.DataFrame(data_trades) + >>> quotes_pl = pl.DataFrame(data_quotes).sort("datetime") + >>> trades_pl = pl.DataFrame(data_trades).sort("datetime") + + Let's define a dataframe-agnostic function in which we join over "datetime" and by "ticker" columns: + + >>> @nw.narwhalify + ... def join_asof_datetime_by_ticker(df, other_any): + ... 
return df.join_asof(other_any, on="datetime", by="ticker") + + We can now pass either pandas or Polars to the function: + + >>> join_asof_datetime_by_ticker(trades_pd, quotes_pd) + datetime ticker price quantity bid ask + 0 2016-05-25 13:30:00.000023 MSFT 51.95 75 51.95 51.96 + 1 2016-05-25 13:30:00.000038 MSFT 51.95 155 51.97 51.98 + 2 2016-05-25 13:30:00.000048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.000048 GOOG 720.92 100 720.50 720.93 + 4 2016-05-25 13:30:00.000048 AAPL 98.00 100 NaN NaN + + >>> join_asof_datetime_by_ticker(trades_pl, quotes_pl) + shape: (5, 6) + ┌────────────────────────────┬────────┬────────┬──────────┬───────┬────────┐ + │ datetime ┆ ticker ┆ price ┆ quantity ┆ bid ┆ ask │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ f64 ┆ i64 ┆ f64 ┆ f64 │ + ╞════════════════════════════╪════════╪════════╪══════════╪═══════╪════════╡ + │ 2016-05-25 13:30:00.000023 ┆ MSFT ┆ 51.95 ┆ 75 ┆ 51.95 ┆ 51.96 │ + │ 2016-05-25 13:30:00.000038 ┆ MSFT ┆ 51.95 ┆ 155 ┆ 51.97 ┆ 51.98 │ + │ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.77 ┆ 100 ┆ 720.5 ┆ 720.93 │ + │ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.92 ┆ 100 ┆ 720.5 ┆ 720.93 │ + │ 2016-05-25 13:30:00.000048 ┆ AAPL ┆ 98.0 ┆ 100 ┆ null ┆ null │ + └────────────────────────────┴────────┴────────┴──────────┴───────┴────────┘ """ return super().join_asof( - other, left_on=left_on, right_on=right_on, on=on, strategy=strategy + other, + left_on=left_on, + right_on=right_on, + on=on, + by_left=by_left, + by_right=by_right, + by=by, + strategy=strategy, ) # --- descriptive --- @@ -3515,6 +3634,9 @@ def join_asof( left_on: str | None = None, right_on: str | None = None, on: str | None = None, + by_left: str | list[str] | None = None, + by_right: str | list[str] | None = None, + by: str | list[str] | None = None, strategy: Literal["backward", "forward", "nearest"] = "backward", ) -> Self: """ @@ -3533,6 +3655,12 @@ def join_asof( on: Join column of both DataFrames. 
If set, left_on and right_on should be None. + by_left: Columns of the left DataFrame to join on before doing the asof join. Must be given together with `by_right`. + + by_right: Columns of the right DataFrame to join on before doing the asof join. Must be given together with `by_left`. + + by: Columns (present in both DataFrames) to join on before doing the asof join. Cannot be combined with `by_left` or `by_right`. + strategy: Join strategy. The default is "backward". * *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key. @@ -3575,7 +3703,9 @@ def join_asof( >>> @nw.narwhalify ... def join_asof_datetime(df, other_any, strategy): ... return df.join_asof(other_any, on="datetime", strategy=strategy) - >>> # We can now pass either pandas or Polars to the function: + + We can now pass either pandas or Polars to the function: + >>> join_asof_datetime(population_pd, gdp_pd, strategy="backward") datetime population gdp 0 2016-03-01 82.19 4164 @@ -3593,9 +3723,93 @@ def join_asof( │ 2018-08-01 00:00:00 ┆ 82.66 ┆ 4566 │ │ 2019-01-01 00:00:00 ┆ 83.12 ┆ 4696 │ └─────────────────────┴────────────┴──────┘ + + Here is a real-world time-series example that uses the `by` argument. + + >>> from datetime import datetime + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data_quotes = { + ... "datetime": [ + ... datetime(2016, 5, 25, 13, 30, 0, 23), + ... datetime(2016, 5, 25, 13, 30, 0, 23), + ... datetime(2016, 5, 25, 13, 30, 0, 30), + ... datetime(2016, 5, 25, 13, 30, 0, 41), + ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... datetime(2016, 5, 25, 13, 30, 0, 49), + ... datetime(2016, 5, 25, 13, 30, 0, 72), + ... datetime(2016, 5, 25, 13, 30, 0, 75), + ... ], + ... "ticker": [ + ... "GOOG", + ... "MSFT", + ... "MSFT", + ... "MSFT", + ... "GOOG", + ... "AAPL", + ... "GOOG", + ... "MSFT", + ... ], + ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], + ... } + >>> data_trades = { + ... "datetime": [ + ... datetime(2016, 5, 25, 13, 30, 0, 23), + ... datetime(2016, 5, 25, 13, 30, 0, 38), + ...
datetime(2016, 5, 25, 13, 30, 0, 48), + ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... ], + ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + ... "price": [51.95, 51.95, 720.77, 720.92, 98.0], + ... "quantity": [75, 155, 100, 100, 100], + ... } + >>> quotes_pd = pd.DataFrame(data_quotes) + >>> trades_pd = pd.DataFrame(data_trades) + >>> quotes_pl = pl.LazyFrame(data_quotes).sort("datetime") + >>> trades_pl = pl.LazyFrame(data_trades).sort("datetime") + + Let's define a dataframe-agnostic function in which we join over "datetime" and by "ticker" columns: + + >>> @nw.narwhalify + ... def join_asof_datetime_by_ticker(df, other_any): + ... return df.join_asof(other_any, on="datetime", by="ticker") + + We can now pass either pandas or Polars to the function: + + >>> join_asof_datetime_by_ticker(trades_pd, quotes_pd) + datetime ticker price quantity bid ask + 0 2016-05-25 13:30:00.000023 MSFT 51.95 75 51.95 51.96 + 1 2016-05-25 13:30:00.000038 MSFT 51.95 155 51.97 51.98 + 2 2016-05-25 13:30:00.000048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.000048 GOOG 720.92 100 720.50 720.93 + 4 2016-05-25 13:30:00.000048 AAPL 98.00 100 NaN NaN + + >>> join_asof_datetime_by_ticker(trades_pl, quotes_pl).collect() + shape: (5, 6) + ┌────────────────────────────┬────────┬────────┬──────────┬───────┬────────┐ + │ datetime ┆ ticker ┆ price ┆ quantity ┆ bid ┆ ask │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ datetime[μs] ┆ str ┆ f64 ┆ i64 ┆ f64 ┆ f64 │ + ╞════════════════════════════╪════════╪════════╪══════════╪═══════╪════════╡ + │ 2016-05-25 13:30:00.000023 ┆ MSFT ┆ 51.95 ┆ 75 ┆ 51.95 ┆ 51.96 │ + │ 2016-05-25 13:30:00.000038 ┆ MSFT ┆ 51.95 ┆ 155 ┆ 51.97 ┆ 51.98 │ + │ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.77 ┆ 100 ┆ 720.5 ┆ 720.93 │ + │ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.92 ┆ 100 ┆ 720.5 ┆ 720.93 │ + │ 2016-05-25 13:30:00.000048 ┆ AAPL ┆ 98.0 ┆ 100 ┆ null ┆ null │ + 
└────────────────────────────┴────────┴────────┴──────────┴───────┴────────┘ """ return super().join_asof( - other, left_on=left_on, right_on=right_on, on=on, strategy=strategy + other, + left_on=left_on, + right_on=right_on, + on=on, + by_left=by_left, + by_right=by_right, + by=by, + strategy=strategy, ) def clone(self) -> Self: diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index 72f1304df..34a5961ef 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -324,6 +324,31 @@ def test_joinasof_time(constructor: Any, request: Any) -> None: compare_dicts(result_nearest_on, expected_nearest) +def test_joinasof_by(constructor: Any, request: Any) -> None: + if "pyarrow_table" in str(constructor): + request.applymarker(pytest.mark.xfail) + if parse_version(pd.__version__) < (2, 1) and ( + ("pandas_pyarrow" in str(constructor)) or ("pandas_nullable" in str(constructor)) + ): + request.applymarker(pytest.mark.xfail) + df = nw.from_native( + constructor({"a": [1, 5, 7, 10], "b": ["D", "D", "C", "A"], "c": [9, 2, 1, 1]}) + ).sort("a") + df_right = nw.from_native( + constructor({"a": [1, 4, 5, 8], "b": ["D", "D", "A", "F"], "d": [1, 3, 4, 1]}) + ).sort("a") + result = df.join_asof(df_right, on="a", by_left="b", by_right="b") # type: ignore[arg-type] + result_by = df.join_asof(df_right, on="a", by="b") # type: ignore[arg-type] + expected = { + "a": [1, 5, 7, 10], + "b": ["D", "D", "C", "A"], + "c": [9, 2, 1, 1], + "d": [1, 3, float("nan"), 4], + } + compare_dicts(result, expected) + compare_dicts(result_by, expected) + + @pytest.mark.parametrize("strategy", ["back", "furthest"]) def test_joinasof_not_implemented(constructor: Any, strategy: str) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} @@ -361,3 +386,37 @@ def test_joinasof_no_keys(constructor: Any) -> None: match=msg, ): df.join_asof(df, left_on="a", right_on="a", on="a") # type: ignore[arg-type] + + +def test_joinasof_by_exceptions(constructor: Any) -> None: + data = 
{"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + df = nw.from_native(constructor(data)) + with pytest.raises( + ValueError, + match=r"Can not specify `by_left`, `by_right`, and `by` keys at the same time.", + ): + df.join_asof(df, on="a", by_left="b", by_right="b", by="b") # type: ignore[arg-type] + + with pytest.raises( + ValueError, + match=r"`by_right` can not be None if `by_left` is specified.", + ): + df.join_asof(df, on="a", by_left="b") # type: ignore[arg-type] + + with pytest.raises( + ValueError, + match=r"`by_left` can not be None if `by_right` is specified.", + ): + df.join_asof(df, on="a", by_right="b") # type: ignore[arg-type] + + with pytest.raises( + ValueError, + match=r"Either \(`by_left` and `by_right_`\) or `by` keys should be specified.", + ): + df.join_asof(df, on="a", by_left="b", by="b") # type: ignore[arg-type] + + with pytest.raises( + ValueError, + match=r"Either \(`by_left` and `by_right_`\) or `by` keys should be specified.", + ): + df.join_asof(df, on="a", by_right="b", by="b") # type: ignore[arg-type]