Skip to content

Commit

Permalink
add by argument to join_asof
Browse files Browse the repository at this point in the history
  • Loading branch information
raisadz committed Sep 7, 2024
1 parent 5f91aa1 commit 757fded
Show file tree
Hide file tree
Showing 5 changed files with 292 additions and 4 deletions.
3 changes: 3 additions & 0 deletions narwhals/_arrow/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,9 @@ def join_asof(
left_on: str | None = None,
right_on: str | None = None,
on: str | None = None,
by_left: str | list[str] | None = None,
by_right: str | list[str] | None = None,
by: str | list[str] | None = None,
strategy: Literal["backward", "forward", "nearest"] = "backward",
) -> Self:
msg = "join_asof is not yet supported on PyArrow tables"
Expand Down
6 changes: 6 additions & 0 deletions narwhals/_dask/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,9 @@ def join_asof(
left_on: str | None = None,
right_on: str | None = None,
on: str | None = None,
by_left: str | list[str] | None = None,
by_right: str | list[str] | None = None,
by: str | list[str] | None = None,
strategy: Literal["backward", "forward", "nearest"] = "backward",
) -> Self:
plx = self.__native_namespace__()
Expand All @@ -315,6 +318,9 @@ def join_asof(
left_on=left_on,
right_on=right_on,
on=on,
left_by=by_left,
right_by=by_right,
by=by,
direction=strategy,
suffixes=("", "_right"),
),
Expand Down
6 changes: 6 additions & 0 deletions narwhals/_pandas_like/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,9 @@ def join_asof(
left_on: str | None = None,
right_on: str | None = None,
on: str | None = None,
by_left: str | list[str] | None = None,
by_right: str | list[str] | None = None,
by: str | list[str] | None = None,
strategy: Literal["backward", "forward", "nearest"] = "backward",
) -> Self:
plx = self.__native_namespace__()
Expand All @@ -526,6 +529,9 @@ def join_asof(
left_on=left_on,
right_on=right_on,
on=on,
left_by=by_left,
right_by=by_right,
by=by,
direction=strategy,
suffixes=("", "_right"),
),
Expand Down
222 changes: 218 additions & 4 deletions narwhals/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,9 @@ def join_asof(
left_on: str | None = None,
right_on: str | None = None,
on: str | None = None,
by_left: str | list[str] | None = None,
by_right: str | list[str] | None = None,
by: str | list[str] | None = None,
strategy: Literal["backward", "forward", "nearest"] = "backward",
) -> Self:
_supported_strategies = ("backward", "forward", "nearest")
Expand All @@ -232,12 +235,30 @@ def join_asof(
if left_on is not None and right_on is not None and on is not None:
msg = "Either (`left_on` and `right_on`) or `on` keys should be specified."
raise ValueError(msg)
if by_left is not None and by_right is not None and by is not None:
msg = "Can not specify `by_left`, `by_right`, and `by` keys at the same time."
raise ValueError(msg)
if by_left is not None and by_right is None and by is None:
msg = "`by_right` can not be None if `by_left` is specified."
raise ValueError(msg)
if by_left is None and by_right is not None and by is None:
msg = "`by_left` can not be None if `by_right` is specified."
raise ValueError(msg)
if (
(by_left is None and by_right is not None)
or (by_left is not None and by_right is None)
) and by is not None:
msg = "Either (`by_left` and `by_right_`) or `by` keys should be specified."
raise ValueError(msg)
if left_on is not None and right_on is not None:
return self._from_compliant_dataframe(
self._compliant_frame.join_asof(
self._extract_compliant(other),
left_on=left_on,
right_on=right_on,
by_left=by_left,
by_right=by_right,
by=by,
strategy=strategy,
)
)
Expand All @@ -246,6 +267,9 @@ def join_asof(
self._compliant_frame.join_asof(
self._extract_compliant(other),
on=on,
by_left=by_left,
by_right=by_right,
by=by,
strategy=strategy,
)
)
Expand Down Expand Up @@ -1885,6 +1909,9 @@ def join_asof(
left_on: str | None = None,
right_on: str | None = None,
on: str | None = None,
by_left: str | list[str] | None = None,
by_right: str | list[str] | None = None,
by: str | list[str] | None = None,
strategy: Literal["backward", "forward", "nearest"] = "backward",
) -> Self:
"""
Expand All @@ -1903,6 +1930,12 @@ def join_asof(
on: Join column of both DataFrames. If set, left_on and right_on should be None.
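by_left: Column(s) of the left DataFrame to match on before performing the asof join. Must be specified together with `by_right`.
by_right: Column(s) of the right DataFrame to match on before performing the asof join. Must be specified together with `by_left`.
by: Column(s) present in both DataFrames to match on before performing the asof join. If set, `by_left` and `by_right` should be None.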
strategy: Join strategy. The default is "backward".
* *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key.
Expand Down Expand Up @@ -1946,7 +1979,9 @@ def join_asof(
>>> @nw.narwhalify
... def join_asof_datetime(df, other_any, strategy):
... return df.join_asof(other_any, on="datetime", strategy=strategy)
>>> # We can now pass either pandas or Polars to the function:
We can now pass either pandas or Polars to the function:
>>> join_asof_datetime(population_pd, gdp_pd, strategy="backward")
datetime population gdp
0 2016-03-01 82.19 4164
Expand All @@ -1964,9 +1999,93 @@ def join_asof(
│ 2018-08-01 00:00:00 ┆ 82.66 ┆ 4566 │
│ 2019-01-01 00:00:00 ┆ 83.12 ┆ 4696 │
└─────────────────────┴────────────┴──────┘
Here is a real-world time-series example that uses the `by` argument.
>>> from datetime import datetime
>>> import narwhals as nw
>>> import pandas as pd
>>> import polars as pl
>>> data_quotes = {
... "datetime": [
... datetime(2016, 5, 25, 13, 30, 0, 23),
... datetime(2016, 5, 25, 13, 30, 0, 23),
... datetime(2016, 5, 25, 13, 30, 0, 30),
... datetime(2016, 5, 25, 13, 30, 0, 41),
... datetime(2016, 5, 25, 13, 30, 0, 48),
... datetime(2016, 5, 25, 13, 30, 0, 49),
... datetime(2016, 5, 25, 13, 30, 0, 72),
... datetime(2016, 5, 25, 13, 30, 0, 75),
... ],
... "ticker": [
... "GOOG",
... "MSFT",
... "MSFT",
... "MSFT",
... "GOOG",
... "AAPL",
... "GOOG",
... "MSFT",
... ],
... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
... }
>>> data_trades = {
... "datetime": [
... datetime(2016, 5, 25, 13, 30, 0, 23),
... datetime(2016, 5, 25, 13, 30, 0, 38),
... datetime(2016, 5, 25, 13, 30, 0, 48),
... datetime(2016, 5, 25, 13, 30, 0, 48),
... datetime(2016, 5, 25, 13, 30, 0, 48),
... ],
... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
... "price": [51.95, 51.95, 720.77, 720.92, 98.0],
... "quantity": [75, 155, 100, 100, 100],
... }
>>> quotes_pd = pd.DataFrame(data_quotes)
>>> trades_pd = pd.DataFrame(data_trades)
>>> quotes_pl = pl.DataFrame(data_quotes).sort("datetime")
>>> trades_pl = pl.DataFrame(data_trades).sort("datetime")
Let's define a dataframe-agnostic function in which we join over "datetime" and by "ticker" columns:
>>> @nw.narwhalify
... def join_asof_datetime_by_ticker(df, other_any):
... return df.join_asof(other_any, on="datetime", by="ticker")
We can now pass either pandas or Polars to the function:
>>> join_asof_datetime_by_ticker(trades_pd, quotes_pd)
datetime ticker price quantity bid ask
0 2016-05-25 13:30:00.000023 MSFT 51.95 75 51.95 51.96
1 2016-05-25 13:30:00.000038 MSFT 51.95 155 51.97 51.98
2 2016-05-25 13:30:00.000048 GOOG 720.77 100 720.50 720.93
3 2016-05-25 13:30:00.000048 GOOG 720.92 100 720.50 720.93
4 2016-05-25 13:30:00.000048 AAPL 98.00 100 NaN NaN
>>> join_asof_datetime_by_ticker(trades_pl, quotes_pl)
shape: (5, 6)
┌────────────────────────────┬────────┬────────┬──────────┬───────┬────────┐
│ datetime ┆ ticker ┆ price ┆ quantity ┆ bid ┆ ask │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ datetime[μs] ┆ str ┆ f64 ┆ i64 ┆ f64 ┆ f64 │
╞════════════════════════════╪════════╪════════╪══════════╪═══════╪════════╡
│ 2016-05-25 13:30:00.000023 ┆ MSFT ┆ 51.95 ┆ 75 ┆ 51.95 ┆ 51.96 │
│ 2016-05-25 13:30:00.000038 ┆ MSFT ┆ 51.95 ┆ 155 ┆ 51.97 ┆ 51.98 │
│ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.77 ┆ 100 ┆ 720.5 ┆ 720.93 │
│ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.92 ┆ 100 ┆ 720.5 ┆ 720.93 │
│ 2016-05-25 13:30:00.000048 ┆ AAPL ┆ 98.0 ┆ 100 ┆ null ┆ null │
└────────────────────────────┴────────┴────────┴──────────┴───────┴────────┘
"""
return super().join_asof(
other, left_on=left_on, right_on=right_on, on=on, strategy=strategy
other,
left_on=left_on,
right_on=right_on,
on=on,
by_left=by_left,
by_right=by_right,
by=by,
strategy=strategy,
)

# --- descriptive ---
Expand Down Expand Up @@ -3515,6 +3634,9 @@ def join_asof(
left_on: str | None = None,
right_on: str | None = None,
on: str | None = None,
by_left: str | list[str] | None = None,
by_right: str | list[str] | None = None,
by: str | list[str] | None = None,
strategy: Literal["backward", "forward", "nearest"] = "backward",
) -> Self:
"""
Expand All @@ -3533,6 +3655,12 @@ def join_asof(
on: Join column of both DataFrames. If set, left_on and right_on should be None.
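by_left: Column(s) of the left DataFrame to match on before performing the asof join. Must be specified together with `by_right`.
by_right: Column(s) of the right DataFrame to match on before performing the asof join. Must be specified together with `by_left`.
by: Column(s) present in both DataFrames to match on before performing the asof join. If set, `by_left` and `by_right` should be None.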
strategy: Join strategy. The default is "backward".
* *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key.
Expand Down Expand Up @@ -3575,7 +3703,9 @@ def join_asof(
>>> @nw.narwhalify
... def join_asof_datetime(df, other_any, strategy):
... return df.join_asof(other_any, on="datetime", strategy=strategy)
>>> # We can now pass either pandas or Polars to the function:
We can now pass either pandas or Polars to the function:
>>> join_asof_datetime(population_pd, gdp_pd, strategy="backward")
datetime population gdp
0 2016-03-01 82.19 4164
Expand All @@ -3593,9 +3723,93 @@ def join_asof(
│ 2018-08-01 00:00:00 ┆ 82.66 ┆ 4566 │
│ 2019-01-01 00:00:00 ┆ 83.12 ┆ 4696 │
└─────────────────────┴────────────┴──────┘
Here is a real-world time-series example that uses the `by` argument.
>>> from datetime import datetime
>>> import narwhals as nw
>>> import pandas as pd
>>> import polars as pl
>>> data_quotes = {
... "datetime": [
... datetime(2016, 5, 25, 13, 30, 0, 23),
... datetime(2016, 5, 25, 13, 30, 0, 23),
... datetime(2016, 5, 25, 13, 30, 0, 30),
... datetime(2016, 5, 25, 13, 30, 0, 41),
... datetime(2016, 5, 25, 13, 30, 0, 48),
... datetime(2016, 5, 25, 13, 30, 0, 49),
... datetime(2016, 5, 25, 13, 30, 0, 72),
... datetime(2016, 5, 25, 13, 30, 0, 75),
... ],
... "ticker": [
... "GOOG",
... "MSFT",
... "MSFT",
... "MSFT",
... "GOOG",
... "AAPL",
... "GOOG",
... "MSFT",
... ],
... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
... }
>>> data_trades = {
... "datetime": [
... datetime(2016, 5, 25, 13, 30, 0, 23),
... datetime(2016, 5, 25, 13, 30, 0, 38),
... datetime(2016, 5, 25, 13, 30, 0, 48),
... datetime(2016, 5, 25, 13, 30, 0, 48),
... datetime(2016, 5, 25, 13, 30, 0, 48),
... ],
... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
... "price": [51.95, 51.95, 720.77, 720.92, 98.0],
... "quantity": [75, 155, 100, 100, 100],
... }
>>> quotes_pd = pd.DataFrame(data_quotes)
>>> trades_pd = pd.DataFrame(data_trades)
>>> quotes_pl = pl.LazyFrame(data_quotes).sort("datetime")
>>> trades_pl = pl.LazyFrame(data_trades).sort("datetime")
Let's define a dataframe-agnostic function in which we join over "datetime" and by "ticker" columns:
>>> @nw.narwhalify
... def join_asof_datetime_by_ticker(df, other_any):
... return df.join_asof(other_any, on="datetime", by="ticker")
We can now pass either pandas or Polars to the function:
>>> join_asof_datetime_by_ticker(trades_pd, quotes_pd)
datetime ticker price quantity bid ask
0 2016-05-25 13:30:00.000023 MSFT 51.95 75 51.95 51.96
1 2016-05-25 13:30:00.000038 MSFT 51.95 155 51.97 51.98
2 2016-05-25 13:30:00.000048 GOOG 720.77 100 720.50 720.93
3 2016-05-25 13:30:00.000048 GOOG 720.92 100 720.50 720.93
4 2016-05-25 13:30:00.000048 AAPL 98.00 100 NaN NaN
>>> join_asof_datetime_by_ticker(trades_pl, quotes_pl).collect()
shape: (5, 6)
┌────────────────────────────┬────────┬────────┬──────────┬───────┬────────┐
│ datetime ┆ ticker ┆ price ┆ quantity ┆ bid ┆ ask │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ datetime[μs] ┆ str ┆ f64 ┆ i64 ┆ f64 ┆ f64 │
╞════════════════════════════╪════════╪════════╪══════════╪═══════╪════════╡
│ 2016-05-25 13:30:00.000023 ┆ MSFT ┆ 51.95 ┆ 75 ┆ 51.95 ┆ 51.96 │
│ 2016-05-25 13:30:00.000038 ┆ MSFT ┆ 51.95 ┆ 155 ┆ 51.97 ┆ 51.98 │
│ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.77 ┆ 100 ┆ 720.5 ┆ 720.93 │
│ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.92 ┆ 100 ┆ 720.5 ┆ 720.93 │
│ 2016-05-25 13:30:00.000048 ┆ AAPL ┆ 98.0 ┆ 100 ┆ null ┆ null │
└────────────────────────────┴────────┴────────┴──────────┴───────┴────────┘
"""
return super().join_asof(
other, left_on=left_on, right_on=right_on, on=on, strategy=strategy
other,
left_on=left_on,
right_on=right_on,
on=on,
by_left=by_left,
by_right=by_right,
by=by,
strategy=strategy,
)

def clone(self) -> Self:
Expand Down
Loading

0 comments on commit 757fded

Please sign in to comment.