From e5be1a1d474261e7f1af35a06a0552fec664e5b6 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sun, 18 Aug 2024 22:11:15 +0200 Subject: [PATCH 01/20] perf: codspeed ci setup --- .github/workflows/codspeed.yml | 28 +++++++++++ tests/tpch/__init__.py | 0 tests/tpch/q1_test.py | 92 ++++++++++++++++++++++++++++++++++ tests/tpch/q2_test.py | 91 +++++++++++++++++++++++++++++++++ 4 files changed, 211 insertions(+) create mode 100644 .github/workflows/codspeed.yml create mode 100644 tests/tpch/__init__.py create mode 100644 tests/tpch/q1_test.py create mode 100644 tests/tpch/q2_test.py diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml new file mode 100644 index 000000000..d4906a45e --- /dev/null +++ b/.github/workflows/codspeed.yml @@ -0,0 +1,28 @@ +name: codspeed benchmarks + +on: + pull_request: + workflow_dispatch: + push: + branches: + - main + +jobs: + codspeed-benchmarks: + name: codspeed benchmarks + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install uv + run: curl -LsSf https://astral.sh/uv/install.sh | sh + - name: Install dependencies + run: uv pip install -r requirements-dev.txt --system + - name: show-deps + run: uv pip freeze + - name: Run benchmarks + uses: CodSpeedHQ/action@v3 + with: + run: pytest tests/tpch --codspeed \ No newline at end of file diff --git a/tests/tpch/__init__.py b/tests/tpch/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/tpch/q1_test.py b/tests/tpch/q1_test.py new file mode 100644 index 000000000..9facffd87 --- /dev/null +++ b/tests/tpch/q1_test.py @@ -0,0 +1,92 @@ +from __future__ import annotations + +from datetime import datetime +from typing import Any + +import pandas as pd +import polars as pl +import pyarrow.parquet as pq +import pytest + +import narwhals.stable.v1 as nw +from narwhals.utils import parse_version +from tests.utils import compare_dicts + +q1_expected = { + "l_returnflag": ["A", "N", "N", "R"], + "l_linestatus": ["F", "F", "O", "F"], + "sum_qty": [2109.0, 29.0, 3682.0, 1876.0], + "sum_base_price": [3114026.44, 39824.83, 5517101.99, 2947892.16], + "sum_disc_price": [2954950.8082, 39028.3334, 5205468.4852, 2816542.4816999994], + "sum_charge": [ + 3092840.4194289995, + 39808.900068, + 5406966.873419, + 2935797.8313019997, + ], + "avg_qty": [27.75, 29.0, 25.047619047619047, 26.422535211267604], + "avg_price": [ + 40974.032105263155, + 39824.83, + 37531.30605442177, + 41519.607887323946, + ], + "avg_disc": [0.05039473684210526, 0.02, 0.05537414965986395, 0.04507042253521127], + "count_order": [76, 1, 147, 71], +} + + +def q1(library: str) -> dict[str, list[Any]]: + if library == "pandas": + df_raw = pd.read_parquet("tests/data/lineitem.parquet") + df_raw["l_shipdate"] = pd.to_datetime(df_raw["l_shipdate"]) + elif library == "polars": + df_raw = pl.scan_parquet("tests/data/lineitem.parquet") + elif library == "dask": + pytest.importorskip("dask") + pytest.importorskip("dask_expr", exc_type=ImportError) + import dask.dataframe as dd + + df_raw = dd.read_parquet("tests/data/lineitem.parquet", dtype_backend="pyarrow") + elif library == "pyarrow": + df_raw = pq.read_table("tests/data/lineitem.parquet") + + var_1 = datetime(1998, 9, 2) + df = nw.from_native(df_raw).lazy() + query_result = ( + df.filter(nw.col("l_shipdate") <= var_1) + .with_columns( + disc_price=nw.col("l_extendedprice") * (1 - nw.col("l_discount")), + charge=( + nw.col("l_extendedprice") + * (1.0 - nw.col("l_discount")) + * (1.0 + nw.col("l_tax")) + ), + ) + .group_by(["l_returnflag", "l_linestatus"]) + .agg( + [ + nw.col("l_quantity").sum().alias("sum_qty"), + nw.col("l_extendedprice").sum().alias("sum_base_price"), + nw.col("disc_price").sum().alias("sum_disc_price"), + nw.col("charge").sum().alias("sum_charge"), + nw.col("l_quantity").mean().alias("avg_qty"), + nw.col("l_extendedprice").mean().alias("avg_price"), + nw.col("l_discount").mean().alias("avg_disc"), + nw.len().alias("count_order"), + ], + ) + .sort(["l_returnflag", "l_linestatus"]) + ) + return query_result.collect().to_dict(as_series=False) + + +@pytest.mark.benchmark() +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q1(benchmark: Any, library: str, request: Any) -> None: + if library == "pandas" and parse_version(pd.__version__) < (1, 5): + request.applymarker(pytest.mark.xfail) + + result = benchmark(q1, library) + + compare_dicts(result, q1_expected) diff --git a/tests/tpch/q2_test.py b/tests/tpch/q2_test.py new file mode 100644 index 000000000..57bc8e2da --- /dev/null +++ b/tests/tpch/q2_test.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from typing import Any + +import dask.dataframe as dd +import pandas as pd +import polars as pl +import pyarrow.parquet as pq +import pytest + +import narwhals.stable.v1 as nw +from narwhals.utils import parse_version + +lib_to_reader = { + "pandas": pd.read_parquet, + "polars": pl.scan_parquet, + "dask": lambda path: dd.read_parquet(path, dtype_backend="pyarrow"), + "pyarrow": pq.read_table, +} + + +def q2( + region_ds: Any, + nation_ds: Any, + supplier_ds: Any, + part_ds: Any, + part_supp_ds: Any, +) -> dict[str, list[Any]]: + var_1 = 15 + var_2 = "BRASS" + var_3 = "EUROPE" + + tmp = ( + part_ds.join(part_supp_ds, left_on="p_partkey", right_on="ps_partkey") + .join(supplier_ds, left_on="ps_suppkey", right_on="s_suppkey") + .join(nation_ds, left_on="s_nationkey", right_on="n_nationkey") + .join(region_ds, left_on="n_regionkey", right_on="r_regionkey") + .filter( + nw.col("p_size") == var_1, + nw.col("p_type").str.ends_with(var_2), + nw.col("r_name") == var_3, + ) + ) + + final_cols = [ + "s_acctbal", + "s_name", + "n_name", + "p_partkey", + "p_mfgr", + "s_address", + "s_phone", + "s_comment", + ] + + query_result = ( + tmp.group_by("p_partkey") + .agg(nw.col("ps_supplycost").min().alias("ps_supplycost")) + .join( + tmp, + left_on=["p_partkey", "ps_supplycost"], + right_on=["p_partkey", "ps_supplycost"], + ) + .select(final_cols) + .sort( + ["s_acctbal", "n_name", "s_name", "p_partkey"], + descending=[True, False, False, False], + ) + .head(100) + ) + return query_result.collect().to_dict(as_series=False) + + +@pytest.mark.benchmark() +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q2(benchmark: Any, library: str, request: Any) -> None: + if library == "pandas" and parse_version(pd.__version__) < (1, 5): + request.applymarker(pytest.mark.xfail) + + read_fn = lib_to_reader[library] + region_ds = nw.from_native(read_fn("tests/data/region.parquet")).lazy() + nation_ds = nw.from_native(read_fn("tests/data/nation.parquet")).lazy() + supplier_ds = nw.from_native(read_fn("tests/data/supplier.parquet")).lazy() + part_ds = nw.from_native(read_fn("tests/data/part.parquet")).lazy() + part_supp_ds = nw.from_native(read_fn("tests/data/partsupp.parquet")).lazy() + + args = (region_ds, nation_ds, supplier_ds, part_ds, part_supp_ds) + + _ = benchmark(q2, *args) + + # Need to create expected compare_dicts(result, q2_expected) From d11c38fdb89f8bd136e91839615c729ca0cbed49 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sun, 18 Aug 2024 22:15:47 +0200 Subject: [PATCH 02/20] dev req --- requirements-dev.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-dev.txt b/requirements-dev.txt index c64b2b83a..f3d7d8207 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,6 +4,7 @@ polars[timezones] pre-commit pyarrow pytest +pytest-codspeed pytest-cov pytest-env hypothesis From 20d079e9f961439de651d261ccadbb6ba78b8e67 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sun, 18 Aug 2024 22:25:19 +0200 Subject: [PATCH 03/20] add token --- .github/workflows/codspeed.yml | 1 + requirements-dev.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index d4906a45e..ac73fda6d 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -25,4 +25,5 @@ jobs: - name: Run benchmarks uses: CodSpeedHQ/action@v3 with: + token: ${{ secrets.CODSPEED_TOKEN }} run: pytest tests/tpch --codspeed \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt index f3d7d8207..00b63744a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,6 +4,7 @@ polars[timezones] pre-commit pyarrow pytest +pytest-benchmark pytest-codspeed pytest-cov pytest-env From a3d432abd746f752bdf2ab18c01bfb391d21b4cd Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sun, 18 Aug 2024 22:27:03 +0200 Subject: [PATCH 04/20] try different py version --- .github/workflows/codspeed.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index ac73fda6d..5acd592ab 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -15,7 +15,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: - python-version: "3.12" + python-version: "3.10" - name: Install uv run: curl -LsSf https://astral.sh/uv/install.sh | sh - name: Install dependencies From bf298cdc08869baa588de0b3e2cc286d5f508963 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sun, 18 Aug 2024 22:27:35 +0200 Subject: [PATCH 05/20] try different py version --- .github/workflows/codspeed.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index 5acd592ab..350eaf13f 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -19,7 +19,7 @@ jobs: - name: Install uv run: curl -LsSf https://astral.sh/uv/install.sh | sh - name: Install dependencies - run: uv pip install -r requirements-dev.txt --system + run: uv pip install -e . -r requirements-dev.txt --system - name: show-deps run: uv pip freeze - name: Run benchmarks From 1a5022dd5d0079ef249f30197ed1ea1f7879dc60 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sun, 18 Aug 2024 22:35:39 +0200 Subject: [PATCH 06/20] refactor --- tests/tpch/q1_test.py | 59 ++++++++++--------------------------------- tests/tpch/q2_test.py | 5 ++-- 2 files changed, 16 insertions(+), 48 deletions(-) diff --git a/tests/tpch/q1_test.py b/tests/tpch/q1_test.py index 9facffd87..cbeb3ac85 100644 --- a/tests/tpch/q1_test.py +++ b/tests/tpch/q1_test.py @@ -1,8 +1,9 @@ from __future__ import annotations -from datetime import datetime +from datetime import date from typing import Any +import dask.dataframe as dd import pandas as pd import polars as pl import pyarrow.parquet as pq @@ -10,51 +11,19 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version -from tests.utils import compare_dicts -q1_expected = { - "l_returnflag": ["A", "N", "N", "R"], - "l_linestatus": ["F", "F", "O", "F"], - "sum_qty": [2109.0, 29.0, 3682.0, 1876.0], - "sum_base_price": [3114026.44, 39824.83, 5517101.99, 2947892.16], - "sum_disc_price": [2954950.8082, 39028.3334, 5205468.4852, 2816542.4816999994], - "sum_charge": [ - 3092840.4194289995, - 39808.900068, - 5406966.873419, - 2935797.8313019997, - ], - "avg_qty": [27.75, 29.0, 25.047619047619047, 26.422535211267604], - "avg_price": [ - 40974.032105263155, - 39824.83, - 37531.30605442177, - 41519.607887323946, - ], - "avg_disc": [0.05039473684210526, 0.02, 0.05537414965986395, 0.04507042253521127], - "count_order": [76, 1, 147, 71], +lib_to_reader = { + "pandas": pd.read_parquet, + "polars": pl.scan_parquet, + "dask": lambda path: dd.read_parquet(path, dtype_backend="pyarrow"), + "pyarrow": pq.read_table, } -def q1(library: str) -> dict[str, list[Any]]: - if library == "pandas": - df_raw = pd.read_parquet("tests/data/lineitem.parquet") - df_raw["l_shipdate"] = pd.to_datetime(df_raw["l_shipdate"]) - elif library == "polars": - df_raw = pl.scan_parquet("tests/data/lineitem.parquet") - elif library == "dask": - pytest.importorskip("dask") - pytest.importorskip("dask_expr", exc_type=ImportError) - import dask.dataframe as dd - - df_raw = dd.read_parquet("tests/data/lineitem.parquet", dtype_backend="pyarrow") - elif library == "pyarrow": - df_raw = pq.read_table("tests/data/lineitem.parquet") - - var_1 = datetime(1998, 9, 2) - df = nw.from_native(df_raw).lazy() +def q1(lineitem_ds: Any) -> Any: + var_1 = date(1998, 9, 2) query_result = ( - df.filter(nw.col("l_shipdate") <= var_1) + lineitem_ds.filter(nw.col("l_shipdate") <= var_1) .with_columns( disc_price=nw.col("l_extendedprice") * (1 - nw.col("l_discount")), charge=( @@ -78,15 +47,15 @@ def q1(library: str) -> dict[str, list[Any]]: ) .sort(["l_returnflag", "l_linestatus"]) ) - return query_result.collect().to_dict(as_series=False) + return query_result.collect() -@pytest.mark.benchmark() @pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) def test_q1(benchmark: Any, library: str, request: Any) -> None: if library == "pandas" and parse_version(pd.__version__) < (1, 5): request.applymarker(pytest.mark.xfail) - result = benchmark(q1, library) + read_fn = lib_to_reader[library] + lineitem_ds = nw.from_native(read_fn("tests/data/lineitem.parquet")).lazy() - compare_dicts(result, q1_expected) + _ = benchmark(q1, lineitem_ds) diff --git a/tests/tpch/q2_test.py b/tests/tpch/q2_test.py index 57bc8e2da..2786e4ec7 100644 --- a/tests/tpch/q2_test.py +++ b/tests/tpch/q2_test.py @@ -25,7 +25,7 @@ def q2( supplier_ds: Any, part_ds: Any, part_supp_ds: Any, -) -> dict[str, list[Any]]: +) -> Any: var_1 = 15 var_2 = "BRASS" var_3 = "EUROPE" @@ -68,10 +68,9 @@ def q2( ) .head(100) ) - return query_result.collect().to_dict(as_series=False) + return query_result.collect() -@pytest.mark.benchmark() @pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) def test_q2(benchmark: Any, library: str, request: Any) -> None: if library == "pandas" and parse_version(pd.__version__) < (1, 5): From 6c970e8019a93b5885775587baf1336f813431bd Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Sun, 18 Aug 2024 22:39:17 +0200 Subject: [PATCH 07/20] rm dask from main imports --- tests/tpch/q1_test.py | 17 +++++++++-------- tests/tpch/q2_test.py | 17 +++++++++-------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/tests/tpch/q1_test.py b/tests/tpch/q1_test.py index cbeb3ac85..db103df86 100644 --- a/tests/tpch/q1_test.py +++ b/tests/tpch/q1_test.py @@ -3,7 +3,6 @@ from datetime import date from typing import Any -import dask.dataframe as dd import pandas as pd import polars as pl import pyarrow.parquet as pq @@ -12,13 +11,6 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version -lib_to_reader = { - "pandas": pd.read_parquet, - "polars": pl.scan_parquet, - "dask": lambda path: dd.read_parquet(path, dtype_backend="pyarrow"), - "pyarrow": pq.read_table, -} - def q1(lineitem_ds: Any) -> Any: var_1 = date(1998, 9, 2) @@ -55,6 +47,15 @@ def test_q1(benchmark: Any, library: str, request: Any) -> None: if library == "pandas" and parse_version(pd.__version__) < (1, 5): request.applymarker(pytest.mark.xfail) + import dask.dataframe as dd + + lib_to_reader = { + "pandas": pd.read_parquet, + "polars": pl.scan_parquet, + "dask": lambda path: dd.read_parquet(path, dtype_backend="pyarrow"), + "pyarrow": pq.read_table, + } + read_fn = lib_to_reader[library] lineitem_ds = nw.from_native(read_fn("tests/data/lineitem.parquet")).lazy() diff --git a/tests/tpch/q2_test.py b/tests/tpch/q2_test.py index 2786e4ec7..36b599332 100644 --- a/tests/tpch/q2_test.py +++ b/tests/tpch/q2_test.py @@ -2,7 +2,6 @@ from typing import Any -import dask.dataframe as dd import pandas as pd import polars as pl import pyarrow.parquet as pq @@ -11,13 +10,6 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version -lib_to_reader = { - "pandas": pd.read_parquet, - "polars": pl.scan_parquet, - "dask": lambda path: dd.read_parquet(path, dtype_backend="pyarrow"), - "pyarrow": pq.read_table, -} - def q2( region_ds: Any, @@ -76,6 +68,15 @@ def test_q2(benchmark: Any, library: str, request: Any) -> None: if library == "pandas" and parse_version(pd.__version__) < (1, 5): request.applymarker(pytest.mark.xfail) + import dask.dataframe as dd + + lib_to_reader = { + "pandas": pd.read_parquet, + "polars": pl.scan_parquet, + "dask": lambda path: dd.read_parquet(path, dtype_backend="pyarrow"), + "pyarrow": pq.read_table, + } + read_fn = lib_to_reader[library] region_ds = nw.from_native(read_fn("tests/data/region.parquet")).lazy() nation_ds = nw.from_native(read_fn("tests/data/nation.parquet")).lazy() From eb2c34336a1673e03a1991368690a8350904f99c Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Mon, 19 Aug 2024 16:38:07 +0200 Subject: [PATCH 08/20] add tags --- .github/workflows/codspeed.yml | 3 +-- pyproject.toml | 8 +++++++- tests/tpch/q1_test.py | 2 ++ tests/tpch/q2_test.py | 2 ++ 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index 350eaf13f..839b5bec5 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -25,5 +25,4 @@ jobs: - name: Run benchmarks uses: CodSpeedHQ/action@v3 with: - token: ${{ secrets.CODSPEED_TOKEN }} - run: pytest tests/tpch --codspeed \ No newline at end of file + run: pytest tests/tpch -m "benchmark" --codspeed \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index d05e07c84..9c94e41d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,10 +96,16 @@ filterwarnings = [ 'ignore:.*but when imported by', ] xfail_strict = true -markers = ["slow: marks tests as slow (deselect with '-m \"not slow\"')"] +markers = [ + "slow: marks tests as slow (deselect with '-m \"not slow\"')", + "benchmark: marks tests as benchmark (deselect with '-m \"not benchmark\"')" +] env = [ "MODIN_ENGINE=python", ] +addopts = [ + "-m not benchmark", +] [tool.coverage.run] plugins = ["covdefaults"] diff --git a/tests/tpch/q1_test.py b/tests/tpch/q1_test.py index db103df86..f65327054 100644 --- a/tests/tpch/q1_test.py +++ b/tests/tpch/q1_test.py @@ -11,6 +11,8 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version +pytestmark = pytest.mark.benchmark + def q1(lineitem_ds: Any) -> Any: var_1 = date(1998, 9, 2) diff --git a/tests/tpch/q2_test.py b/tests/tpch/q2_test.py index 36b599332..bdad5f814 100644 --- a/tests/tpch/q2_test.py +++ b/tests/tpch/q2_test.py @@ -10,6 +10,8 @@ import narwhals.stable.v1 as nw from narwhals.utils import parse_version +pytestmark = pytest.mark.benchmark + def q2( region_ds: Any, From db13d7c0246acb78458b20276e64a1d6d7c1b7e7 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Mon, 19 Aug 2024 16:44:51 +0200 Subject: [PATCH 09/20] rm editable installation --- .github/workflows/codspeed.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index 839b5bec5..9ef979273 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -19,7 +19,7 @@ jobs: - name: Install uv run: curl -LsSf https://astral.sh/uv/install.sh | sh - name: Install dependencies - run: uv pip install -e . -r requirements-dev.txt --system + run: uv pip install -r requirements-dev.txt --system - name: show-deps run: uv pip freeze - name: Run benchmarks From 084818fa5944e7fff702007c0504a91daa98248a Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Mon, 19 Aug 2024 17:00:22 +0200 Subject: [PATCH 10/20] move to tpch folder --- .github/workflows/codspeed.yml | 4 ++-- pyproject.toml | 8 +------- tpch/scripts/__init__.py | 0 {tests/tpch => tpch/scripts}/q1_test.py | 6 +----- {tests/tpch => tpch/scripts}/q2_test.py | 8 +------- 5 files changed, 5 insertions(+), 21 deletions(-) create mode 100644 tpch/scripts/__init__.py rename {tests/tpch => tpch/scripts}/q1_test.py (89%) rename {tests/tpch => tpch/scripts}/q2_test.py (89%) diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index 9ef979273..b7549263c 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -15,7 +15,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: - python-version: "3.10" + python-version: "3.12" - name: Install uv run: curl -LsSf https://astral.sh/uv/install.sh | sh - name: Install dependencies @@ -25,4 +25,4 @@ jobs: - name: Run benchmarks uses: CodSpeedHQ/action@v3 with: - run: pytest tests/tpch -m "benchmark" --codspeed \ No newline at end of file + run: pytest tpch/scripts --codspeed \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 9c94e41d0..d05e07c84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,16 +96,10 @@ filterwarnings = [ 'ignore:.*but when imported by', ] xfail_strict = true -markers = [ - "slow: marks tests as slow (deselect with '-m \"not slow\"')", - "benchmark: marks tests as benchmark (deselect with '-m \"not benchmark\"')" -] +markers = ["slow: marks tests as slow (deselect with '-m \"not slow\"')"] env = [ "MODIN_ENGINE=python", ] -addopts = [ - "-m not benchmark", -] [tool.coverage.run] plugins = ["covdefaults"] diff --git a/tpch/scripts/__init__.py b/tpch/scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/tpch/q1_test.py b/tpch/scripts/q1_test.py similarity index 89% rename from tests/tpch/q1_test.py rename to tpch/scripts/q1_test.py index f65327054..53b20895c 100644 --- a/tests/tpch/q1_test.py +++ b/tpch/scripts/q1_test.py @@ -9,7 +9,6 @@ import pytest import narwhals.stable.v1 as nw -from narwhals.utils import parse_version pytestmark = pytest.mark.benchmark @@ -45,10 +44,7 @@ def q1(lineitem_ds: Any) -> Any: @pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) -def test_q1(benchmark: Any, library: str, request: Any) -> None: - if library == "pandas" and parse_version(pd.__version__) < (1, 5): - request.applymarker(pytest.mark.xfail) - +def test_q1(benchmark: Any, library: str) -> None: import dask.dataframe as dd lib_to_reader = { diff --git a/tests/tpch/q2_test.py b/tpch/scripts/q2_test.py similarity index 89% rename from tests/tpch/q2_test.py rename to tpch/scripts/q2_test.py index bdad5f814..20af01b19 100644 --- a/tests/tpch/q2_test.py +++ b/tpch/scripts/q2_test.py @@ -8,7 +8,6 @@ import pytest import narwhals.stable.v1 as nw -from narwhals.utils import parse_version pytestmark = pytest.mark.benchmark @@ -66,10 +65,7 @@ def q2( @pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) -def test_q2(benchmark: Any, library: str, request: Any) -> None: - if library == "pandas" and parse_version(pd.__version__) < (1, 5): - request.applymarker(pytest.mark.xfail) - +def test_q2(benchmark: Any, library: str) -> None: import dask.dataframe as dd lib_to_reader = { @@ -89,5 +85,3 @@ def test_q2(benchmark: Any, library: str, request: Any) -> None: args = (region_ds, nation_ds, supplier_ds, part_ds, part_supp_ds) _ = benchmark(q2, *args) - - # Need to create expected compare_dicts(result, q2_expected) From a3379df816a2467f5b8788a920da7871f1e5a0ce Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Mon, 19 Aug 2024 17:05:08 +0200 Subject: [PATCH 11/20] rm extra dep and check only q1 --- requirements-dev.txt | 1 - tpch/scripts/q1_test.py | 2 - tpch/scripts/q2_test.py | 87 ----------------------------------------- 3 files changed, 90 deletions(-) delete mode 100644 tpch/scripts/q2_test.py diff --git a/requirements-dev.txt b/requirements-dev.txt index 01dc97e75..baf54fcf4 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,7 +4,6 @@ polars pre-commit pyarrow pytest -pytest-benchmark pytest-codspeed pytest-cov pytest-env diff --git a/tpch/scripts/q1_test.py b/tpch/scripts/q1_test.py index 53b20895c..47f8fc028 100644 --- a/tpch/scripts/q1_test.py +++ b/tpch/scripts/q1_test.py @@ -10,8 +10,6 @@ import narwhals.stable.v1 as nw -pytestmark = pytest.mark.benchmark - def q1(lineitem_ds: Any) -> Any: var_1 = date(1998, 9, 2) diff --git a/tpch/scripts/q2_test.py b/tpch/scripts/q2_test.py deleted file mode 100644 index 20af01b19..000000000 --- a/tpch/scripts/q2_test.py +++ /dev/null @@ -1,87 +0,0 @@ -from __future__ import annotations - -from typing import Any - -import pandas as pd -import polars as pl -import pyarrow.parquet as pq -import pytest - -import narwhals.stable.v1 as nw - -pytestmark = pytest.mark.benchmark - - -def q2( - region_ds: Any, - nation_ds: Any, - supplier_ds: Any, - part_ds: Any, - part_supp_ds: Any, -) -> Any: - var_1 = 15 - var_2 = "BRASS" - var_3 = "EUROPE" - - tmp = ( - part_ds.join(part_supp_ds, left_on="p_partkey", right_on="ps_partkey") - .join(supplier_ds, left_on="ps_suppkey", right_on="s_suppkey") - .join(nation_ds, left_on="s_nationkey", right_on="n_nationkey") - .join(region_ds, left_on="n_regionkey", right_on="r_regionkey") - .filter( - nw.col("p_size") == var_1, - nw.col("p_type").str.ends_with(var_2), - nw.col("r_name") == var_3, - ) - ) - - final_cols = [ - "s_acctbal", - "s_name", - "n_name", - "p_partkey", - "p_mfgr", - "s_address", - "s_phone", - "s_comment", - ] - - query_result = ( - tmp.group_by("p_partkey") - .agg(nw.col("ps_supplycost").min().alias("ps_supplycost")) - .join( - tmp, - left_on=["p_partkey", "ps_supplycost"], - right_on=["p_partkey", "ps_supplycost"], - ) - .select(final_cols) - .sort( - ["s_acctbal", "n_name", "s_name", "p_partkey"], - descending=[True, False, False, False], - ) - .head(100) - ) - return query_result.collect() - - -@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) -def test_q2(benchmark: Any, library: str) -> None: - import dask.dataframe as dd - - lib_to_reader = { - "pandas": pd.read_parquet, - "polars": pl.scan_parquet, - "dask": lambda path: dd.read_parquet(path, dtype_backend="pyarrow"), - "pyarrow": pq.read_table, - } - - read_fn = lib_to_reader[library] - region_ds = nw.from_native(read_fn("tests/data/region.parquet")).lazy() - nation_ds = nw.from_native(read_fn("tests/data/nation.parquet")).lazy() - supplier_ds = nw.from_native(read_fn("tests/data/supplier.parquet")).lazy() - part_ds = nw.from_native(read_fn("tests/data/part.parquet")).lazy() - part_supp_ds = nw.from_native(read_fn("tests/data/partsupp.parquet")).lazy() - - args = (region_ds, nation_ds, supplier_ds, part_ds, part_supp_ds) - - _ = benchmark(q2, *args) From d72d85eff4b50830208f827dcef7ca0ff995dd1f Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Mon, 19 Aug 2024 17:12:16 +0200 Subject: [PATCH 12/20] dumb example --- tpch/scripts/naive_test.py | 11 ++++++++ tpch/scripts/q1_test.py | 58 -------------------------------------- 2 files changed, 11 insertions(+), 58 deletions(-) create mode 100644 tpch/scripts/naive_test.py delete mode 100644 tpch/scripts/q1_test.py diff --git a/tpch/scripts/naive_test.py b/tpch/scripts/naive_test.py new file mode 100644 index 000000000..c175a657b --- /dev/null +++ b/tpch/scripts/naive_test.py @@ -0,0 +1,11 @@ +from statistics import mean +from typing import Any + + +def test_mean_performance(benchmark: Any) -> None: + # Precompute some data useful for the benchmark but that should not be + # included in the benchmark time + data = [1, 2, 3, 4, 5] + + # Benchmark the execution of the function + benchmark(lambda: mean(data)) diff --git a/tpch/scripts/q1_test.py b/tpch/scripts/q1_test.py deleted file mode 100644 index 47f8fc028..000000000 --- a/tpch/scripts/q1_test.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import annotations - -from datetime import date -from typing import Any - -import pandas as pd -import polars as pl -import pyarrow.parquet as pq -import pytest - -import narwhals.stable.v1 as nw - - -def q1(lineitem_ds: Any) -> Any: - var_1 = date(1998, 9, 2) - query_result = ( - lineitem_ds.filter(nw.col("l_shipdate") <= var_1) - .with_columns( - disc_price=nw.col("l_extendedprice") * (1 - nw.col("l_discount")), - charge=( - nw.col("l_extendedprice") - * (1.0 - nw.col("l_discount")) - * (1.0 + nw.col("l_tax")) - ), - ) - .group_by(["l_returnflag", "l_linestatus"]) - .agg( - [ - nw.col("l_quantity").sum().alias("sum_qty"), - nw.col("l_extendedprice").sum().alias("sum_base_price"), - nw.col("disc_price").sum().alias("sum_disc_price"), - nw.col("charge").sum().alias("sum_charge"), - nw.col("l_quantity").mean().alias("avg_qty"), - nw.col("l_extendedprice").mean().alias("avg_price"), - nw.col("l_discount").mean().alias("avg_disc"), - nw.len().alias("count_order"), - ], - ) - .sort(["l_returnflag", "l_linestatus"]) - ) - return query_result.collect() - - -@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) -def test_q1(benchmark: Any, library: str) -> None: - import dask.dataframe as dd - - lib_to_reader = { - "pandas": pd.read_parquet, - "polars": pl.scan_parquet, - "dask": lambda path: dd.read_parquet(path, dtype_backend="pyarrow"), - "pyarrow": pq.read_table, - } - - read_fn = lib_to_reader[library] - lineitem_ds = nw.from_native(read_fn("tests/data/lineitem.parquet")).lazy() - - _ = benchmark(q1, lineitem_ds) From be37a763132afda91b4c0708ee8e0c80b4905fc0 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Mon, 19 Aug 2024 17:31:48 +0200 Subject: [PATCH 13/20] pandas and polars only --- tpch/scripts/q1_test.py | 54 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 tpch/scripts/q1_test.py diff --git a/tpch/scripts/q1_test.py b/tpch/scripts/q1_test.py new file mode 100644 index 000000000..b28680920 --- /dev/null +++ b/tpch/scripts/q1_test.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from datetime import date +from typing import Any + +import pytest + +import narwhals.stable.v1 as nw + + +def q1(lineitem_ds: Any) -> Any: + var_1 = date(1998, 9, 2) + query_result = ( + lineitem_ds.filter(nw.col("l_shipdate") <= var_1) + .with_columns( + disc_price=nw.col("l_extendedprice") * (1 - nw.col("l_discount")), + charge=( + nw.col("l_extendedprice") + * (1.0 - nw.col("l_discount")) + * (1.0 + nw.col("l_tax")) + ), + ) + .group_by(["l_returnflag", "l_linestatus"]) + .agg( + [ + nw.col("l_quantity").sum().alias("sum_qty"), + nw.col("l_extendedprice").sum().alias("sum_base_price"), + nw.col("disc_price").sum().alias("sum_disc_price"), + nw.col("charge").sum().alias("sum_charge"), + nw.col("l_quantity").mean().alias("avg_qty"), + nw.col("l_extendedprice").mean().alias("avg_price"), + nw.col("l_discount").mean().alias("avg_disc"), + nw.len().alias("count_order"), + ], + ) + .sort(["l_returnflag", "l_linestatus"]) + ) + return query_result.collect() + + +@pytest.mark.parametrize("library", ["pandas", "polars"]) +def test_q1(benchmark: Any, library: str) -> None: + import pandas as pd + import polars as pl + + lib_to_reader = { + "pandas": pd.read_parquet, + "polars": pl.scan_parquet, + } + + read_fn = lib_to_reader[library] + lineitem_ds = nw.from_native(read_fn("tests/data/lineitem.parquet")).lazy() + + _ = benchmark(q1, lineitem_ds) From d2d9559a889ffd0dbf63e5bf27f5711cb01ac755 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Mon, 19 Aug 2024 17:33:55 +0200 Subject: [PATCH 14/20] ok just polars then --- tpch/scripts/q1_test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tpch/scripts/q1_test.py b/tpch/scripts/q1_test.py index b28680920..44a256987 100644 --- a/tpch/scripts/q1_test.py +++ b/tpch/scripts/q1_test.py @@ -38,13 +38,11 @@ def q1(lineitem_ds: Any) -> Any: return query_result.collect() -@pytest.mark.parametrize("library", ["pandas", "polars"]) +@pytest.mark.parametrize("library", ["polars"]) def test_q1(benchmark: Any, library: str) -> None: - import pandas as pd import polars as pl lib_to_reader = { - "pandas": pd.read_parquet, "polars": pl.scan_parquet, } From 3a114621ff8e74882d26444b9302b71acbe00a74 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Mon, 19 Aug 2024 17:37:13 +0200 Subject: [PATCH 15/20] also pyarrow --- tpch/scripts/q1_test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tpch/scripts/q1_test.py b/tpch/scripts/q1_test.py index 44a256987..262cb96b7 100644 --- a/tpch/scripts/q1_test.py +++ b/tpch/scripts/q1_test.py @@ -3,6 +3,8 @@ from datetime import date from typing import Any +import polars as pl +import pyarrow.parquet as pq import pytest import narwhals.stable.v1 as nw @@ -38,12 +40,11 @@ def q1(lineitem_ds: Any) -> Any: return query_result.collect() -@pytest.mark.parametrize("library", ["polars"]) +@pytest.mark.parametrize("library", ["polars", "pyarrow"]) def test_q1(benchmark: Any, library: str) -> None: - import polars as pl - lib_to_reader = { "polars": pl.scan_parquet, + "pyarrow": pq.read_table, } read_fn = lib_to_reader[library] From f140177b8b8715c750b7f0e983ec71d208a82fec Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Mon, 19 Aug 2024 17:40:39 +0200 Subject: [PATCH 16/20] pin numpy --- .github/workflows/codspeed.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index b7549263c..3e7f78bd2 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -19,7 +19,8 @@ jobs: - name: Install uv run: curl -LsSf https://astral.sh/uv/install.sh | sh - name: Install dependencies - run: uv pip install -r requirements-dev.txt --system + run: | + uv pip install -r requirements-dev.txt "numpy<2.0.0" --system - name: show-deps run: uv pip freeze - name: Run benchmarks From 989a09e8349d6d701b684b2aa6b4500ceb694a4b Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Mon, 19 Aug 2024 17:44:58 +0200 Subject: [PATCH 17/20] back to actual queries --- tpch/scripts/naive_test.py | 11 ----- tpch/scripts/q1_test.py | 6 ++- tpch/scripts/q2_test.py | 84 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 12 deletions(-) delete mode 100644 tpch/scripts/naive_test.py create mode 100644 tpch/scripts/q2_test.py diff --git a/tpch/scripts/naive_test.py b/tpch/scripts/naive_test.py deleted file mode 100644 index c175a657b..000000000 --- a/tpch/scripts/naive_test.py +++ /dev/null @@ -1,11 +0,0 @@ -from statistics import mean -from typing import Any - - -def test_mean_performance(benchmark: Any) -> None: - # Precompute some data useful for the benchmark but that should not be - # included in the benchmark time - data = [1, 2, 3, 4, 5] - - # Benchmark the execution of the function - benchmark(lambda: mean(data)) diff --git a/tpch/scripts/q1_test.py b/tpch/scripts/q1_test.py index 262cb96b7..ea7e91811 100644 --- a/tpch/scripts/q1_test.py +++ b/tpch/scripts/q1_test.py @@ -3,6 +3,8 @@ from datetime import date from typing import Any +import dask.dataframe as dd +import pandas as pd import polars as pl import pyarrow.parquet as pq import pytest @@ -40,10 +42,12 @@ def q1(lineitem_ds: Any) -> Any: return query_result.collect() -@pytest.mark.parametrize("library", ["polars", "pyarrow"]) +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) def test_q1(benchmark: Any, library: str) -> None: lib_to_reader = { + "pandas": pd.read_parquet, "polars": pl.scan_parquet, + "dask": lambda path: dd.read_parquet(path, dtype_backend="pyarrow"), "pyarrow": pq.read_table, } diff --git a/tpch/scripts/q2_test.py b/tpch/scripts/q2_test.py new file mode 100644 index 000000000..7a6e62dbe --- /dev/null +++ b/tpch/scripts/q2_test.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +from typing import Any + +import dask.dataframe as dd +import pandas as pd +import polars as pl +import pyarrow.parquet as pq +import pytest + +import narwhals.stable.v1 as nw + + +def q2( + region_ds: Any, + nation_ds: Any, + supplier_ds: Any, + part_ds: Any, + part_supp_ds: Any, +) -> Any: + var_1 = 15 + var_2 = "BRASS" + var_3 = "EUROPE" + + tmp = ( + part_ds.join(part_supp_ds, left_on="p_partkey", right_on="ps_partkey") + .join(supplier_ds, left_on="ps_suppkey", right_on="s_suppkey") + .join(nation_ds, left_on="s_nationkey", right_on="n_nationkey") + .join(region_ds, left_on="n_regionkey", right_on="r_regionkey") + .filter( + nw.col("p_size") == var_1, + nw.col("p_type").str.ends_with(var_2), + nw.col("r_name") == var_3, + ) + ) + + final_cols = [ + "s_acctbal", + "s_name", + "n_name", + "p_partkey", + "p_mfgr", + "s_address", + "s_phone", + "s_comment", + ] + + query_result = ( + tmp.group_by("p_partkey") + .agg(nw.col("ps_supplycost").min().alias("ps_supplycost")) + .join( + tmp, + left_on=["p_partkey", "ps_supplycost"], + right_on=["p_partkey", "ps_supplycost"], + ) + .select(final_cols) + .sort( + ["s_acctbal", "n_name", "s_name", "p_partkey"], + descending=[True, False, False, False], + ) + .head(100) + ) + return query_result.collect() + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q2(benchmark: Any, library: str) -> None: + lib_to_reader = { + "pandas": pd.read_parquet, + "polars": pl.scan_parquet, + "dask": lambda path: dd.read_parquet(path, dtype_backend="pyarrow"), + "pyarrow": pq.read_table, + } + + read_fn = lib_to_reader[library] + region_ds = nw.from_native(read_fn("tests/data/region.parquet")).lazy() + nation_ds = nw.from_native(read_fn("tests/data/nation.parquet")).lazy() + supplier_ds = nw.from_native(read_fn("tests/data/supplier.parquet")).lazy() + part_ds = nw.from_native(read_fn("tests/data/part.parquet")).lazy() + part_supp_ds = nw.from_native(read_fn("tests/data/partsupp.parquet")).lazy() + + args = (region_ds, nation_ds, supplier_ds, part_ds, part_supp_ds) + + _ = benchmark(q2, *args) From eeb73472a2a9ac776cb25e7a289fcd393399f1cc Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Mon, 19 Aug 2024 20:56:50 +0200 Subject: [PATCH 18/20] refactor --- .github/workflows/codspeed.yml | 2 +- {tests/tpch => tpch/benchmarks}/__init__.py | 0 tpch/benchmarks/queries.py | 120 ++++++++++++++++++++ tpch/benchmarks/queries_test.py | 35 ++++++ tpch/benchmarks/utils.py | 11 ++ tpch/scripts/__init__.py | 0 tpch/scripts/q1_test.py | 57 ---------- tpch/scripts/q2_test.py | 84 -------------- 8 files changed, 167 insertions(+), 142 deletions(-) rename {tests/tpch => tpch/benchmarks}/__init__.py (100%) create mode 100644 tpch/benchmarks/queries.py create mode 100644 tpch/benchmarks/queries_test.py create mode 100644 tpch/benchmarks/utils.py delete mode 100644 tpch/scripts/__init__.py delete mode 100644 tpch/scripts/q1_test.py delete mode 100644 tpch/scripts/q2_test.py diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index 3e7f78bd2..a4daf42be 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -26,4 +26,4 @@ jobs: - name: Run benchmarks uses: CodSpeedHQ/action@v3 with: - run: pytest tpch/scripts --codspeed \ No newline at end of file + run: pytest tpch/benchmarks --codspeed \ No newline at end of file diff --git a/tests/tpch/__init__.py b/tpch/benchmarks/__init__.py similarity index 100% rename from tests/tpch/__init__.py rename to tpch/benchmarks/__init__.py diff --git a/tpch/benchmarks/queries.py b/tpch/benchmarks/queries.py new file mode 100644 index 000000000..fa3de4de4 --- /dev/null +++ b/tpch/benchmarks/queries.py @@ -0,0 +1,120 @@ +from __future__ import annotations + +from datetime import date + +import narwhals.stable.v1 as nw + + +def q1(lineitem: nw.LazyFrame) -> nw.DataFrame: + var_1 = date(1998, 9, 2) + query_result = ( + lineitem.filter(nw.col("l_shipdate") <= var_1) + .with_columns( + disc_price=nw.col("l_extendedprice") * (1 - nw.col("l_discount")), + charge=( + nw.col("l_extendedprice") + * (1.0 - nw.col("l_discount")) + * (1.0 + nw.col("l_tax")) + ), + ) + .group_by(["l_returnflag", "l_linestatus"]) + .agg( + [ + nw.col("l_quantity").sum().alias("sum_qty"), + nw.col("l_extendedprice").sum().alias("sum_base_price"), + nw.col("disc_price").sum().alias("sum_disc_price"), + nw.col("charge").sum().alias("sum_charge"), + nw.col("l_quantity").mean().alias("avg_qty"), + nw.col("l_extendedprice").mean().alias("avg_price"), + nw.col("l_discount").mean().alias("avg_disc"), + nw.len().alias("count_order"), + ], + ) + .sort(["l_returnflag", "l_linestatus"]) + ) + return query_result.collect() + + +def q2( + region: nw.LazyFrame, + nation: nw.LazyFrame, + supplier: nw.LazyFrame, + part: nw.LazyFrame, + part_supp: nw.LazyFrame, +) -> nw.DataFrame: + var_1 = 15 + var_2 = "BRASS" + var_3 = "EUROPE" + + tmp = ( + part.join(part_supp, left_on="p_partkey", right_on="ps_partkey") + .join(supplier, left_on="ps_suppkey", right_on="s_suppkey") + .join(nation, left_on="s_nationkey", right_on="n_nationkey") + .join(region, left_on="n_regionkey", right_on="r_regionkey") + .filter( + nw.col("p_size") == var_1, + nw.col("p_type").str.ends_with(var_2), + nw.col("r_name") == var_3, + ) + ) + + final_cols = [ + "s_acctbal", + "s_name", + "n_name", + "p_partkey", + "p_mfgr", + "s_address", + "s_phone", + "s_comment", + ] + + return ( + tmp.group_by("p_partkey") + .agg(nw.col("ps_supplycost").min().alias("ps_supplycost")) + .join( + tmp, + left_on=["p_partkey", "ps_supplycost"], + right_on=["p_partkey", "ps_supplycost"], + ) + .select(final_cols) + .sort( + ["s_acctbal", "n_name", "s_name", "p_partkey"], + descending=[True, False, False, False], + ) + .head(100) + .collect() + ) + + +def q3( + customer: nw.LazyFrame, line_item: nw.LazyFrame, orders: nw.LazyFrame +) -> nw.DataFrame: + var_1 = var_2 = date(1995, 3, 15) + var_3 = "BUILDING" + + return ( + customer.filter(nw.col("c_mktsegment") == var_3) + .join(orders, left_on="c_custkey", right_on="o_custkey") + .join(line_item, left_on="o_orderkey", right_on="l_orderkey") + .filter( + nw.col("o_orderdate") < var_2, + nw.col("l_shipdate") > var_1, + ) + .with_columns( + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias("revenue") + ) + .group_by(["o_orderkey", "o_orderdate", "o_shippriority"]) + .agg([nw.sum("revenue")]) + .select( + [ + nw.col("o_orderkey").alias("l_orderkey"), + "revenue", + "o_orderdate", + "o_shippriority", + ] + ) + .sort(by=["revenue", "o_orderdate"], descending=[True, False]) + .head(10) + .collect() + ) diff --git a/tpch/benchmarks/queries_test.py b/tpch/benchmarks/queries_test.py new file mode 100644 index 000000000..efd743fa1 --- /dev/null +++ b/tpch/benchmarks/queries_test.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.queries import q1 +from tpch.benchmarks.queries import q2 +from tpch.benchmarks.queries import q3 +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_queries(benchmark: BenchmarkFixture, library: str) -> None: + read_fn = lib_to_reader[library] + + customer = nw.from_native(read_fn(DATA_FOLDER / "customer.parquet")).lazy() + lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() + nation = nw.from_native(read_fn(DATA_FOLDER / "nation.parquet")).lazy() + orders = nw.from_native(read_fn(DATA_FOLDER / "orders.parquet")).lazy() + part = nw.from_native(read_fn(DATA_FOLDER / "part.parquet")).lazy() + partsupp = nw.from_native(read_fn(DATA_FOLDER / "partsupp.parquet")).lazy() + region = nw.from_native(read_fn(DATA_FOLDER / "region.parquet")).lazy() + supplier = nw.from_native(read_fn(DATA_FOLDER / "supplier.parquet")).lazy() + + _ = benchmark(q1, lineitem) + _ = benchmark(q2, region, nation, supplier, part, partsupp) + _ = benchmark(q3, customer, lineitem, orders) diff --git a/tpch/benchmarks/utils.py b/tpch/benchmarks/utils.py new file mode 100644 index 000000000..d0e22e59b --- /dev/null +++ b/tpch/benchmarks/utils.py @@ -0,0 +1,11 @@ +import dask.dataframe as dd +import pandas as pd +import polars as pl +import pyarrow.parquet as pq + +lib_to_reader = { + "dask": lambda path: dd.read_parquet(path, dtype_backend="pyarrow"), + "pandas": pd.read_parquet, + "polars": pl.scan_parquet, + "pyarrow": pq.read_table, +} diff --git a/tpch/scripts/__init__.py b/tpch/scripts/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tpch/scripts/q1_test.py b/tpch/scripts/q1_test.py deleted file mode 100644 index ea7e91811..000000000 --- a/tpch/scripts/q1_test.py +++ /dev/null @@ -1,57 +0,0 @@ -from __future__ import annotations - -from datetime import date -from typing import Any - -import dask.dataframe as dd -import pandas as pd -import polars as pl -import pyarrow.parquet as pq -import pytest - -import narwhals.stable.v1 as nw - - -def q1(lineitem_ds: Any) -> Any: - var_1 = date(1998, 9, 2) - query_result = ( - lineitem_ds.filter(nw.col("l_shipdate") <= var_1) - .with_columns( - disc_price=nw.col("l_extendedprice") * (1 - nw.col("l_discount")), - charge=( - nw.col("l_extendedprice") - * (1.0 - nw.col("l_discount")) - * (1.0 + nw.col("l_tax")) - ), - ) - .group_by(["l_returnflag", "l_linestatus"]) - .agg( - [ - nw.col("l_quantity").sum().alias("sum_qty"), - nw.col("l_extendedprice").sum().alias("sum_base_price"), - nw.col("disc_price").sum().alias("sum_disc_price"), - nw.col("charge").sum().alias("sum_charge"), - nw.col("l_quantity").mean().alias("avg_qty"), - nw.col("l_extendedprice").mean().alias("avg_price"), - nw.col("l_discount").mean().alias("avg_disc"), - nw.len().alias("count_order"), - ], - ) - .sort(["l_returnflag", "l_linestatus"]) - ) - return query_result.collect() - - -@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) -def test_q1(benchmark: Any, library: str) -> None: - lib_to_reader = { - "pandas": pd.read_parquet, - "polars": pl.scan_parquet, - "dask": lambda path: dd.read_parquet(path, dtype_backend="pyarrow"), - "pyarrow": pq.read_table, - } - - read_fn = lib_to_reader[library] - lineitem_ds = nw.from_native(read_fn("tests/data/lineitem.parquet")).lazy() - - _ = benchmark(q1, lineitem_ds) diff --git a/tpch/scripts/q2_test.py b/tpch/scripts/q2_test.py deleted file mode 100644 index 7a6e62dbe..000000000 --- a/tpch/scripts/q2_test.py +++ /dev/null @@ -1,84 +0,0 @@ -from __future__ import annotations - -from typing import Any - -import dask.dataframe as dd -import pandas as pd -import polars as pl -import pyarrow.parquet as pq -import pytest - -import narwhals.stable.v1 as nw - - -def q2( - region_ds: Any, - nation_ds: Any, - supplier_ds: Any, - part_ds: Any, - part_supp_ds: Any, -) -> Any: - var_1 = 15 - var_2 = "BRASS" - var_3 = "EUROPE" - - tmp = ( - part_ds.join(part_supp_ds, left_on="p_partkey", right_on="ps_partkey") - .join(supplier_ds, left_on="ps_suppkey", right_on="s_suppkey") - .join(nation_ds, left_on="s_nationkey", right_on="n_nationkey") - .join(region_ds, left_on="n_regionkey", right_on="r_regionkey") - .filter( - nw.col("p_size") == var_1, - nw.col("p_type").str.ends_with(var_2), - nw.col("r_name") == var_3, - ) - ) - - final_cols = [ - "s_acctbal", - "s_name", - "n_name", - "p_partkey", - "p_mfgr", - "s_address", - "s_phone", - "s_comment", - ] - - query_result = ( - tmp.group_by("p_partkey") - .agg(nw.col("ps_supplycost").min().alias("ps_supplycost")) - .join( - tmp, - left_on=["p_partkey", "ps_supplycost"], - right_on=["p_partkey", "ps_supplycost"], - ) - .select(final_cols) - .sort( - ["s_acctbal", "n_name", "s_name", "p_partkey"], - descending=[True, False, False, False], - ) - .head(100) - ) - return query_result.collect() - - -@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) -def test_q2(benchmark: Any, library: str) -> None: - lib_to_reader = { - "pandas": pd.read_parquet, - "polars": pl.scan_parquet, - "dask": lambda path: dd.read_parquet(path, dtype_backend="pyarrow"), - "pyarrow": pq.read_table, - } - - read_fn = lib_to_reader[library] - region_ds = nw.from_native(read_fn("tests/data/region.parquet")).lazy() - nation_ds = nw.from_native(read_fn("tests/data/nation.parquet")).lazy() - supplier_ds = nw.from_native(read_fn("tests/data/supplier.parquet")).lazy() - part_ds = nw.from_native(read_fn("tests/data/part.parquet")).lazy() - part_supp_ds = nw.from_native(read_fn("tests/data/partsupp.parquet")).lazy() - - args = (region_ds, nation_ds, supplier_ds, part_ds, part_supp_ds) - - _ = benchmark(q2, *args) From bd07b0e0c869d08a021ec0c3309197eb33b9e0e5 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 20 Aug 2024 14:55:29 +0200 Subject: [PATCH 19/20] let's try now :) --- tpch/benchmarks/queries_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tpch/benchmarks/queries_test.py b/tpch/benchmarks/queries_test.py index efd743fa1..f71c53183 100644 --- a/tpch/benchmarks/queries_test.py +++ b/tpch/benchmarks/queries_test.py @@ -30,6 +30,6 @@ def test_queries(benchmark: BenchmarkFixture, library: str) -> None: region = nw.from_native(read_fn(DATA_FOLDER / "region.parquet")).lazy() supplier = nw.from_native(read_fn(DATA_FOLDER / "supplier.parquet")).lazy() - _ = benchmark(q1, lineitem) - _ = benchmark(q2, region, nation, supplier, part, partsupp) - _ = benchmark(q3, customer, lineitem, orders) + q1_result = benchmark(q1, lineitem) # noqa: F841 + q2_result = benchmark(q2, region, nation, supplier, part, partsupp) # noqa: F841 + q3_result = benchmark(q3, customer, lineitem, orders) # noqa: F841 From a1eee960a94cc03cfec97c36c9752b4fff6dd062 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 20 Aug 2024 17:18:34 +0200 Subject: [PATCH 20/20] all available queries --- tpch/benchmarks/q10_test.py | 71 +++++++++++++++++++ tpch/benchmarks/q11_test.py | 53 ++++++++++++++ tpch/benchmarks/q15_test.py | 54 ++++++++++++++ tpch/benchmarks/q17_test.py | 46 ++++++++++++ tpch/benchmarks/q18_test.py | 56 +++++++++++++++ tpch/benchmarks/q19_test.py | 61 ++++++++++++++++ tpch/benchmarks/q1_test.py | 52 ++++++++++++++ tpch/benchmarks/q20_test.py | 71 +++++++++++++++++++ tpch/benchmarks/q21_test.py | 63 +++++++++++++++++ tpch/benchmarks/q2_test.py | 79 +++++++++++++++++++++ tpch/benchmarks/q3_test.py | 59 ++++++++++++++++ tpch/benchmarks/q4_test.py | 49 +++++++++++++ tpch/benchmarks/q5_test.py | 65 +++++++++++++++++ tpch/benchmarks/q6_test.py | 43 ++++++++++++ tpch/benchmarks/q7_test.py | 79 +++++++++++++++++++++ tpch/benchmarks/q8_test.py | 76 ++++++++++++++++++++ tpch/benchmarks/q9_test.py | 66 ++++++++++++++++++ tpch/benchmarks/queries.py | 120 -------------------------------- tpch/benchmarks/queries_test.py | 35 ---------- tpch/benchmarks/utils.py | 2 +- 20 files changed, 1044 insertions(+), 156 deletions(-) create mode 100644 tpch/benchmarks/q10_test.py create mode 100644 tpch/benchmarks/q11_test.py create mode 100644 tpch/benchmarks/q15_test.py create mode 100644 tpch/benchmarks/q17_test.py create mode 100644 tpch/benchmarks/q18_test.py create mode 100644 tpch/benchmarks/q19_test.py create mode 100644 tpch/benchmarks/q1_test.py create mode 100644 tpch/benchmarks/q20_test.py create mode 100644 tpch/benchmarks/q21_test.py create mode 100644 tpch/benchmarks/q2_test.py create mode 100644 tpch/benchmarks/q3_test.py create mode 100644 tpch/benchmarks/q4_test.py create mode 100644 tpch/benchmarks/q5_test.py create mode 100644 tpch/benchmarks/q6_test.py create mode 100644 tpch/benchmarks/q7_test.py create mode 100644 tpch/benchmarks/q8_test.py create mode 100644 tpch/benchmarks/q9_test.py delete mode 100644 tpch/benchmarks/queries.py delete mode 100644 tpch/benchmarks/queries_test.py diff --git a/tpch/benchmarks/q10_test.py b/tpch/benchmarks/q10_test.py new file mode 100644 index 000000000..70fb9f924 --- /dev/null +++ b/tpch/benchmarks/q10_test.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +from datetime import datetime +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q10(benchmark: BenchmarkFixture, library: str) -> None: + read_fn = lib_to_reader[library] + + customer = nw.from_native(read_fn(DATA_FOLDER / "customer.parquet")).lazy() + lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() + nation = nw.from_native(read_fn(DATA_FOLDER / "nation.parquet")).lazy() + orders = nw.from_native(read_fn(DATA_FOLDER / "orders.parquet")).lazy() + + _ = benchmark(q10, customer, nation, lineitem, orders) + + +def q10( + customer: nw.LazyFrame, + nation: nw.LazyFrame, + lineitem: nw.LazyFrame, + orders: nw.LazyFrame, +) -> nw.DataFrame: + var1 = datetime(1993, 10, 1) + var2 = datetime(1994, 1, 1) + + return ( + customer.join(orders, left_on="c_custkey", right_on="o_custkey") + .join(lineitem, left_on="o_orderkey", right_on="l_orderkey") + .join(nation, left_on="c_nationkey", right_on="n_nationkey") + .filter(nw.col("o_orderdate").is_between(var1, var2, closed="left")) + .filter(nw.col("l_returnflag") == "R") + .with_columns( + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias("revenue") + ) + .group_by( + "c_custkey", + "c_name", + "c_acctbal", + "c_phone", + "n_name", + "c_address", + "c_comment", + ) + .agg(nw.sum("revenue")) + .select( + "c_custkey", + "c_name", + "revenue", + "c_acctbal", + "n_name", + "c_address", + "c_phone", + "c_comment", + ) + .sort(by="revenue", descending=True) + .head(20) + .collect() + ) diff --git a/tpch/benchmarks/q11_test.py b/tpch/benchmarks/q11_test.py new file mode 100644 index 000000000..e36af3afc --- /dev/null +++ b/tpch/benchmarks/q11_test.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q11(benchmark: BenchmarkFixture, library: str) -> None: + read_fn = lib_to_reader[library] + + nation = nw.from_native(read_fn(DATA_FOLDER / "nation.parquet")).lazy() + partsupp = nw.from_native(read_fn(DATA_FOLDER / "partsupp.parquet")).lazy() + supplier = nw.from_native(read_fn(DATA_FOLDER / "supplier.parquet")).lazy() + + _ = benchmark(q11, partsupp, nation, supplier) + + +def q11( + partsupp: nw.LazyFrame, nation: nw.LazyFrame, supplier: nw.LazyFrame +) -> nw.DataFrame: + var1 = "GERMANY" + var2 = 0.0001 + + q1 = ( + partsupp.join(supplier, left_on="ps_suppkey", right_on="s_suppkey") + .join(nation, left_on="s_nationkey", right_on="n_nationkey") + .filter(nw.col("n_name") == var1) + ) + q2 = q1.select( + (nw.col("ps_supplycost") * nw.col("ps_availqty")).sum().round(2).alias("tmp") + * var2 + ) + + return ( + q1.with_columns((nw.col("ps_supplycost") * nw.col("ps_availqty")).alias("value")) + .group_by("ps_partkey") + .agg(nw.sum("value")) + .join(q2, how="cross") + .filter(nw.col("value") > nw.col("tmp")) + .select("ps_partkey", "value") + .sort("value", descending=True) + .collect() + ) diff --git a/tpch/benchmarks/q15_test.py b/tpch/benchmarks/q15_test.py new file mode 100644 index 000000000..88b4a540a --- /dev/null +++ b/tpch/benchmarks/q15_test.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from datetime import date +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q15(benchmark: BenchmarkFixture, library: str) -> None: + read_fn = lib_to_reader[library] + + lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() + supplier = nw.from_native(read_fn(DATA_FOLDER / "supplier.parquet")).lazy() + + _ = benchmark(q15, lineitem, supplier) + + +def q15( + lineitem: nw.LazyFrame, + supplier: nw.LazyFrame, +) -> nw.DataFrame: + var1 = date(1996, 1, 1) + var2 = date(1996, 4, 1) + + revenue = ( + lineitem.filter(nw.col("l_shipdate").is_between(var1, var2, closed="left")) + .with_columns( + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias( + "total_revenue" + ) + ) + .group_by("l_suppkey") + .agg(nw.sum("total_revenue")) + .select(nw.col("l_suppkey").alias("supplier_no"), nw.col("total_revenue")) + ) + + return ( + supplier.join(revenue, left_on="s_suppkey", right_on="supplier_no") + .filter(nw.col("total_revenue") == nw.col("total_revenue").max()) + .with_columns(nw.col("total_revenue").round(2)) + .select("s_suppkey", "s_name", "s_address", "s_phone", "total_revenue") + .sort("s_suppkey") + .collect() + ) diff --git a/tpch/benchmarks/q17_test.py b/tpch/benchmarks/q17_test.py new file mode 100644 index 000000000..2e887ba3f --- /dev/null +++ b/tpch/benchmarks/q17_test.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q17(benchmark: BenchmarkFixture, library: str) -> None: + read_fn = lib_to_reader[library] + + lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() + part = nw.from_native(read_fn(DATA_FOLDER / "part.parquet")).lazy() + + _ = benchmark(q17, lineitem, part) + + +def q17(lineitem: nw.LazyFrame, part: nw.LazyFrame) -> nw.DataFrame: + var1 = "Brand#23" + var2 = "MED BOX" + + query1 = ( + part.filter(nw.col("p_brand") == var1) + .filter(nw.col("p_container") == var2) + .join(lineitem, how="left", left_on="p_partkey", right_on="l_partkey") + ) + + return ( + query1.with_columns(avg_quantity=0.2 * nw.col("l_quantity")) + .group_by("p_partkey") + .agg(nw.col("avg_quantity").mean()) + .select(nw.col("p_partkey").alias("key"), nw.col("avg_quantity")) + .join(query1, left_on="key", right_on="p_partkey") + .filter(nw.col("l_quantity") < nw.col("avg_quantity")) + .select((nw.col("l_extendedprice").sum() / 7.0).round(2).alias("avg_yearly")) + .collect() + ) diff --git a/tpch/benchmarks/q18_test.py b/tpch/benchmarks/q18_test.py new file mode 100644 index 000000000..8b64efdc8 --- /dev/null +++ b/tpch/benchmarks/q18_test.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q18(benchmark: BenchmarkFixture, library: str) -> None: + read_fn = lib_to_reader[library] + + customer = nw.from_native(read_fn(DATA_FOLDER / "customer.parquet")).lazy() + lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() + orders = nw.from_native(read_fn(DATA_FOLDER / "orders.parquet")).lazy() + + _ = benchmark(q18, customer, lineitem, orders) + + +def q18( + customer: nw.LazyFrame, lineitem: nw.LazyFrame, orders: nw.LazyFrame +) -> nw.DataFrame: + var1 = 300 + + query1 = ( + lineitem.group_by("l_orderkey") + .agg(nw.col("l_quantity").sum().alias("sum_quantity")) + .filter(nw.col("sum_quantity") > var1) + ) + + return ( + orders.join(query1, left_on="o_orderkey", right_on="l_orderkey", how="semi") + .join(lineitem, left_on="o_orderkey", right_on="l_orderkey") + .join(customer, left_on="o_custkey", right_on="c_custkey") + .group_by("c_name", "o_custkey", "o_orderkey", "o_orderdate", "o_totalprice") + .agg(nw.col("l_quantity").sum().alias("col6")) + .select( + nw.col("c_name"), + nw.col("o_custkey").alias("c_custkey"), + nw.col("o_orderkey"), + nw.col("o_orderdate").alias("o_orderdat"), + nw.col("o_totalprice"), + nw.col("col6"), + ) + .sort(by=["o_totalprice", "o_orderdat"], descending=[True, False]) + .head(100) + .collect() + ) diff --git a/tpch/benchmarks/q19_test.py b/tpch/benchmarks/q19_test.py new file mode 100644 index 000000000..501d1f698 --- /dev/null +++ b/tpch/benchmarks/q19_test.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q19(benchmark: BenchmarkFixture, library: str) -> None: + read_fn = lib_to_reader[library] + + lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() + part = nw.from_native(read_fn(DATA_FOLDER / "part.parquet")).lazy() + + _ = benchmark(q19, lineitem, part) + + +def q19(lineitem: nw.LazyFrame, part: nw.LazyFrame) -> nw.DataFrame: + return ( + part.join(lineitem, left_on="p_partkey", right_on="l_partkey") + .filter(nw.col("l_shipmode").is_in(["AIR", "AIR REG"])) + .filter(nw.col("l_shipinstruct") == "DELIVER IN PERSON") + .filter( + ( + (nw.col("p_brand") == "Brand#12") + & nw.col("p_container").is_in(["SM CASE", "SM BOX", "SM PACK", "SM PKG"]) + & (nw.col("l_quantity").is_between(1, 11)) + & (nw.col("p_size").is_between(1, 5)) + ) + | ( + (nw.col("p_brand") == "Brand#23") + & nw.col("p_container").is_in( + ["MED BAG", "MED BOX", "MED PKG", "MED PACK"] + ) + & (nw.col("l_quantity").is_between(10, 20)) + & (nw.col("p_size").is_between(1, 10)) + ) + | ( + (nw.col("p_brand") == "Brand#34") + & nw.col("p_container").is_in(["LG CASE", "LG BOX", "LG PACK", "LG PKG"]) + & (nw.col("l_quantity").is_between(20, 30)) + & (nw.col("p_size").is_between(1, 15)) + ) + ) + .select( + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))) + .sum() + .round(2) + .alias("revenue") + ) + .collect() + ) diff --git a/tpch/benchmarks/q1_test.py b/tpch/benchmarks/q1_test.py new file mode 100644 index 000000000..d1f6a5441 --- /dev/null +++ b/tpch/benchmarks/q1_test.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +from datetime import date +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q1(benchmark: BenchmarkFixture, library: str) -> None: + read_fn = lib_to_reader[library] + lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() + _ = benchmark(q1, lineitem) + + +def q1(lineitem: nw.LazyFrame) -> nw.DataFrame: + var_1 = date(1998, 9, 2) + return ( + lineitem.filter(nw.col("l_shipdate") <= var_1) + .with_columns( + disc_price=nw.col("l_extendedprice") * (1 - nw.col("l_discount")), + charge=( + nw.col("l_extendedprice") + * (1.0 - nw.col("l_discount")) + * (1.0 + nw.col("l_tax")) + ), + ) + .group_by(["l_returnflag", "l_linestatus"]) + .agg( + [ + nw.col("l_quantity").sum().alias("sum_qty"), + nw.col("l_extendedprice").sum().alias("sum_base_price"), + nw.col("disc_price").sum().alias("sum_disc_price"), + nw.col("charge").sum().alias("sum_charge"), + nw.col("l_quantity").mean().alias("avg_qty"), + nw.col("l_extendedprice").mean().alias("avg_price"), + nw.col("l_discount").mean().alias("avg_disc"), + nw.len().alias("count_order"), + ], + ) + .sort(["l_returnflag", "l_linestatus"]) + .collect() + ) diff --git a/tpch/benchmarks/q20_test.py b/tpch/benchmarks/q20_test.py new file mode 100644 index 000000000..db1eb7ebe --- /dev/null +++ b/tpch/benchmarks/q20_test.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +from datetime import date +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Any + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q20(benchmark: BenchmarkFixture, library: str, request: Any) -> None: + if library == "dask": + # requires unique + request.applymarker(pytest.mark.xfail) + read_fn = lib_to_reader[library] + + lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() + nation = nw.from_native(read_fn(DATA_FOLDER / "nation.parquet")).lazy() + part = nw.from_native(read_fn(DATA_FOLDER / "part.parquet")).lazy() + partsupp = nw.from_native(read_fn(DATA_FOLDER / "partsupp.parquet")).lazy() + supplier = nw.from_native(read_fn(DATA_FOLDER / "supplier.parquet")).lazy() + + _ = benchmark(q20, part, partsupp, nation, lineitem, supplier) + + +def q20( + part: nw.LazyFrame, + partsupp: nw.LazyFrame, + nation: nw.LazyFrame, + lineitem: nw.LazyFrame, + supplier: nw.LazyFrame, +) -> nw.DataFrame: + var1 = date(1994, 1, 1) + var2 = date(1995, 1, 1) + var3 = "CANADA" + var4 = "forest" + + query1 = ( + lineitem.filter(nw.col("l_shipdate").is_between(var1, var2, closed="left")) + .group_by("l_partkey", "l_suppkey") + .agg((nw.col("l_quantity").sum()).alias("sum_quantity")) + .with_columns(sum_quantity=nw.col("sum_quantity") * 0.5) + ) + query2 = nation.filter(nw.col("n_name") == var3) + query3 = supplier.join(query2, left_on="s_nationkey", right_on="n_nationkey") + + return ( + part.filter(nw.col("p_name").str.starts_with(var4)) + .select(nw.col("p_partkey").unique()) + .join(partsupp, left_on="p_partkey", right_on="ps_partkey") + .join( + query1, + left_on=["ps_suppkey", "p_partkey"], + right_on=["l_suppkey", "l_partkey"], + ) + .filter(nw.col("ps_availqty") > nw.col("sum_quantity")) + .select(nw.col("ps_suppkey").unique()) + .join(query3, left_on="ps_suppkey", right_on="s_suppkey") + .select("s_name", "s_address") + .sort("s_name") + .collect() + ) diff --git a/tpch/benchmarks/q21_test.py b/tpch/benchmarks/q21_test.py new file mode 100644 index 000000000..764e9ca26 --- /dev/null +++ b/tpch/benchmarks/q21_test.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q21(benchmark: BenchmarkFixture, library: str) -> None: + read_fn = lib_to_reader[library] + + lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() + nation = nw.from_native(read_fn(DATA_FOLDER / "nation.parquet")).lazy() + orders = nw.from_native(read_fn(DATA_FOLDER / "orders.parquet")).lazy() + supplier = nw.from_native(read_fn(DATA_FOLDER / "supplier.parquet")).lazy() + + _ = benchmark(q21, lineitem, nation, orders, supplier) + + +def q21( + lineitem: nw.LazyFrame, + nation: nw.LazyFrame, + orders: nw.LazyFrame, + supplier: nw.LazyFrame, +) -> nw.DataFrame: + var1 = "SAUDI ARABIA" + + q1 = ( + lineitem.group_by("l_orderkey") + .agg(nw.len().alias("n_supp_by_order")) + .filter(nw.col("n_supp_by_order") > 1) + .join( + lineitem.filter(nw.col("l_receiptdate") > nw.col("l_commitdate")), + left_on="l_orderkey", + right_on="l_orderkey", + ) + ) + + return ( + q1.group_by("l_orderkey") + .agg(nw.len().alias("n_supp_by_order")) + .join(q1, left_on="l_orderkey", right_on="l_orderkey") + .join(supplier, left_on="l_suppkey", right_on="s_suppkey") + .join(nation, left_on="s_nationkey", right_on="n_nationkey") + .join(orders, left_on="l_orderkey", right_on="o_orderkey") + .filter(nw.col("n_supp_by_order") == 1) + .filter(nw.col("n_name") == var1) + .filter(nw.col("o_orderstatus") == "F") + .group_by("s_name") + .agg(nw.len().alias("numwait")) + .sort(by=["numwait", "s_name"], descending=[True, False]) + .head(100) + .collect() + ) diff --git a/tpch/benchmarks/q2_test.py b/tpch/benchmarks/q2_test.py new file mode 100644 index 000000000..945b2dd58 --- /dev/null +++ b/tpch/benchmarks/q2_test.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q2(benchmark: BenchmarkFixture, library: str) -> None: + read_fn = lib_to_reader[library] + + nation = nw.from_native(read_fn(DATA_FOLDER / "nation.parquet")).lazy() + part = nw.from_native(read_fn(DATA_FOLDER / "part.parquet")).lazy() + partsupp = nw.from_native(read_fn(DATA_FOLDER / "partsupp.parquet")).lazy() + region = nw.from_native(read_fn(DATA_FOLDER / "region.parquet")).lazy() + supplier = nw.from_native(read_fn(DATA_FOLDER / "supplier.parquet")).lazy() + + _ = benchmark(q2, region, nation, supplier, part, partsupp) + + +def q2( + region: nw.LazyFrame, + nation: nw.LazyFrame, + supplier: nw.LazyFrame, + part: nw.LazyFrame, + part_supp: nw.LazyFrame, +) -> nw.DataFrame: + var_1 = 15 + var_2 = "BRASS" + var_3 = "EUROPE" + + tmp = ( + part.join(part_supp, left_on="p_partkey", right_on="ps_partkey") + .join(supplier, left_on="ps_suppkey", right_on="s_suppkey") + .join(nation, left_on="s_nationkey", right_on="n_nationkey") + .join(region, left_on="n_regionkey", right_on="r_regionkey") + .filter( + nw.col("p_size") == var_1, + nw.col("p_type").str.ends_with(var_2), + nw.col("r_name") == var_3, + ) + ) + + final_cols = [ + "s_acctbal", + "s_name", + "n_name", + "p_partkey", + "p_mfgr", + "s_address", + "s_phone", + "s_comment", + ] + + return ( + tmp.group_by("p_partkey") + .agg(nw.col("ps_supplycost").min().alias("ps_supplycost")) + .join( + tmp, + left_on=["p_partkey", "ps_supplycost"], + right_on=["p_partkey", "ps_supplycost"], + ) + .select(final_cols) + .sort( + ["s_acctbal", "n_name", "s_name", "p_partkey"], + descending=[True, False, False, False], + ) + .head(100) + .collect() + ) diff --git a/tpch/benchmarks/q3_test.py b/tpch/benchmarks/q3_test.py new file mode 100644 index 000000000..e487e4c84 --- /dev/null +++ b/tpch/benchmarks/q3_test.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +from datetime import date +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q3(benchmark: BenchmarkFixture, library: str) -> None: + read_fn = lib_to_reader[library] + + customer = nw.from_native(read_fn(DATA_FOLDER / "customer.parquet")).lazy() + lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() + orders = nw.from_native(read_fn(DATA_FOLDER / "orders.parquet")).lazy() + + _ = benchmark(q3, customer, lineitem, orders) + + +def q3( + customer: nw.LazyFrame, line_item: nw.LazyFrame, orders: nw.LazyFrame +) -> nw.DataFrame: + var_1 = var_2 = date(1995, 3, 15) + var_3 = "BUILDING" + + return ( + customer.filter(nw.col("c_mktsegment") == var_3) + .join(orders, left_on="c_custkey", right_on="o_custkey") + .join(line_item, left_on="o_orderkey", right_on="l_orderkey") + .filter( + nw.col("o_orderdate") < var_2, + nw.col("l_shipdate") > var_1, + ) + .with_columns( + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias("revenue") + ) + .group_by(["o_orderkey", "o_orderdate", "o_shippriority"]) + .agg([nw.sum("revenue")]) + .select( + [ + nw.col("o_orderkey").alias("l_orderkey"), + "revenue", + "o_orderdate", + "o_shippriority", + ] + ) + .sort(by=["revenue", "o_orderdate"], descending=[True, False]) + .head(10) + .collect() + ) diff --git a/tpch/benchmarks/q4_test.py b/tpch/benchmarks/q4_test.py new file mode 100644 index 000000000..dfb9f85de --- /dev/null +++ b/tpch/benchmarks/q4_test.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from datetime import date +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Any + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q4(benchmark: BenchmarkFixture, library: str, request: Any) -> None: + if library == "dask": + # cast method is missing + request.applymarker(pytest.mark.xfail) + + read_fn = lib_to_reader[library] + + lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() + orders = nw.from_native(read_fn(DATA_FOLDER / "orders.parquet")).lazy() + + _ = benchmark(q4, lineitem, orders) + + +def q4(lineitem: nw.LazyFrame, orders: nw.LazyFrame) -> nw.DataFrame: + var_1 = date(1993, 7, 1) + var_2 = date(1993, 10, 1) + + return ( + lineitem.join(orders, left_on="l_orderkey", right_on="o_orderkey") + .filter( + nw.col("o_orderdate").is_between(var_1, var_2, closed="left"), + nw.col("l_commitdate") < nw.col("l_receiptdate"), + ) + .unique(subset=["o_orderpriority", "l_orderkey"]) + .group_by("o_orderpriority") + .agg(nw.len().alias("order_count")) + .sort(by="o_orderpriority") + .with_columns(nw.col("order_count").cast(nw.Int64)) + .collect() + ) diff --git a/tpch/benchmarks/q5_test.py b/tpch/benchmarks/q5_test.py new file mode 100644 index 000000000..371ed42b9 --- /dev/null +++ b/tpch/benchmarks/q5_test.py @@ -0,0 +1,65 @@ +from __future__ import annotations + +from datetime import date +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q5(benchmark: BenchmarkFixture, library: str) -> None: + read_fn = lib_to_reader[library] + + customer = nw.from_native(read_fn(DATA_FOLDER / "customer.parquet")).lazy() + lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() + nation = nw.from_native(read_fn(DATA_FOLDER / "nation.parquet")).lazy() + orders = nw.from_native(read_fn(DATA_FOLDER / "orders.parquet")).lazy() + region = nw.from_native(read_fn(DATA_FOLDER / "region.parquet")).lazy() + supplier = nw.from_native(read_fn(DATA_FOLDER / "supplier.parquet")).lazy() + + _ = benchmark(q5, region, nation, customer, lineitem, orders, supplier) + + +def q5( + region: nw.LazyFrame, + nation: nw.LazyFrame, + customer: nw.LazyFrame, + lineitem: nw.LazyFrame, + orders: nw.LazyFrame, + supplier: nw.LazyFrame, +) -> nw.DataFrame: + var_1 = "ASIA" + var_2 = date(1994, 1, 1) + var_3 = date(1995, 1, 1) + + return ( + region.join(nation, left_on="r_regionkey", right_on="n_regionkey") + .join(customer, left_on="n_nationkey", right_on="c_nationkey") + .join(orders, left_on="c_custkey", right_on="o_custkey") + .join(lineitem, left_on="o_orderkey", right_on="l_orderkey") + .join( + supplier, + left_on=["l_suppkey", "n_nationkey"], + right_on=["s_suppkey", "s_nationkey"], + ) + .filter( + nw.col("r_name") == var_1, + nw.col("o_orderdate").is_between(var_2, var_3, closed="left"), + ) + .with_columns( + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias("revenue") + ) + .group_by("n_name") + .agg([nw.sum("revenue")]) + .sort(by="revenue", descending=True) + .collect() + ) diff --git a/tpch/benchmarks/q6_test.py b/tpch/benchmarks/q6_test.py new file mode 100644 index 000000000..7500fdc72 --- /dev/null +++ b/tpch/benchmarks/q6_test.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from datetime import date +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q6(benchmark: BenchmarkFixture, library: str) -> None: + read_fn = lib_to_reader[library] + + lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() + + _ = benchmark(q6, lineitem) + + +def q6(lineitem: nw.LazyFrame) -> nw.DataFrame: + var_1 = date(1994, 1, 1) + var_2 = date(1995, 1, 1) + var_3 = 24 + + line_item_ds = nw.from_native(lineitem) + + return ( + line_item_ds.filter( + nw.col("l_shipdate").is_between(var_1, var_2, closed="left"), + nw.col("l_discount").is_between(0.05, 0.07), + nw.col("l_quantity") < var_3, + ) + .with_columns((nw.col("l_extendedprice") * nw.col("l_discount")).alias("revenue")) + .select(nw.sum("revenue")) + .collect() + ) diff --git a/tpch/benchmarks/q7_test.py b/tpch/benchmarks/q7_test.py new file mode 100644 index 000000000..7da0b105e --- /dev/null +++ b/tpch/benchmarks/q7_test.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +from datetime import datetime +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Any + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q7(benchmark: BenchmarkFixture, library: str, request: Any) -> None: + if library == "dask": + # Dasknamespace does not implement concat + request.applymarker(pytest.mark.xfail) + read_fn = lib_to_reader[library] + + customer = nw.from_native(read_fn(DATA_FOLDER / "customer.parquet")).lazy() + lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() + nation = nw.from_native(read_fn(DATA_FOLDER / "nation.parquet")).lazy() + orders = nw.from_native(read_fn(DATA_FOLDER / "orders.parquet")).lazy() + supplier = nw.from_native(read_fn(DATA_FOLDER / "supplier.parquet")).lazy() + + _ = benchmark(q7, nation, customer, lineitem, orders, supplier) + + +def q7( + nation: nw.LazyFrame, + customer: nw.LazyFrame, + lineitem: nw.LazyFrame, + orders: nw.LazyFrame, + supplier: nw.LazyFrame, +) -> nw.DataFrame: + n1 = nation.filter(nw.col("n_name") == "FRANCE") + n2 = nation.filter(nw.col("n_name") == "GERMANY") + + var_1 = datetime(1995, 1, 1) + var_2 = datetime(1996, 12, 31) + + df1 = ( + customer.join(n1, left_on="c_nationkey", right_on="n_nationkey") + .join(orders, left_on="c_custkey", right_on="o_custkey") + .rename({"n_name": "cust_nation"}) + .join(lineitem, left_on="o_orderkey", right_on="l_orderkey") + .join(supplier, left_on="l_suppkey", right_on="s_suppkey") + .join(n2, left_on="s_nationkey", right_on="n_nationkey") + .rename({"n_name": "supp_nation"}) + ) + + df2 = ( + customer.join(n2, left_on="c_nationkey", right_on="n_nationkey") + .join(orders, left_on="c_custkey", right_on="o_custkey") + .rename({"n_name": "cust_nation"}) + .join(lineitem, left_on="o_orderkey", right_on="l_orderkey") + .join(supplier, left_on="l_suppkey", right_on="s_suppkey") + .join(n1, left_on="s_nationkey", right_on="n_nationkey") + .rename({"n_name": "supp_nation"}) + ) + + return ( + nw.concat([df1, df2]) + .filter(nw.col("l_shipdate").cast(nw.Datetime).is_between(var_1, var_2)) + .with_columns( + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias("volume") + ) + .with_columns(nw.col("l_shipdate").cast(nw.Datetime).dt.year().alias("l_year")) + .group_by("supp_nation", "cust_nation", "l_year") + .agg(nw.sum("volume").alias("revenue")) + .sort(by=["supp_nation", "cust_nation", "l_year"]) + .collect() + ) diff --git a/tpch/benchmarks/q8_test.py b/tpch/benchmarks/q8_test.py new file mode 100644 index 000000000..64df8eff2 --- /dev/null +++ b/tpch/benchmarks/q8_test.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +from datetime import date +from pathlib import Path +from typing import TYPE_CHECKING + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q8(benchmark: BenchmarkFixture, library: str) -> None: + # Requires nw.when to be implemented first + return + read_fn = lib_to_reader[library] + + customer = nw.from_native(read_fn(DATA_FOLDER / "customer.parquet")).lazy() + lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() + nation = nw.from_native(read_fn(DATA_FOLDER / "nation.parquet")).lazy() + orders = nw.from_native(read_fn(DATA_FOLDER / "orders.parquet")).lazy() + part = nw.from_native(read_fn(DATA_FOLDER / "part.parquet")).lazy() + region = nw.from_native(read_fn(DATA_FOLDER / "region.parquet")).lazy() + supplier = nw.from_native(read_fn(DATA_FOLDER / "supplier.parquet")).lazy() + + _ = benchmark(q8, nation, customer, lineitem, orders, supplier, part, region) + + +def q8( + nation: nw.LazyFrame, + customer: nw.LazyFrame, + lineitem: nw.LazyFrame, + orders: nw.LazyFrame, + supplier: nw.LazyFrame, + part: nw.LazyFrame, + region: nw.LazyFrame, +) -> None: + n1 = nation.select("n_nationkey", "n_regionkey") + n2 = nation.select("n_nationkey", "n_name") + + return ( + part.join(lineitem, left_on="p_partkey", right_on="l_partkey") + .join(supplier, left_on="l_suppkey", right_on="s_suppkey") + .join(orders, left_on="l_orderkey", right_on="o_orderkey") + .join(customer, left_on="o_custkey", right_on="c_custkey") + .join(n1, left_on="c_nationkey", right_on="n_nationkey") + .join(region, left_on="n_regionkey", right_on="r_regionkey") + .filter(nw.col("r_name") == "AMERICA") + .join(n2, left_on="s_nationkey", right_on="n_nationkey") + .filter( + nw.col("o_orderdate") >= date(1995, 1, 1), + nw.col("o_orderdate") <= date(1996, 12, 31), + ) + .filter(nw.col("p_type") == "ECONOMY ANODIZED STEEL") + .select( + nw.col("o_orderdate").dt.year().alias("o_year"), + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias("volume"), + nw.col("n_name").alias("nation"), + ) + .with_columns( + nw.when(nw.col("nation") == "BRAZIL") + .then(nw.col("volume")) + .otherwise(0) + .alias("_tmp") + ) + .group_by("o_year") + .agg((nw.sum("_tmp") / nw.sum("volume")).round(2).alias("mkt_share")) + .sort("o_year") + .collect() + ) diff --git a/tpch/benchmarks/q9_test.py b/tpch/benchmarks/q9_test.py new file mode 100644 index 000000000..29f1c1b58 --- /dev/null +++ b/tpch/benchmarks/q9_test.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Any + +import pytest + +import narwhals.stable.v1 as nw +from tpch.benchmarks.utils import lib_to_reader + +if TYPE_CHECKING: + from pytest_codspeed.plugin import BenchmarkFixture + +DATA_FOLDER = Path("tests/data") + + +@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) +def test_q9(benchmark: BenchmarkFixture, library: str, request: Any) -> None: + if library == "dask": + # Requires cast method + request.applymarker(pytest.mark.xfail) + read_fn = lib_to_reader[library] + + lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() + nation = nw.from_native(read_fn(DATA_FOLDER / "nation.parquet")).lazy() + orders = nw.from_native(read_fn(DATA_FOLDER / "orders.parquet")).lazy() + part = nw.from_native(read_fn(DATA_FOLDER / "part.parquet")).lazy() + partsupp = nw.from_native(read_fn(DATA_FOLDER / "partsupp.parquet")).lazy() + supplier = nw.from_native(read_fn(DATA_FOLDER / "supplier.parquet")).lazy() + + _ = benchmark(q9, part, partsupp, nation, lineitem, orders, supplier) + + +def q9( + part: nw.LazyFrame, + partsupp: nw.LazyFrame, + nation: nw.LazyFrame, + lineitem: nw.LazyFrame, + orders: nw.LazyFrame, + supplier: nw.LazyFrame, +) -> nw.DataFrame: + return ( + part.join(partsupp, left_on="p_partkey", right_on="ps_partkey") + .join(supplier, left_on="ps_suppkey", right_on="s_suppkey") + .join( + lineitem, + left_on=["p_partkey", "ps_suppkey"], + right_on=["l_partkey", "l_suppkey"], + ) + .join(orders, left_on="l_orderkey", right_on="o_orderkey") + .join(nation, left_on="s_nationkey", right_on="n_nationkey") + .filter(nw.col("p_name").str.contains("green")) + .select( + nw.col("n_name").alias("nation"), + nw.col("o_orderdate").cast(nw.Datetime).dt.year().alias("o_year"), + ( + nw.col("l_extendedprice") * (1 - nw.col("l_discount")) + - nw.col("ps_supplycost") * nw.col("l_quantity") + ).alias("amount"), + ) + .group_by("nation", "o_year") + .agg(nw.sum("amount").alias("sum_profit")) + .sort(by=["nation", "o_year"], descending=[False, True]) + .collect() + ) diff --git a/tpch/benchmarks/queries.py b/tpch/benchmarks/queries.py deleted file mode 100644 index fa3de4de4..000000000 --- a/tpch/benchmarks/queries.py +++ /dev/null @@ -1,120 +0,0 @@ -from __future__ import annotations - -from datetime import date - -import narwhals.stable.v1 as nw - - -def q1(lineitem: nw.LazyFrame) -> nw.DataFrame: - var_1 = date(1998, 9, 2) - query_result = ( - lineitem.filter(nw.col("l_shipdate") <= var_1) - .with_columns( - disc_price=nw.col("l_extendedprice") * (1 - nw.col("l_discount")), - charge=( - nw.col("l_extendedprice") - * (1.0 - nw.col("l_discount")) - * (1.0 + nw.col("l_tax")) - ), - ) - .group_by(["l_returnflag", "l_linestatus"]) - .agg( - [ - nw.col("l_quantity").sum().alias("sum_qty"), - nw.col("l_extendedprice").sum().alias("sum_base_price"), - nw.col("disc_price").sum().alias("sum_disc_price"), - nw.col("charge").sum().alias("sum_charge"), - nw.col("l_quantity").mean().alias("avg_qty"), - nw.col("l_extendedprice").mean().alias("avg_price"), - nw.col("l_discount").mean().alias("avg_disc"), - nw.len().alias("count_order"), - ], - ) - .sort(["l_returnflag", "l_linestatus"]) - ) - return query_result.collect() - - -def q2( - region: nw.LazyFrame, - nation: nw.LazyFrame, - supplier: nw.LazyFrame, - part: nw.LazyFrame, - part_supp: nw.LazyFrame, -) -> nw.DataFrame: - var_1 = 15 - var_2 = "BRASS" - var_3 = "EUROPE" - - tmp = ( - part.join(part_supp, left_on="p_partkey", right_on="ps_partkey") - .join(supplier, left_on="ps_suppkey", right_on="s_suppkey") - .join(nation, left_on="s_nationkey", right_on="n_nationkey") - .join(region, left_on="n_regionkey", right_on="r_regionkey") - .filter( - nw.col("p_size") == var_1, - nw.col("p_type").str.ends_with(var_2), - nw.col("r_name") == var_3, - ) - ) - - final_cols = [ - "s_acctbal", - "s_name", - "n_name", - "p_partkey", - "p_mfgr", - "s_address", - "s_phone", - "s_comment", - ] - - return ( - tmp.group_by("p_partkey") - .agg(nw.col("ps_supplycost").min().alias("ps_supplycost")) - .join( - tmp, - left_on=["p_partkey", "ps_supplycost"], - right_on=["p_partkey", "ps_supplycost"], - ) - .select(final_cols) - .sort( - ["s_acctbal", "n_name", "s_name", "p_partkey"], - descending=[True, False, False, False], - ) - .head(100) - .collect() - ) - - -def q3( - customer: nw.LazyFrame, line_item: nw.LazyFrame, orders: nw.LazyFrame -) -> nw.DataFrame: - var_1 = var_2 = date(1995, 3, 15) - var_3 = "BUILDING" - - return ( - customer.filter(nw.col("c_mktsegment") == var_3) - .join(orders, left_on="c_custkey", right_on="o_custkey") - .join(line_item, left_on="o_orderkey", right_on="l_orderkey") - .filter( - nw.col("o_orderdate") < var_2, - nw.col("l_shipdate") > var_1, - ) - .with_columns( - (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias("revenue") - ) - .group_by(["o_orderkey", "o_orderdate", "o_shippriority"]) - .agg([nw.sum("revenue")]) - .select( - [ - nw.col("o_orderkey").alias("l_orderkey"), - "revenue", - "o_orderdate", - "o_shippriority", - ] - ) - .sort(by=["revenue", "o_orderdate"], descending=[True, False]) - .head(10) - .collect() - ) diff --git a/tpch/benchmarks/queries_test.py b/tpch/benchmarks/queries_test.py deleted file mode 100644 index f71c53183..000000000 --- a/tpch/benchmarks/queries_test.py +++ /dev/null @@ -1,35 +0,0 @@ -from __future__ import annotations - -from pathlib import Path -from typing import TYPE_CHECKING - -import pytest - -import narwhals.stable.v1 as nw -from tpch.benchmarks.queries import q1 -from tpch.benchmarks.queries import q2 -from tpch.benchmarks.queries import q3 -from tpch.benchmarks.utils import lib_to_reader - -if TYPE_CHECKING: - from pytest_codspeed.plugin import BenchmarkFixture - -DATA_FOLDER = Path("tests/data") - - -@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"]) -def test_queries(benchmark: BenchmarkFixture, library: str) -> None: - read_fn = lib_to_reader[library] - - customer = nw.from_native(read_fn(DATA_FOLDER / "customer.parquet")).lazy() - lineitem = nw.from_native(read_fn(DATA_FOLDER / "lineitem.parquet")).lazy() - nation = nw.from_native(read_fn(DATA_FOLDER / "nation.parquet")).lazy() - orders = nw.from_native(read_fn(DATA_FOLDER / "orders.parquet")).lazy() - part = nw.from_native(read_fn(DATA_FOLDER / "part.parquet")).lazy() - partsupp = nw.from_native(read_fn(DATA_FOLDER / "partsupp.parquet")).lazy() - region = nw.from_native(read_fn(DATA_FOLDER / "region.parquet")).lazy() - supplier = nw.from_native(read_fn(DATA_FOLDER / "supplier.parquet")).lazy() - - q1_result = benchmark(q1, lineitem) # noqa: F841 - q2_result = benchmark(q2, region, nation, supplier, part, partsupp) # noqa: F841 - q3_result = benchmark(q3, customer, lineitem, orders) # noqa: F841 diff --git a/tpch/benchmarks/utils.py b/tpch/benchmarks/utils.py index d0e22e59b..629f2a1be 100644 --- a/tpch/benchmarks/utils.py +++ b/tpch/benchmarks/utils.py @@ -5,7 +5,7 @@ lib_to_reader = { "dask": lambda path: dd.read_parquet(path, dtype_backend="pyarrow"), - "pandas": pd.read_parquet, + "pandas": lambda path: pd.read_parquet(path, engine="pyarrow"), "polars": pl.scan_parquet, "pyarrow": pq.read_table, }