diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml
new file mode 100644
index 000000000..a4daf42be
--- /dev/null
+++ b/.github/workflows/codspeed.yml
@@ -0,0 +1,29 @@
+name: codspeed benchmarks
+
+on:
+  pull_request:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+
+jobs:
+  codspeed-benchmarks:
+    name: codspeed benchmarks
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Install uv
+        run: curl -LsSf https://astral.sh/uv/install.sh | sh
+      - name: Install dependencies
+        run: |
+          uv pip install -r requirements-dev.txt "numpy<2.0.0" --system
+      - name: show-deps
+        run: uv pip freeze
+      - name: Run benchmarks
+        uses: CodSpeedHQ/action@v3
+        with:
+          run: pytest tpch/benchmarks --codspeed
\ No newline at end of file
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 2424d4ea1..baf54fcf4 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -4,6 +4,7 @@ polars
 pre-commit
 pyarrow
 pytest
+pytest-codspeed
 pytest-cov
 pytest-env
 hypothesis
diff --git a/tpch/benchmarks/__init__.py b/tpch/benchmarks/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tpch/benchmarks/q10_test.py b/tpch/benchmarks/q10_test.py
new file mode 100644
index 000000000..70fb9f924
--- /dev/null
+++ b/tpch/benchmarks/q10_test.py
@@ -0,0 +1,84 @@
+from __future__ import annotations
+
+from datetime import datetime
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q10(benchmark: BenchmarkFixture, library: str) -> None:
+    """Benchmark ``q10`` on ``library``; frames are built outside the timed call."""
+    reader = lib_to_reader[library]
+    customer, lineitem, nation, orders = (
+        nw.from_native(reader(DATA_FOLDER / filename)).lazy()
+        for filename in (
+            "customer.parquet",
+            "lineitem.parquet",
+            "nation.parquet",
+            "orders.parquet",
+        )
+    )
+
+    _ = benchmark(q10, customer, nation, lineitem, orders)
+
+
+def q10(
+    customer: nw.LazyFrame,
+    nation: nw.LazyFrame,
+    lineitem: nw.LazyFrame,
+    orders: nw.LazyFrame,
+) -> nw.DataFrame:
+    """Return the 20 customers with the highest revenue on returned items.
+
+    Only orders placed in ``[1993-10-01, 1994-01-01)`` are considered.
+    """
+    period_start = datetime(1993, 10, 1)
+    period_end = datetime(1994, 1, 1)
+    in_period = nw.col("o_orderdate").is_between(
+        period_start, period_end, closed="left"
+    )
+    line_revenue = nw.col("l_extendedprice") * (1 - nw.col("l_discount"))
+
+    returned_items = (
+        customer.join(orders, left_on="c_custkey", right_on="o_custkey")
+        .join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
+        .join(nation, left_on="c_nationkey", right_on="n_nationkey")
+        .filter(in_period)
+        .filter(nw.col("l_returnflag") == "R")
+        .with_columns(line_revenue.alias("revenue"))
+    )
+    per_customer = returned_items.group_by(
+        "c_custkey",
+        "c_name",
+        "c_acctbal",
+        "c_phone",
+        "n_name",
+        "c_address",
+        "c_comment",
+    ).agg(nw.sum("revenue"))
+
+    return (
+        per_customer.select(
+            "c_custkey",
+            "c_name",
+            "revenue",
+            "c_acctbal",
+            "n_name",
+            "c_address",
+            "c_phone",
+            "c_comment",
+        )
+        .sort(by="revenue", descending=True)
+        .head(20)
+        .collect()
+    )
diff --git a/tpch/benchmarks/q11_test.py b/tpch/benchmarks/q11_test.py
new file mode 100644
index 000000000..e36af3afc
--- /dev/null
+++ b/tpch/benchmarks/q11_test.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q11(benchmark: BenchmarkFixture, library: str) -> None:
+    """Benchmark ``q11`` on ``library``; frames are built outside the timed call."""
+    reader = lib_to_reader[library]
+    nation, partsupp, supplier = (
+        nw.from_native(reader(DATA_FOLDER / filename)).lazy()
+        for filename in ("nation.parquet", "partsupp.parquet", "supplier.parquet")
+    )
+
+    _ = benchmark(q11, partsupp, nation, supplier)
+
+
+def q11(
+    partsupp: nw.LazyFrame, nation: nw.LazyFrame, supplier: nw.LazyFrame
+) -> nw.DataFrame:
+    """Return the ``GERMANY`` parts whose stock value exceeds the threshold.
+
+    The threshold is a fixed fraction of the total ``GERMANY`` stock value.
+    """
+    nation_name = "GERMANY"
+    fraction = 0.0001
+    stock_value = nw.col("ps_supplycost") * nw.col("ps_availqty")
+
+    german_stock = (
+        partsupp.join(supplier, left_on="ps_suppkey", right_on="s_suppkey")
+        .join(nation, left_on="s_nationkey", right_on="n_nationkey")
+        .filter(nw.col("n_name") == nation_name)
+    )
+    threshold = german_stock.select(
+        stock_value.sum().round(2).alias("tmp") * fraction
+    )
+    value_per_part = (
+        german_stock.with_columns(stock_value.alias("value"))
+        .group_by("ps_partkey")
+        .agg(nw.sum("value"))
+    )
+
+    return (
+        value_per_part.join(threshold, how="cross")
+        .filter(nw.col("value") > nw.col("tmp"))
+        .select("ps_partkey", "value")
+        .sort("value", descending=True)
+        .collect()
+    )
diff --git a/tpch/benchmarks/q15_test.py b/tpch/benchmarks/q15_test.py
new file mode 100644
index 000000000..88b4a540a
--- /dev/null
+++ b/tpch/benchmarks/q15_test.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+from datetime import date
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q15(benchmark: BenchmarkFixture, library: str) -> None:
+    """Benchmark ``q15`` on ``library``; frames are built outside the timed call."""
+    reader = lib_to_reader[library]
+    lineitem, supplier = (
+        nw.from_native(reader(DATA_FOLDER / filename)).lazy()
+        for filename in ("lineitem.parquet", "supplier.parquet")
+    )
+
+    _ = benchmark(q15, lineitem, supplier)
+
+
+def q15(
+    lineitem: nw.LazyFrame,
+    supplier: nw.LazyFrame,
+) -> nw.DataFrame:
+    """Return the supplier(s) with the highest revenue shipped in 1996 Q1."""
+    quarter_start = date(1996, 1, 1)
+    quarter_end = date(1996, 4, 1)
+    shipped_in_quarter = nw.col("l_shipdate").is_between(
+        quarter_start, quarter_end, closed="left"
+    )
+    line_revenue = nw.col("l_extendedprice") * (1 - nw.col("l_discount"))
+
+    revenue = (
+        lineitem.filter(shipped_in_quarter)
+        .with_columns(line_revenue.alias("total_revenue"))
+        .group_by("l_suppkey")
+        .agg(nw.sum("total_revenue"))
+        .select(nw.col("l_suppkey").alias("supplier_no"), nw.col("total_revenue"))
+    )
+    top_suppliers = supplier.join(
+        revenue, left_on="s_suppkey", right_on="supplier_no"
+    ).filter(nw.col("total_revenue") == nw.col("total_revenue").max())
+
+    return (
+        top_suppliers.with_columns(nw.col("total_revenue").round(2))
+        .select("s_suppkey", "s_name", "s_address", "s_phone", "total_revenue")
+        .sort("s_suppkey")
+        .collect()
+    )
diff --git a/tpch/benchmarks/q17_test.py b/tpch/benchmarks/q17_test.py
new file mode 100644
index 000000000..2e887ba3f
--- /dev/null
+++ b/tpch/benchmarks/q17_test.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q17(benchmark: BenchmarkFixture, library: str) -> None:
+    """Benchmark ``q17`` on ``library``; frames are built outside the timed call."""
+    reader = lib_to_reader[library]
+    lineitem, part = (
+        nw.from_native(reader(DATA_FOLDER / filename)).lazy()
+        for filename in ("lineitem.parquet", "part.parquet")
+    )
+
+    _ = benchmark(q17, lineitem, part)
+
+
+def q17(lineitem: nw.LazyFrame, part: nw.LazyFrame) -> nw.DataFrame:
+    """Return ``avg_yearly``: summed price of low-quantity line items over 7.
+
+    Considers ``Brand#23`` parts in ``MED BOX`` containers only.
+    """
+    brand = "Brand#23"
+    container = "MED BOX"
+
+    part_lines = (
+        part.filter(nw.col("p_brand") == brand)
+        .filter(nw.col("p_container") == container)
+        .join(lineitem, how="left", left_on="p_partkey", right_on="l_partkey")
+    )
+    quantity_limits = (
+        part_lines.with_columns(avg_quantity=0.2 * nw.col("l_quantity"))
+        .group_by("p_partkey")
+        .agg(nw.col("avg_quantity").mean())
+        .select(nw.col("p_partkey").alias("key"), nw.col("avg_quantity"))
+    )
+    yearly_average = (nw.col("l_extendedprice").sum() / 7.0).round(2)
+
+    return (
+        quantity_limits.join(part_lines, left_on="key", right_on="p_partkey")
+        .filter(nw.col("l_quantity") < nw.col("avg_quantity"))
+        .select(yearly_average.alias("avg_yearly"))
+        .collect()
+    )
diff --git a/tpch/benchmarks/q18_test.py b/tpch/benchmarks/q18_test.py
new file mode 100644
index 000000000..8b64efdc8
--- /dev/null
+++ b/tpch/benchmarks/q18_test.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q18(benchmark: BenchmarkFixture, library: str) -> None:
+    """Benchmark ``q18`` on ``library``; frames are built outside the timed call."""
+    reader = lib_to_reader[library]
+    customer, lineitem, orders = (
+        nw.from_native(reader(DATA_FOLDER / filename)).lazy()
+        for filename in ("customer.parquet", "lineitem.parquet", "orders.parquet")
+    )
+
+    _ = benchmark(q18, customer, lineitem, orders)
+
+
+def q18(
+    customer: nw.LazyFrame, lineitem: nw.LazyFrame, orders: nw.LazyFrame
+) -> nw.DataFrame:
+    """Return the 100 priciest orders whose total quantity exceeds 300."""
+    min_quantity = 300
+
+    large = (
+        lineitem.group_by("l_orderkey")
+        .agg(nw.col("l_quantity").sum().alias("sum_quantity"))
+        .filter(nw.col("sum_quantity") > min_quantity)
+    )
+    per_order = (
+        orders.join(large, left_on="o_orderkey", right_on="l_orderkey", how="semi")
+        .join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
+        .join(customer, left_on="o_custkey", right_on="c_custkey")
+        .group_by("c_name", "o_custkey", "o_orderkey", "o_orderdate", "o_totalprice")
+        .agg(nw.col("l_quantity").sum().alias("col6"))
+    )
+
+    return (
+        per_order.select(
+            nw.col("c_name"),
+            nw.col("o_custkey").alias("c_custkey"),
+            nw.col("o_orderkey"),
+            nw.col("o_orderdate").alias("o_orderdat"),
+            nw.col("o_totalprice"),
+            nw.col("col6"),
+        )
+        .sort(by=["o_totalprice", "o_orderdat"], descending=[True, False])
+        .head(100)
+        .collect()
+    )
diff --git a/tpch/benchmarks/q19_test.py b/tpch/benchmarks/q19_test.py
new file mode 100644
index 000000000..501d1f698
--- /dev/null
+++ b/tpch/benchmarks/q19_test.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q19(benchmark: BenchmarkFixture, library: str) -> None:
+    """Benchmark ``q19`` on ``library``; frames are built outside the timed call."""
+    reader = lib_to_reader[library]
+    lineitem, part = (
+        nw.from_native(reader(DATA_FOLDER / filename)).lazy()
+        for filename in ("lineitem.parquet", "part.parquet")
+    )
+
+    _ = benchmark(q19, lineitem, part)
+
+
+def q19(lineitem: nw.LazyFrame, part: nw.LazyFrame) -> nw.DataFrame:
+    """Return the discounted revenue of in-person air deliveries.
+
+    Line items must match one of three brand/container/quantity/size profiles.
+    """
+    small = (
+        (nw.col("p_brand") == "Brand#12")
+        & nw.col("p_container").is_in(["SM CASE", "SM BOX", "SM PACK", "SM PKG"])
+        & (nw.col("l_quantity").is_between(1, 11))
+        & (nw.col("p_size").is_between(1, 5))
+    )
+    medium = (
+        (nw.col("p_brand") == "Brand#23")
+        & nw.col("p_container").is_in(["MED BAG", "MED BOX", "MED PKG", "MED PACK"])
+        & (nw.col("l_quantity").is_between(10, 20))
+        & (nw.col("p_size").is_between(1, 10))
+    )
+    large = (
+        (nw.col("p_brand") == "Brand#34")
+        & nw.col("p_container").is_in(["LG CASE", "LG BOX", "LG PACK", "LG PKG"])
+        & (nw.col("l_quantity").is_between(20, 30))
+        & (nw.col("p_size").is_between(1, 15))
+    )
+    revenue = nw.col("l_extendedprice") * (1 - nw.col("l_discount"))
+
+    return (
+        part.join(lineitem, left_on="p_partkey", right_on="l_partkey")
+        .filter(nw.col("l_shipmode").is_in(["AIR", "AIR REG"]))
+        .filter(nw.col("l_shipinstruct") == "DELIVER IN PERSON")
+        .filter(small | medium | large)
+        .select(revenue.sum().round(2).alias("revenue"))
+        .collect()
+    )
diff --git 
a/tpch/benchmarks/q1_test.py b/tpch/benchmarks/q1_test.py
new file mode 100644
index 000000000..d1f6a5441
--- /dev/null
+++ b/tpch/benchmarks/q1_test.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from datetime import date
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q1(benchmark: BenchmarkFixture, library: str) -> None:
+    """Benchmark ``q1`` on ``library``; the frame is built outside the timed call."""
+    reader = lib_to_reader[library]
+    lineitem = nw.from_native(reader(DATA_FOLDER / "lineitem.parquet")).lazy()
+    _ = benchmark(q1, lineitem)
+
+
+def q1(lineitem: nw.LazyFrame) -> nw.DataFrame:
+    """Return pricing summary statistics per return flag and line status.
+
+    Only line items shipped on or before 1998-09-02 are included.
+    """
+    cutoff = date(1998, 9, 2)
+    price = nw.col("l_extendedprice")
+    discount = nw.col("l_discount")
+    aggregations = [
+        nw.col("l_quantity").sum().alias("sum_qty"),
+        price.sum().alias("sum_base_price"),
+        nw.col("disc_price").sum().alias("sum_disc_price"),
+        nw.col("charge").sum().alias("sum_charge"),
+        nw.col("l_quantity").mean().alias("avg_qty"),
+        price.mean().alias("avg_price"),
+        discount.mean().alias("avg_disc"),
+        nw.len().alias("count_order"),
+    ]
+
+    return (
+        lineitem.filter(nw.col("l_shipdate") <= cutoff)
+        .with_columns(
+            disc_price=price * (1 - discount),
+            charge=price * (1.0 - discount) * (1.0 + nw.col("l_tax")),
+        )
+        .group_by(["l_returnflag", "l_linestatus"])
+        .agg(aggregations)
+        .sort(["l_returnflag", "l_linestatus"])
+        .collect()
+    )
diff --git a/tpch/benchmarks/q20_test.py b/tpch/benchmarks/q20_test.py
new file mode 100644
index 000000000..db1eb7ebe
--- /dev/null
+++ b/tpch/benchmarks/q20_test.py
@@ -0,0 +1,71 @@
+from __future__ import annotations
+
+from datetime import date
+from pathlib import Path
+from typing import TYPE_CHECKING
+from typing import Any
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q20(benchmark: BenchmarkFixture, library: str, request: Any) -> None:
+    """Benchmark ``q20`` on ``library``; frames are built outside the timed call."""
+    if library == "dask":
+        # requires unique
+        request.applymarker(pytest.mark.xfail)
+    reader = lib_to_reader[library]
+
+    lineitem = nw.from_native(reader(DATA_FOLDER / "lineitem.parquet")).lazy()
+    nation = nw.from_native(reader(DATA_FOLDER / "nation.parquet")).lazy()
+    part = nw.from_native(reader(DATA_FOLDER / "part.parquet")).lazy()
+    partsupp = nw.from_native(reader(DATA_FOLDER / "partsupp.parquet")).lazy()
+    supplier = nw.from_native(reader(DATA_FOLDER / "supplier.parquet")).lazy()
+
+    _ = benchmark(q20, part, partsupp, nation, lineitem, supplier)
+
+
+def q20(
+    part: nw.LazyFrame,
+    partsupp: nw.LazyFrame,
+    nation: nw.LazyFrame,
+    lineitem: nw.LazyFrame,
+    supplier: nw.LazyFrame,
+) -> nw.DataFrame:
+    """Return ``CANADA`` suppliers holding excess stock of ``forest*`` parts."""
+    start = date(1994, 1, 1)
+    end = date(1995, 1, 1)
+
+    half_shipped = (
+        lineitem.filter(nw.col("l_shipdate").is_between(start, end, closed="left"))
+        .group_by("l_partkey", "l_suppkey")
+        .agg((nw.col("l_quantity").sum()).alias("sum_quantity"))
+        .with_columns(sum_quantity=nw.col("sum_quantity") * 0.5)
+    )
+    canada = nation.filter(nw.col("n_name") == "CANADA")
+    canadian = supplier.join(canada, left_on="s_nationkey", right_on="n_nationkey")
+
+    return (
+        part.filter(nw.col("p_name").str.starts_with("forest"))
+        .select(nw.col("p_partkey").unique())
+        .join(partsupp, left_on="p_partkey", right_on="ps_partkey")
+        .join(
+            half_shipped,
+            left_on=["ps_suppkey", "p_partkey"],
+            right_on=["l_suppkey", "l_partkey"],
+        )
+        .filter(nw.col("ps_availqty") > nw.col("sum_quantity"))
+        .select(nw.col("ps_suppkey").unique())
+        .join(canadian, left_on="ps_suppkey", right_on="s_suppkey")
+        .select("s_name", "s_address")
+        .sort("s_name")
+        .collect()
+    )
diff --git a/tpch/benchmarks/q21_test.py b/tpch/benchmarks/q21_test.py
new file mode 100644
index 000000000..764e9ca26
--- /dev/null
+++ b/tpch/benchmarks/q21_test.py
@@ -0,0 +1,71 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q21(benchmark: BenchmarkFixture, library: str) -> None:
+    """Benchmark ``q21`` on ``library``; frames are built outside the timed call."""
+    reader = lib_to_reader[library]
+    lineitem, nation, orders, supplier = (
+        nw.from_native(reader(DATA_FOLDER / filename)).lazy()
+        for filename in (
+            "lineitem.parquet",
+            "nation.parquet",
+            "orders.parquet",
+            "supplier.parquet",
+        )
+    )
+
+    _ = benchmark(q21, lineitem, nation, orders, supplier)
+
+
+def q21(
+    lineitem: nw.LazyFrame,
+    nation: nw.LazyFrame,
+    orders: nw.LazyFrame,
+    supplier: nw.LazyFrame,
+) -> nw.DataFrame:
+    """Return the 100 ``SAUDI ARABIA`` suppliers with the highest ``numwait``.
+
+    ``numwait`` counts late line items of status-``F`` orders that have several
+    line items but exactly one late one.
+    """
+    nation_name = "SAUDI ARABIA"
+    late_lines = lineitem.filter(nw.col("l_receiptdate") > nw.col("l_commitdate"))
+
+    late_in_multi = (
+        lineitem.group_by("l_orderkey")
+        .agg(nw.len().alias("n_supp_by_order"))
+        .filter(nw.col("n_supp_by_order") > 1)
+        .join(late_lines, left_on="l_orderkey", right_on="l_orderkey")
+    )
+    late_counts = late_in_multi.group_by("l_orderkey").agg(
+        nw.len().alias("n_supp_by_order")
+    )
+
+    return (
+        late_counts.join(late_in_multi, left_on="l_orderkey", right_on="l_orderkey")
+        .join(supplier, left_on="l_suppkey", right_on="s_suppkey")
+        .join(nation, left_on="s_nationkey", right_on="n_nationkey")
+        .join(orders, left_on="l_orderkey", right_on="o_orderkey")
+        .filter(nw.col("n_supp_by_order") == 1)
+        .filter(nw.col("n_name") == nation_name)
+        .filter(nw.col("o_orderstatus") == "F")
+        .group_by("s_name")
+        .agg(nw.len().alias("numwait"))
+        .sort(by=["numwait", "s_name"], descending=[True, False])
+        .head(100)
+        .collect()
+    )
diff --git a/tpch/benchmarks/q2_test.py b/tpch/benchmarks/q2_test.py
new file mode 100644
index 000000000..945b2dd58
--- /dev/null
+++ b/tpch/benchmarks/q2_test.py
@@ -0,0 +1,88 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q2(benchmark: BenchmarkFixture, library: str) -> None:
+    """Benchmark ``q2`` on ``library``; frames are built outside the timed call."""
+    reader = lib_to_reader[library]
+    nation, part, partsupp, region, supplier = (
+        nw.from_native(reader(DATA_FOLDER / filename)).lazy()
+        for filename in (
+            "nation.parquet",
+            "part.parquet",
+            "partsupp.parquet",
+            "region.parquet",
+            "supplier.parquet",
+        )
+    )
+
+    _ = benchmark(q2, region, nation, supplier, part, partsupp)
+
+
+def q2(
+    region: nw.LazyFrame,
+    nation: nw.LazyFrame,
+    supplier: nw.LazyFrame,
+    part: nw.LazyFrame,
+    part_supp: nw.LazyFrame,
+) -> nw.DataFrame:
+    """Return the top 100 ``EUROPE`` suppliers of size-15 ``BRASS`` parts.
+
+    For each part only the supplier(s) with the minimum supply cost are kept.
+    """
+    size = 15
+    type_suffix = "BRASS"
+    region_name = "EUROPE"
+    output_cols = [
+        "s_acctbal",
+        "s_name",
+        "n_name",
+        "p_partkey",
+        "p_mfgr",
+        "s_address",
+        "s_phone",
+        "s_comment",
+    ]
+
+    candidates = (
+        part.join(part_supp, left_on="p_partkey", right_on="ps_partkey")
+        .join(supplier, left_on="ps_suppkey", right_on="s_suppkey")
+        .join(nation, left_on="s_nationkey", right_on="n_nationkey")
+        .join(region, left_on="n_regionkey", right_on="r_regionkey")
+        .filter(
+            nw.col("p_size") == size,
+            nw.col("p_type").str.ends_with(type_suffix),
+            nw.col("r_name") == region_name,
+        )
+    )
+    cheapest = candidates.group_by("p_partkey").agg(
+        nw.col("ps_supplycost").min().alias("ps_supplycost")
+    )
+
+    return (
+        cheapest.join(
+            candidates,
+            left_on=["p_partkey", "ps_supplycost"],
+            right_on=["p_partkey", "ps_supplycost"],
+        )
+        .select(output_cols)
+        .sort(
+            ["s_acctbal", "n_name", "s_name", "p_partkey"],
+            descending=[True, False, False, False],
+        )
+        .head(100)
+        .collect()
+    )
diff --git a/tpch/benchmarks/q3_test.py b/tpch/benchmarks/q3_test.py
new file mode 100644
index 000000000..e487e4c84
--- /dev/null
+++ b/tpch/benchmarks/q3_test.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+from datetime import date
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q3(benchmark: BenchmarkFixture, library: str) -> None:
+    """Benchmark ``q3`` on ``library``; frames are built outside the timed call."""
+    reader = lib_to_reader[library]
+    customer, lineitem, orders = (
+        nw.from_native(reader(DATA_FOLDER / filename)).lazy()
+        for filename in ("customer.parquet", "lineitem.parquet", "orders.parquet")
+    )
+
+    _ = benchmark(q3, customer, lineitem, orders)
+
+
+def q3(
+    customer: nw.LazyFrame, line_item: nw.LazyFrame, orders: nw.LazyFrame
+) -> nw.DataFrame:
+    """Return the 10 highest-revenue unshipped orders of ``BUILDING`` customers.
+
+    Orders are placed before 1995-03-15 and their items shipped after it.
+    """
+    cutoff = date(1995, 3, 15)
+    segment = "BUILDING"
+    line_revenue = nw.col("l_extendedprice") * (1 - nw.col("l_discount"))
+
+    open_items = (
+        customer.filter(nw.col("c_mktsegment") == segment)
+        .join(orders, left_on="c_custkey", right_on="o_custkey")
+        .join(line_item, left_on="o_orderkey", right_on="l_orderkey")
+        .filter(
+            nw.col("o_orderdate") < cutoff,
+            nw.col("l_shipdate") > cutoff,
+        )
+        .with_columns(line_revenue.alias("revenue"))
+    )
+    per_order = open_items.group_by(
+        ["o_orderkey", "o_orderdate", "o_shippriority"]
+    ).agg([nw.sum("revenue")])
+
+    return (
+        per_order.select(
+            [
+                nw.col("o_orderkey").alias("l_orderkey"),
+                "revenue",
+                "o_orderdate",
+                "o_shippriority",
+            ]
+        )
+        .sort(by=["revenue", "o_orderdate"], descending=[True, False])
+        .head(10)
+        .collect()
+    )
diff --git a/tpch/benchmarks/q4_test.py b/tpch/benchmarks/q4_test.py
new file mode 100644
index 000000000..dfb9f85de
--- /dev/null
+++ b/tpch/benchmarks/q4_test.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+from datetime import date
+from pathlib import Path
+from typing import TYPE_CHECKING
+from typing import Any
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q4(benchmark: BenchmarkFixture, library: str, request: Any) -> None:
+    """Benchmark ``q4`` on ``library``; frames are built outside the timed call."""
+    if library == "dask":
+        # cast method is missing
+        request.applymarker(pytest.mark.xfail)
+
+    reader = lib_to_reader[library]
+    lineitem, orders = (
+        nw.from_native(reader(DATA_FOLDER / filename)).lazy()
+        for filename in ("lineitem.parquet", "orders.parquet")
+    )
+
+    _ = benchmark(q4, lineitem, orders)
+
+
+def q4(lineitem: nw.LazyFrame, orders: nw.LazyFrame) -> nw.DataFrame:
+    """Count, per order priority, the 1993 Q3 orders with a late line item."""
+    quarter_start = date(1993, 7, 1)
+    quarter_end = date(1993, 10, 1)
+    ordered_in_quarter = nw.col("o_orderdate").is_between(
+        quarter_start, quarter_end, closed="left"
+    )
+    received_late = nw.col("l_commitdate") < nw.col("l_receiptdate")
+
+    late_orders = (
+        lineitem.join(orders, left_on="l_orderkey", right_on="o_orderkey")
+        .filter(ordered_in_quarter, received_late)
+        .unique(subset=["o_orderpriority", "l_orderkey"])
+    )
+
+    return (
+        late_orders.group_by("o_orderpriority")
+        .agg(nw.len().alias("order_count"))
+        .sort(by="o_orderpriority")
+        .with_columns(nw.col("order_count").cast(nw.Int64))
+        .collect()
+    )
diff --git a/tpch/benchmarks/q5_test.py b/tpch/benchmarks/q5_test.py
new file mode 100644
index 000000000..371ed42b9
--- /dev/null
+++ b/tpch/benchmarks/q5_test.py
@@ -0,0 +1,65 @@
+from __future__ import annotations
+
+from datetime import date
+from pathlib import 
Path
+from typing import TYPE_CHECKING
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q5(benchmark: BenchmarkFixture, library: str) -> None:
+    """Benchmark ``q5`` on ``library``; frames are built outside the timed call."""
+    reader = lib_to_reader[library]
+    customer = nw.from_native(reader(DATA_FOLDER / "customer.parquet")).lazy()
+    lineitem = nw.from_native(reader(DATA_FOLDER / "lineitem.parquet")).lazy()
+    nation = nw.from_native(reader(DATA_FOLDER / "nation.parquet")).lazy()
+    orders = nw.from_native(reader(DATA_FOLDER / "orders.parquet")).lazy()
+    region = nw.from_native(reader(DATA_FOLDER / "region.parquet")).lazy()
+    supplier = nw.from_native(reader(DATA_FOLDER / "supplier.parquet")).lazy()
+
+    _ = benchmark(q5, region, nation, customer, lineitem, orders, supplier)
+
+
+def q5(
+    region: nw.LazyFrame,
+    nation: nw.LazyFrame,
+    customer: nw.LazyFrame,
+    lineitem: nw.LazyFrame,
+    orders: nw.LazyFrame,
+    supplier: nw.LazyFrame,
+) -> nw.DataFrame:
+    """Return 1994 revenue per ``ASIA`` nation from same-nation suppliers."""
+    region_name = "ASIA"
+    year_start = date(1994, 1, 1)
+    year_end = date(1995, 1, 1)
+    line_revenue = nw.col("l_extendedprice") * (1 - nw.col("l_discount"))
+
+    return (
+        region.join(nation, left_on="r_regionkey", right_on="n_regionkey")
+        .join(customer, left_on="n_nationkey", right_on="c_nationkey")
+        .join(orders, left_on="c_custkey", right_on="o_custkey")
+        .join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
+        .join(
+            supplier,
+            left_on=["l_suppkey", "n_nationkey"],
+            right_on=["s_suppkey", "s_nationkey"],
+        )
+        .filter(
+            nw.col("r_name") == region_name,
+            nw.col("o_orderdate").is_between(year_start, year_end, closed="left"),
+        )
+        .with_columns(line_revenue.alias("revenue"))
+        .group_by("n_name")
+        .agg([nw.sum("revenue")])
+        .sort(by="revenue", descending=True)
+        .collect()
+    )
diff --git a/tpch/benchmarks/q6_test.py b/tpch/benchmarks/q6_test.py
new file mode 100644
index 000000000..7500fdc72
--- /dev/null
+++ b/tpch/benchmarks/q6_test.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+from datetime import date
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q6(benchmark: BenchmarkFixture, library: str) -> None:
+    """Benchmark ``q6`` on ``library``; the frame is built outside the timed call."""
+    reader = lib_to_reader[library]
+    lineitem = nw.from_native(reader(DATA_FOLDER / "lineitem.parquet")).lazy()
+
+    _ = benchmark(q6, lineitem)
+
+
+def q6(lineitem: nw.LazyFrame) -> nw.DataFrame:
+    """Return the summed ``l_extendedprice * l_discount`` of the selected items.
+
+    Selected items shipped in 1994, with a discount between 0.05 and 0.07 and a
+    quantity below 24.
+    """
+    year_start = date(1994, 1, 1)
+    year_end = date(1995, 1, 1)
+    max_quantity = 24
+
+    # ``lineitem`` is already a narwhals LazyFrame, so it is queried directly;
+    # re-wrapping it with ``nw.from_native`` would only add work to the timed call.
+    return (
+        lineitem.filter(
+            nw.col("l_shipdate").is_between(year_start, year_end, closed="left"),
+            nw.col("l_discount").is_between(0.05, 0.07),
+            nw.col("l_quantity") < max_quantity,
+        )
+        .with_columns((nw.col("l_extendedprice") * nw.col("l_discount")).alias("revenue"))
+        .select(nw.sum("revenue"))
+        .collect()
+    )
diff --git a/tpch/benchmarks/q7_test.py b/tpch/benchmarks/q7_test.py
new file mode 100644
index 000000000..7da0b105e
--- /dev/null
+++ b/tpch/benchmarks/q7_test.py
@@ -0,0 +1,80 @@
+from __future__ import annotations
+
+from datetime import datetime
+from pathlib import Path
+from typing import TYPE_CHECKING
+from typing import Any
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q7(benchmark: BenchmarkFixture, library: str, request: Any) -> None:
+    """Benchmark ``q7`` on ``library``; frames are built outside the timed call."""
+    if library == "dask":
+        # Dasknamespace does not implement concat
+        request.applymarker(pytest.mark.xfail)
+    reader = lib_to_reader[library]
+    customer, lineitem, nation, orders, supplier = (
+        nw.from_native(reader(DATA_FOLDER / filename)).lazy()
+        for filename in (
+            "customer.parquet",
+            "lineitem.parquet",
+            "nation.parquet",
+            "orders.parquet",
+            "supplier.parquet",
+        )
+    )
+
+    _ = benchmark(q7, nation, customer, lineitem, orders, supplier)
+
+
+def q7(
+    nation: nw.LazyFrame,
+    customer: nw.LazyFrame,
+    lineitem: nw.LazyFrame,
+    orders: nw.LazyFrame,
+    supplier: nw.LazyFrame,
+) -> nw.DataFrame:
+    """Return yearly shipping volume between ``FRANCE`` and ``GERMANY``.
+
+    Covers shipments from 1995-01-01 to 1996-12-31, in both directions.
+    """
+    france = nation.filter(nw.col("n_name") == "FRANCE")
+    germany = nation.filter(nw.col("n_name") == "GERMANY")
+    period_start = datetime(1995, 1, 1)
+    period_end = datetime(1996, 12, 31)
+    ship_datetime = nw.col("l_shipdate").cast(nw.Datetime)
+
+    def routes(buyers: nw.LazyFrame, sellers: nw.LazyFrame) -> nw.LazyFrame:
+        """Pair customers of one nation with suppliers of the other."""
+        return (
+            customer.join(buyers, left_on="c_nationkey", right_on="n_nationkey")
+            .join(orders, left_on="c_custkey", right_on="o_custkey")
+            .rename({"n_name": "cust_nation"})
+            .join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
+            .join(supplier, left_on="l_suppkey", right_on="s_suppkey")
+            .join(sellers, left_on="s_nationkey", right_on="n_nationkey")
+            .rename({"n_name": "supp_nation"})
+        )
+
+    return (
+        nw.concat([routes(france, germany), routes(germany, france)])
+        .filter(ship_datetime.is_between(period_start, period_end))
+        .with_columns(
+            (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias("volume")
+        )
+        .with_columns(ship_datetime.dt.year().alias("l_year"))
+        .group_by("supp_nation", "cust_nation", "l_year")
+        .agg(nw.sum("volume").alias("revenue"))
+        .sort(by=["supp_nation", "cust_nation", "l_year"])
+        .collect()
+    )
diff --git a/tpch/benchmarks/q8_test.py b/tpch/benchmarks/q8_test.py
new file mode 100644
index 000000000..64df8eff2
--- /dev/null
+++ b/tpch/benchmarks/q8_test.py
@@ -0,0 +1,88 @@
+from __future__ import annotations
+
+from datetime import date
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.skip(reason="Requires nw.when to be implemented first")
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q8(benchmark: BenchmarkFixture, library: str) -> None:
+    """Benchmark ``q8`` on ``library``; frames are built outside the timed call.
+
+    Skipped with an explicit marker instead of an early ``return`` so the run
+    reports it as skipped rather than as a passing benchmark that timed nothing.
+    """
+    reader = lib_to_reader[library]
+    customer, lineitem, nation, orders, part, region, supplier = (
+        nw.from_native(reader(DATA_FOLDER / filename)).lazy()
+        for filename in (
+            "customer.parquet",
+            "lineitem.parquet",
+            "nation.parquet",
+            "orders.parquet",
+            "part.parquet",
+            "region.parquet",
+            "supplier.parquet",
+        )
+    )
+
+    _ = benchmark(q8, nation, customer, lineitem, orders, supplier, part, region)
+
+
+def q8(
+    nation: nw.LazyFrame,
+    customer: nw.LazyFrame,
+    lineitem: nw.LazyFrame,
+    orders: nw.LazyFrame,
+    supplier: nw.LazyFrame,
+    part: nw.LazyFrame,
+    region: nw.LazyFrame,
+) -> nw.DataFrame:
+    """Return the yearly market share of ``BRAZIL`` within ``AMERICA``.
+
+    Covers ``ECONOMY ANODIZED STEEL`` parts ordered from 1995-01-01 to 1996-12-31.
+    """
+    customer_nations = nation.select("n_nationkey", "n_regionkey")
+    supplier_nations = nation.select("n_nationkey", "n_name")
+
+    return (
+        part.join(lineitem, left_on="p_partkey", right_on="l_partkey")
+        .join(supplier, left_on="l_suppkey", right_on="s_suppkey")
+        .join(orders, left_on="l_orderkey", right_on="o_orderkey")
+        .join(customer, left_on="o_custkey", right_on="c_custkey")
+        .join(customer_nations, left_on="c_nationkey", right_on="n_nationkey")
+        .join(region, left_on="n_regionkey", right_on="r_regionkey")
+        .filter(nw.col("r_name") == "AMERICA")
+        .join(supplier_nations, left_on="s_nationkey", right_on="n_nationkey")
+        .filter(
+            nw.col("o_orderdate") >= date(1995, 1, 1),
+            nw.col("o_orderdate") <= date(1996, 12, 31),
+        )
+        .filter(nw.col("p_type") == "ECONOMY ANODIZED STEEL")
+        .select(
+            nw.col("o_orderdate").dt.year().alias("o_year"),
+            (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias("volume"),
+            nw.col("n_name").alias("nation"),
+        )
+        .with_columns(
+            nw.when(nw.col("nation") == "BRAZIL")
+            .then(nw.col("volume"))
+            .otherwise(0)
+            .alias("_tmp")
+        )
+        .group_by("o_year")
+        .agg((nw.sum("_tmp") / nw.sum("volume")).round(2).alias("mkt_share"))
+        .sort("o_year")
+        .collect()
+    )
diff --git a/tpch/benchmarks/q9_test.py b/tpch/benchmarks/q9_test.py
new file mode 100644
index 000000000..29f1c1b58
--- /dev/null
+++ b/tpch/benchmarks/q9_test.py
@@ -0,0 +1,76 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+from typing import Any
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tpch.benchmarks.utils import lib_to_reader
+
+if TYPE_CHECKING:
+    from pytest_codspeed.plugin import BenchmarkFixture
+
+DATA_FOLDER = Path("tests/data")
+
+
+@pytest.mark.parametrize("library", ["pandas", "polars", "pyarrow", "dask"])
+def test_q9(benchmark: BenchmarkFixture, library: str, request: Any) -> None:
+    """Benchmark ``q9`` on ``library``; frames are built outside the timed call."""
+    if library == "dask":
+        # Requires cast method
+        request.applymarker(pytest.mark.xfail)
+    reader = lib_to_reader[library]
+    lineitem, nation, orders, part, partsupp, supplier = (
+        nw.from_native(reader(DATA_FOLDER / filename)).lazy()
+        for filename in (
+            "lineitem.parquet",
+            "nation.parquet",
+            "orders.parquet",
+            "part.parquet",
+            "partsupp.parquet",
+            "supplier.parquet",
+        )
+    )
+
+    _ = benchmark(q9, part, partsupp, nation, lineitem, orders, supplier)
+
+
+def q9(
+    part: nw.LazyFrame,
+    partsupp: nw.LazyFrame,
+    nation: nw.LazyFrame,
+    lineitem: nw.LazyFrame,
+    orders: nw.LazyFrame,
+    supplier: nw.LazyFrame,
+) -> nw.DataFrame:
+    """Return the yearly profit per nation on parts whose name contains ``green``."""
+    sales = nw.col("l_extendedprice") * (1 - nw.col("l_discount"))
+    cost = nw.col("ps_supplycost") * nw.col("l_quantity")
+    order_year = nw.col("o_orderdate").cast(nw.Datetime).dt.year()
+
+    green_lines = (
+        part.join(partsupp, left_on="p_partkey", right_on="ps_partkey")
+        .join(supplier, left_on="ps_suppkey", right_on="s_suppkey")
+        .join(
+            lineitem,
+            left_on=["p_partkey", "ps_suppkey"],
+            right_on=["l_partkey", "l_suppkey"],
+        )
+        .join(orders, left_on="l_orderkey", right_on="o_orderkey")
+        .join(nation, left_on="s_nationkey", right_on="n_nationkey")
+        .filter(nw.col("p_name").str.contains("green"))
+    )
+
+    return (
+        green_lines.select(
+            nw.col("n_name").alias("nation"),
+            order_year.alias("o_year"),
+            (sales - cost).alias("amount"),
+        )
+        .group_by("nation", "o_year")
+        .agg(nw.sum("amount").alias("sum_profit"))
+        .sort(by=["nation", "o_year"], descending=[False, True])
+        .collect()
+    )
diff --git a/tpch/benchmarks/utils.py b/tpch/benchmarks/utils.py
new file mode 100644
index 000000000..629f2a1be
--- /dev/null
+++ b/tpch/benchmarks/utils.py
@@ -0,0 +1,13 @@
+import dask.dataframe as dd
+import pandas as pd
+import polars as pl
+import pyarrow.parquet as pq
+
+# Parquet reader for each benchmarked library, keyed by the ``library`` test
+# parameter; every reader is called with the path of the parquet file.
+lib_to_reader = {
+    "dask": lambda path: dd.read_parquet(path, dtype_backend="pyarrow"),
+    "pandas": lambda path: pd.read_parquet(path, engine="pyarrow"),
+    "polars": pl.scan_parquet,
+    "pyarrow": pq.read_table,
+}