diff --git a/tpch/execute/q3.py b/tpch/execute/q3.py new file mode 100644 index 0000000000..f9d886aad4 --- /dev/null +++ b/tpch/execute/q3.py @@ -0,0 +1,37 @@ +from pathlib import Path + +import pandas as pd +import polars as pl +from queries import q3 + +pd.options.mode.copy_on_write = True +pd.options.future.infer_string = True + +customer = Path("data") / "customer.parquet" +line_item = Path("data") / "lineitem.parquet" +orders = Path("data") / "orders.parquet" + +IO_FUNCS = { + "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), + "pandas[pyarrow]": lambda x: pd.read_parquet( + x, engine="pyarrow", dtype_backend="pyarrow" + ), + "polars[eager]": lambda x: pl.read_parquet(x), + "polars[lazy]": lambda x: pl.scan_parquet(x), +} + +tool = "pandas" +fn = IO_FUNCS[tool] +print(q3.query(fn(customer), fn(line_item), fn(orders))) + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print(q3.query(fn(customer), fn(line_item), fn(orders))) + +tool = "polars[eager]" +fn = IO_FUNCS[tool] +print(q3.query(fn(customer), fn(line_item), fn(orders))) + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print(q3.query(fn(customer), fn(line_item), fn(orders)).collect()) diff --git a/tpch/queries/q3.py b/tpch/queries/q3.py new file mode 100644 index 0000000000..e55f4a33df --- /dev/null +++ b/tpch/queries/q3.py @@ -0,0 +1,45 @@ +from datetime import datetime + +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query( + customer_ds_raw: FrameT, + line_item_ds_raw: FrameT, + orders_ds_raw: FrameT, +) -> FrameT: + var_1 = var_2 = datetime(1995, 3, 15) + var_3 = "BUILDING" + + customer_ds = nw.from_native(customer_ds_raw) + line_item_ds = nw.from_native(line_item_ds_raw) + orders_ds = nw.from_native(orders_ds_raw) + + q_final = ( + customer_ds.filter(nw.col("c_mktsegment") == var_3) + .join(orders_ds, left_on="c_custkey", right_on="o_custkey") + .join(line_item_ds, left_on="o_orderkey", right_on="l_orderkey") + .filter( + nw.col("o_orderdate") < var_2, + nw.col("l_shipdate") > var_1, + ) + .with_columns( + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias("revenue") + ) + .group_by(["o_orderkey", "o_orderdate", "o_shippriority"]) + .agg([nw.sum("revenue")]) + .select( + [ + nw.col("o_orderkey").alias("l_orderkey"), + "revenue", + "o_orderdate", + "o_shippriority", + ] + ) + .sort(by=["revenue", "o_orderdate"], descending=[True, False]) + .head(10) + ) + + return nw.to_native(q_final)