Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB),
tpch_csv10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single csv file per table, hash join
tpch_mem10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory

# TPC-DS Benchmarks
tpcds: TPCDS inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table, hash join

# Extended TPC-H Benchmarks
sort_tpch: Benchmark of sorting speed for end-to-end sort queries on TPC-H dataset (SF=1)
sort_tpch10: Benchmark of sorting speed for end-to-end sort queries on TPC-H dataset (SF=10)
Expand Down Expand Up @@ -216,6 +219,9 @@ main() {
# same data as for tpch10
data_tpch "10"
;;
tpcds)
data_tpcds
;;
clickbench_1)
data_clickbench_1
;;
Expand Down Expand Up @@ -384,6 +390,7 @@ main() {
run_external_aggr
run_nlj
run_hj
run_tpcds
;;
tpch)
run_tpch "1" "parquet"
Expand All @@ -403,6 +410,9 @@ main() {
tpch_mem10)
run_tpch_mem "10"
;;
tpcds)
run_tpcds
;;
cancellation)
run_cancellation
;;
Expand Down Expand Up @@ -589,6 +599,14 @@ data_tpch() {
fi
}

# Prints a pointer to the TPC-DS data generation instructions.
#
# This function only writes text to stdout: it tells the user to clone the
# datafusion-benchmarks repository. It does not create or download any data.
data_tpcds() {
    # One argument per output line; the empty strings produce the blank
    # lines before and after the message.
    printf '%s\n' \
        "" \
        "For TPC-DS data generation, please clone the datafusion-benchmarks repository:" \
        " git clone https://github.com/apache/datafusion-benchmarks" \
        ""
}

# Runs the tpch benchmark
run_tpch() {
SCALE_FACTOR=$1
Expand Down Expand Up @@ -622,6 +640,27 @@ run_tpch_mem() {
debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG}
}

# Runs the tpcds benchmark against the data found in ${DATA_DIR}.
#
# Reads (defined outside this function): DATA_DIR, RESULTS_DIR, SCRIPT_DIR,
#   PREFER_HASH_JOIN, CARGO_COMMAND, QUERY_ARG.
# Sets (not declared local): TPCDS_DIR always, RESULTS_FILE only when the
#   data directory exists.
# Returns: 1 when the data directory is missing; otherwise the exit status
#   of the debug_run invocation.
run_tpcds() {
    TPCDS_DIR="${DATA_DIR}"

    if [ -d "${TPCDS_DIR}" ]; then
        RESULTS_FILE="${RESULTS_DIR}/tpcds_sf1.json"
        printf '%s\n' \
            "RESULTS_FILE: ${RESULTS_FILE}" \
            "Running tpcds benchmark..."

        # $CARGO_COMMAND and ${QUERY_ARG} are intentionally left unquoted, as in
        # the other run_* helpers -- presumably so they expand to several words
        # (or to nothing); verify against where they are assigned.
        debug_run $CARGO_COMMAND --bin tpcds -- benchmark datafusion \
            --iterations 5 \
            --path "${TPCDS_DIR}" \
            --query_path "${SCRIPT_DIR}/queries/tpcds" \
            --prefer_hash_join "${PREFER_HASH_JOIN}" \
            --format parquet \
            -o "${RESULTS_FILE}" ${QUERY_ARG}
    else
        # Data directory is missing: explain how to get the data and fail.
        printf '%s\n' \
            "Error: TPC-DS data directory does not exist: ${TPCDS_DIR}" \
            "" \
            "Please prepare TPC-DS data first by running:" \
            " ./bench.sh data tpcds" \
            ""
        return 1
    fi
}

# Runs the compile profile benchmark helper
run_compile_profile() {
local profiles=("$@")
Expand Down
26 changes: 26 additions & 0 deletions benchmarks/queries/tpcds/q1.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
-- SQLBench-DS query 1 derived from TPC-DS query 1 under the terms of the TPC Fair Use Policy.
-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
-- This query was generated at scale factor 1.
-- Returns up to 100 customer ids, in c_customer_id order, for customers whose
-- summed 1999 store returns at a store with s_state = 'TN' exceed 1.2 times
-- the average of those per-customer sums at the same store.
-- The comma-join form of the upstream query text is kept as is.
WITH customer_total_return AS (
    -- One row per (customer, store): sum of sr_return_amt_inc_tax for
    -- returns whose return date falls in d_year 1999.
    SELECT
        sr_customer_sk AS ctr_customer_sk,
        sr_store_sk AS ctr_store_sk,
        sum(sr_return_amt_inc_tax) AS ctr_total_return
    FROM
        store_returns,
        date_dim
    WHERE
        sr_returned_date_sk = d_date_sk
        AND d_year = 1999
    GROUP BY
        sr_customer_sk,
        sr_store_sk
)
SELECT
    c_customer_id
FROM
    customer_total_return ctr1,
    store,
    customer
WHERE
    ctr1.ctr_total_return > (
        -- Correlated on store: threshold is computed per ctr_store_sk.
        SELECT avg(ctr_total_return) * 1.2
        FROM customer_total_return ctr2
        WHERE ctr1.ctr_store_sk = ctr2.ctr_store_sk
    )
    AND s_store_sk = ctr1.ctr_store_sk
    AND s_state = 'TN'
    AND ctr1.ctr_customer_sk = c_customer_sk
ORDER BY
    c_customer_id
LIMIT 100;

60 changes: 60 additions & 0 deletions benchmarks/queries/tpcds/q10.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
-- SQLBench-DS query 10 derived from TPC-DS query 10 under the terms of the TPC Fair Use Policy.
-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
-- This query was generated at scale factor 1.
-- Counts customers per demographic profile, restricted to customers whose
-- current address is in one of five listed counties and who have a store
-- sale AND (a web sale OR a catalog sale) dated in 2002 with d_moy in 3..6.
-- cnt1..cnt6 are all count(*) over the same group, as in the upstream query.
SELECT
    cd_gender,
    cd_marital_status,
    cd_education_status,
    count(*) AS cnt1,
    cd_purchase_estimate,
    count(*) AS cnt2,
    cd_credit_rating,
    count(*) AS cnt3,
    cd_dep_count,
    count(*) AS cnt4,
    cd_dep_employed_count,
    count(*) AS cnt5,
    cd_dep_college_count,
    count(*) AS cnt6
FROM
    customer c,
    customer_address ca,
    customer_demographics
WHERE
    c.c_current_addr_sk = ca.ca_address_sk
    AND ca_county IN (
        'Clinton County',
        'Platte County',
        'Franklin County',
        'Louisa County',
        'Harmon County'
    )
    AND cd_demo_sk = c.c_current_cdemo_sk
    -- Required: at least one store sale in the date window.
    AND EXISTS (
        SELECT *
        FROM store_sales, date_dim
        WHERE c.c_customer_sk = ss_customer_sk
          AND ss_sold_date_sk = d_date_sk
          AND d_year = 2002
          AND d_moy BETWEEN 3 AND 3 + 3
    )
    -- Required: at least one web sale (billed customer) or one catalog sale
    -- (ship-to customer) in the same window.
    AND (
        EXISTS (
            SELECT *
            FROM web_sales, date_dim
            WHERE c.c_customer_sk = ws_bill_customer_sk
              AND ws_sold_date_sk = d_date_sk
              AND d_year = 2002
              AND d_moy BETWEEN 3 AND 3 + 3
        )
        OR EXISTS (
            SELECT *
            FROM catalog_sales, date_dim
            WHERE c.c_customer_sk = cs_ship_customer_sk
              AND cs_sold_date_sk = d_date_sk
              AND d_year = 2002
              AND d_moy BETWEEN 3 AND 3 + 3
        )
    )
GROUP BY
    cd_gender,
    cd_marital_status,
    cd_education_status,
    cd_purchase_estimate,
    cd_credit_rating,
    cd_dep_count,
    cd_dep_employed_count,
    cd_dep_college_count
ORDER BY
    cd_gender,
    cd_marital_status,
    cd_education_status,
    cd_purchase_estimate,
    cd_credit_rating,
    cd_dep_count,
    cd_dep_employed_count,
    cd_dep_college_count
LIMIT 100;

82 changes: 82 additions & 0 deletions benchmarks/queries/tpcds/q11.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
-- SQLBench-DS query 11 derived from TPC-DS query 11 under the terms of the TPC Fair Use Policy.
-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
-- This query was generated at scale factor 1.
-- Returns up to 100 customers whose web year_total ratio (2000 over 1999)
-- is greater than their store year_total ratio for the same two years.
-- year_total holds one row per customer, d_year and sale_type, where
-- sale_type is 's' for store_sales rows and 'w' for web_sales rows.
WITH year_total AS (
    -- Store channel: list price minus discount, summed per customer and year.
    SELECT
        c_customer_id AS customer_id,
        c_first_name AS customer_first_name,
        c_last_name AS customer_last_name,
        c_preferred_cust_flag AS customer_preferred_cust_flag,
        c_birth_country AS customer_birth_country,
        c_login AS customer_login,
        c_email_address AS customer_email_address,
        d_year AS dyear,
        sum(ss_ext_list_price - ss_ext_discount_amt) AS year_total,
        's' AS sale_type
    FROM
        customer,
        store_sales,
        date_dim
    WHERE
        c_customer_sk = ss_customer_sk
        AND ss_sold_date_sk = d_date_sk
    GROUP BY
        c_customer_id,
        c_first_name,
        c_last_name,
        c_preferred_cust_flag,
        c_birth_country,
        c_login,
        c_email_address,
        d_year

    UNION ALL

    -- Web channel: same shape, keyed on the billed customer.
    SELECT
        c_customer_id AS customer_id,
        c_first_name AS customer_first_name,
        c_last_name AS customer_last_name,
        c_preferred_cust_flag AS customer_preferred_cust_flag,
        c_birth_country AS customer_birth_country,
        c_login AS customer_login,
        c_email_address AS customer_email_address,
        d_year AS dyear,
        sum(ws_ext_list_price - ws_ext_discount_amt) AS year_total,
        'w' AS sale_type
    FROM
        customer,
        web_sales,
        date_dim
    WHERE
        c_customer_sk = ws_bill_customer_sk
        AND ws_sold_date_sk = d_date_sk
    GROUP BY
        c_customer_id,
        c_first_name,
        c_last_name,
        c_preferred_cust_flag,
        c_birth_country,
        c_login,
        c_email_address,
        d_year
)
SELECT
    t_s_secyear.customer_id,
    t_s_secyear.customer_first_name,
    t_s_secyear.customer_last_name,
    t_s_secyear.customer_email_address
FROM
    -- Four views of the CTE: {store, web} x {first year, second year}.
    year_total t_s_firstyear,
    year_total t_s_secyear,
    year_total t_w_firstyear,
    year_total t_w_secyear
WHERE
    -- All four rows belong to the same customer.
    t_s_secyear.customer_id = t_s_firstyear.customer_id
    AND t_s_firstyear.customer_id = t_w_secyear.customer_id
    AND t_s_firstyear.customer_id = t_w_firstyear.customer_id
    -- Pin each alias to its channel.
    AND t_s_firstyear.sale_type = 's'
    AND t_w_firstyear.sale_type = 'w'
    AND t_s_secyear.sale_type = 's'
    AND t_w_secyear.sale_type = 'w'
    -- Pin each alias to its year.
    AND t_s_firstyear.dyear = 1999
    AND t_s_secyear.dyear = 1999 + 1
    AND t_w_firstyear.dyear = 1999
    AND t_w_secyear.dyear = 1999 + 1
    -- First-year totals must be positive (they are the divisors below).
    AND t_s_firstyear.year_total > 0
    AND t_w_firstyear.year_total > 0
    -- Web ratio strictly greater than store ratio.
    AND CASE
            WHEN t_w_firstyear.year_total > 0
                THEN t_w_secyear.year_total / t_w_firstyear.year_total
            ELSE 0.0
        END
        > CASE
            WHEN t_s_firstyear.year_total > 0
                THEN t_s_secyear.year_total / t_s_firstyear.year_total
            ELSE 0.0
        END
ORDER BY
    t_s_secyear.customer_id,
    t_s_secyear.customer_first_name,
    t_s_secyear.customer_last_name,
    t_s_secyear.customer_email_address
LIMIT 100;

35 changes: 35 additions & 0 deletions benchmarks/queries/tpcds/q12.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
-- SQLBench-DS query 12 derived from TPC-DS query 12 under the terms of the TPC Fair Use Policy.
-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
-- This query was generated at scale factor 1.
-- Per-item web sales revenue for items in three categories, sold between
-- 2002-03-22 and 30 days later (both ends inclusive), together with each
-- item's percentage share of the revenue of its i_class.
-- Returns the first 100 rows in the ORDER BY order below.
SELECT
    i_item_id,
    i_item_desc,
    i_category,
    i_class,
    i_current_price,
    sum(ws_ext_sales_price) AS itemrevenue,
    -- Window over the grouped rows: the inner sum() is the per-group revenue,
    -- the outer sum() ... OVER totals it across the item's class.
    sum(ws_ext_sales_price) * 100 / sum(sum(ws_ext_sales_price)) OVER (
        PARTITION BY i_class
    ) AS revenueratio
FROM
    web_sales,
    item,
    date_dim
WHERE
    ws_item_sk = i_item_sk
    AND i_category IN ('Jewelry', 'Books', 'Women')
    AND ws_sold_date_sk = d_date_sk
    -- The upper bound uses an explicit INTERVAL literal; the bare "+ 30 days"
    -- form is DB2-style syntax rather than a standard interval expression.
    AND d_date BETWEEN cast('2002-03-22' AS date)
                   AND (cast('2002-03-22' AS date) + INTERVAL '30 days')
GROUP BY
    i_item_id,
    i_item_desc,
    i_category,
    i_class,
    i_current_price
ORDER BY
    i_category,
    i_class,
    i_item_id,
    i_item_desc,
    revenueratio
LIMIT 100;

53 changes: 53 additions & 0 deletions benchmarks/queries/tpcds/q13.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
-- SQLBench-DS query 13 derived from TPC-DS query 13 under the terms of the TPC Fair Use Policy.
-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
-- This query was generated at scale factor 1.
-- Single-row aggregate over 2001 store sales that match one of three
-- demographic/price profiles AND one of three address/net-profit profiles.
-- The join predicates to household_demographics, customer_demographics and
-- customer_address appear only inside the OR branches, as in the upstream
-- query text; they are deliberately not hoisted out here.
SELECT
    avg(ss_quantity),
    avg(ss_ext_sales_price),
    avg(ss_ext_wholesale_cost),
    sum(ss_ext_wholesale_cost)
FROM
    store_sales,
    store,
    customer_demographics,
    household_demographics,
    customer_address,
    date_dim
WHERE
    s_store_sk = ss_store_sk
    AND ss_sold_date_sk = d_date_sk
    AND d_year = 2001
    -- Demographic / sales-price profiles.
    AND (
        (
            ss_hdemo_sk = hd_demo_sk
            AND cd_demo_sk = ss_cdemo_sk
            AND cd_marital_status = 'U'
            AND cd_education_status = '4 yr Degree'
            AND ss_sales_price BETWEEN 100.00 AND 150.00
            AND hd_dep_count = 3
        )
        OR (
            ss_hdemo_sk = hd_demo_sk
            AND cd_demo_sk = ss_cdemo_sk
            AND cd_marital_status = 'S'
            AND cd_education_status = 'Unknown'
            AND ss_sales_price BETWEEN 50.00 AND 100.00
            AND hd_dep_count = 1
        )
        OR (
            ss_hdemo_sk = hd_demo_sk
            AND cd_demo_sk = ss_cdemo_sk
            AND cd_marital_status = 'D'
            AND cd_education_status = '2 yr Degree'
            AND ss_sales_price BETWEEN 150.00 AND 200.00
            AND hd_dep_count = 1
        )
    )
    -- Address / net-profit profiles.
    AND (
        (
            ss_addr_sk = ca_address_sk
            AND ca_country = 'United States'
            AND ca_state IN ('CO', 'MI', 'MN')
            AND ss_net_profit BETWEEN 100 AND 200
        )
        OR (
            ss_addr_sk = ca_address_sk
            AND ca_country = 'United States'
            AND ca_state IN ('NC', 'NY', 'TX')
            AND ss_net_profit BETWEEN 150 AND 300
        )
        OR (
            ss_addr_sk = ca_address_sk
            AND ca_country = 'United States'
            AND ca_state IN ('CA', 'NE', 'TN')
            AND ss_net_profit BETWEEN 50 AND 250
        )
    )
;

Loading
Loading