From f7f02a8ae68dae49e88b2607b7e159be56eafe89 Mon Sep 17 00:00:00 2001 From: Conor McCarter Date: Thu, 6 Nov 2025 16:51:14 -0800 Subject: [PATCH] Add Databricks and benchmark results for most SQL warehouse options --- README.md | 1 - databricks/.env.example | 22 ++ databricks/NOTES.md | 4 + databricks/README.md | 47 ++++ databricks/benchmark.py | 361 +++++++++++++++++++++++++++++++ databricks/benchmark.sh | 22 ++ databricks/create.sql | 109 ++++++++++ databricks/queries.sql | 43 ++++ databricks/query.py | 88 ++++++++ databricks/results/2x-large.json | 56 +++++ databricks/results/2x-small.json | 56 +++++ databricks/results/4x-large.json | 56 +++++ databricks/results/large.json | 56 +++++ databricks/results/medium.json | 56 +++++ databricks/results/small.json | 56 +++++ databricks/results/x-large.json | 56 +++++ databricks/results/x-small.json | 56 +++++ databricks/run.sh | 15 ++ 18 files changed, 1159 insertions(+), 1 deletion(-) create mode 100644 databricks/.env.example create mode 100644 databricks/NOTES.md create mode 100644 databricks/README.md create mode 100755 databricks/benchmark.py create mode 100755 databricks/benchmark.sh create mode 100644 databricks/create.sql create mode 100644 databricks/queries.sql create mode 100755 databricks/query.py create mode 100644 databricks/results/2x-large.json create mode 100644 databricks/results/2x-small.json create mode 100644 databricks/results/4x-large.json create mode 100644 databricks/results/large.json create mode 100644 databricks/results/medium.json create mode 100644 databricks/results/small.json create mode 100644 databricks/results/x-large.json create mode 100644 databricks/results/x-small.json create mode 100755 databricks/run.sh diff --git a/README.md b/README.md index 60dd15462..2bf0295f1 100644 --- a/README.md +++ b/README.md @@ -222,7 +222,6 @@ Please help us add more systems and run the benchmarks on more types of VMs: - [ ] Azure Synapse - [ ] Boilingdata - [ ] CockroachDB Serverless -- [ ] Databricks - [ ] DolphinDB - [ ] Dremio (without publishing) - [ ] DuckDB operating like "Athena" on remote Parquet files diff --git a/databricks/.env.example b/databricks/.env.example new file mode 100644 index 000000000..a0b353aab --- /dev/null +++ b/databricks/.env.example @@ -0,0 +1,22 @@ +# Databricks Configuration +# Copy this file to .env and fill in your actual values + +# Your Databricks workspace hostname (e.g., dbc-xxxxxxxx-xxxx.cloud.databricks.com) +DATABRICKS_SERVER_HOSTNAME=your-workspace-hostname.cloud.databricks.com + +# SQL Warehouse HTTP path (found in your SQL Warehouse settings) +# Uncomment the warehouse size you want to use +DATABRICKS_HTTP_PATH=/sql/1.0/warehouses/your-warehouse-id + +# Instance type name for results file naming & results machine type label +databricks_instance_type=Large + +# Your Databricks personal access token +DATABRICKS_TOKEN=your-databricks-token + +# Unity Catalog and Schema names +DATABRICKS_CATALOG=clickbench_catalog +DATABRICKS_SCHEMA=clickbench_schema + +# Parquet data location +DATABRICKS_PARQUET_LOCATION=s3://some/path/hits.parquet diff --git a/databricks/NOTES.md b/databricks/NOTES.md new file mode 100644 index 000000000..d29fb021e --- /dev/null +++ b/databricks/NOTES.md @@ -0,0 +1,4 @@ +I created each warehouse in the Databricks UI. +Besides the warehouse size, the only other change I made to default settings was to set the sleep time to 5 minutes to save money (the 4x large warehouse is very expensive). 
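+
+For reference, the same setup can be done without the UI. The sketch below is hedged: it assumes the Databricks SQL Warehouses REST endpoint (`POST /api/2.0/sql/warehouses`) and the `cluster_size` / `auto_stop_mins` field names as I understand them, the warehouse name is a placeholder, and the snippet is not part of the benchmark scripts — verify against the current Databricks docs before relying on it.
+
+```python
+# Hypothetical helper: create a SQL warehouse with a 5-minute auto-stop,
+# mirroring the manual UI setup described above. Endpoint and field names
+# are assumptions based on the Databricks SQL Warehouses REST API.
+import os
+import requests
+
+host = os.environ["DATABRICKS_SERVER_HOSTNAME"]  # same variables as in .env.example
+token = os.environ["DATABRICKS_TOKEN"]
+
+resp = requests.post(
+    f"https://{host}/api/2.0/sql/warehouses",
+    headers={"Authorization": f"Bearer {token}"},
+    json={
+        "name": "clickbench-large",  # placeholder warehouse name
+        "cluster_size": "Large",     # e.g. 2X-Small ... 4X-Large, matching the sizes benchmarked here
+        "auto_stop_mins": 5,         # the only setting changed from defaults in these runs
+    },
+    timeout=30,
+)
+resp.raise_for_status()
+warehouse_id = resp.json()["id"]
+
+# The HTTP path to put in .env is then /sql/1.0/warehouses/<warehouse_id>
+print(f"/sql/1.0/warehouses/{warehouse_id}")
+```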
+ +Once the warehouse was created, I'd save the warehouse path to use in the .env file for each run. \ No newline at end of file diff --git a/databricks/README.md b/databricks/README.md new file mode 100644 index 000000000..4b4bc47b3 --- /dev/null +++ b/databricks/README.md @@ -0,0 +1,47 @@ +## Setup + +1. Create a Databricks workspace and SQL Warehouse +2. Generate a personal access token from your Databricks workspace +3. Copy `.env.example` to `.env` and fill in your values: + +```bash +cp .env.example .env +# Edit .env with your actual credentials +``` + +Required environment variables: +- `DATABRICKS_SERVER_HOSTNAME`: Your workspace hostname (e.g., `dbc-xxxxxxxx-xxxx.cloud.databricks.com`) +- `DATABRICKS_HTTP_PATH`: SQL Warehouse path (e.g., `/sql/1.0/warehouses/your-warehouse-id`) +- `DATABRICKS_TOKEN`: Your personal access token +- `databricks_instance_type`: Instance type name for results file naming, e.g., "2X-Large" +- `DATABRICKS_CATALOG`: Unity Catalog name +- `DATABRICKS_SCHEMA`: Schema name +- `DATABRICKS_PARQUET_LOCATION`: S3 path to the parquet file + +## Running the Benchmark + +```bash +./benchmark.sh +``` + +## How It Works + +1. **benchmark.sh**: Entry point that installs dependencies via `uv` and runs the benchmark +2. **benchmark.py**: Orchestrates the full benchmark: + - Creates the catalog and schema + - Creates the `hits` table with explicit schema (including TIMESTAMP conversion) + - Loads data from the parquet file using `INSERT INTO` with type conversions + - Runs all queries via `run.sh` + - Collects timing metrics from Databricks REST API + - Outputs results to JSON in the `results/` directory +3. **run.sh**: Iterates through queries.sql and executes each query +4. **query.py**: Executes individual queries and retrieves execution times from Databricks REST API (`/api/2.0/sql/history/queries/{query_id}`) +5. 
**queries.sql**: Contains the 43 benchmark queries + +## Notes + +- Query execution times are pulled from the Databricks REST API, which provides server-side metrics +- The data is loaded from a parquet file with explicit type conversions (Unix timestamps → TIMESTAMP, Unix dates → DATE) +- The benchmark uses Databricks SQL Connector for Python +- Results include load time, data size, and individual query execution times (3 runs per query) +- Results are saved to `results/{instance_type}.json` diff --git a/databricks/benchmark.py b/databricks/benchmark.py new file mode 100755 index 000000000..497fee222 --- /dev/null +++ b/databricks/benchmark.py @@ -0,0 +1,361 @@ +#!/usr/bin/env python3 + +from databricks import sql +import json +import os +import sys +import subprocess +import time +import requests + +def write_result_to_file(run_metadata, query_results): + # Ensure results directory exists + os.makedirs('results', exist_ok=True) + + # Get instance type and convert to lowercase for filename + instance_type = os.getenv('databricks_instance_type') + if not instance_type: + raise Exception("Missing required environment variable: databricks_instance_type") + filename = os.path.join('results', instance_type.lower() + ".json") + with open(filename, 'w') as f: + print("{", file=f) + for key in run_metadata: + print(f'\t"{key}": {json.dumps(run_metadata[key])},', file=f) + + print('\t"result": [', file=f) + num_lines = len(query_results) + for i in range(num_lines): + print(f"\t\t{query_results[i]}", end='', file=f) + print("," if i < num_lines - 1 else "", file=f) + + print("\t]\n}", file=f) + +def load_data(run_metadata): + server_hostname = os.getenv('DATABRICKS_SERVER_HOSTNAME') + http_path = os.getenv('DATABRICKS_HTTP_PATH') + access_token = os.getenv('DATABRICKS_TOKEN') + catalog = os.getenv('DATABRICKS_CATALOG') + schema = os.getenv('DATABRICKS_SCHEMA') + parquet_location = os.getenv('DATABRICKS_PARQUET_LOCATION') + + if not all([server_hostname, http_path, access_token, catalog, schema, parquet_location]): + raise Exception("Missing required environment variables: DATABRICKS_SERVER_HOSTNAME, DATABRICKS_HTTP_PATH, DATABRICKS_TOKEN, DATABRICKS_CATALOG, DATABRICKS_SCHEMA, DATABRICKS_PARQUET_LOCATION") + + print(f'Connecting to Databricks; loading the data into {catalog}.{schema}', file=sys.stderr) + + connection = sql.connect( + server_hostname=server_hostname, + http_path=http_path, + access_token=access_token + ) + + cursor = connection.cursor() + + # Create catalog and schema if they don't exist + cursor.execute(f'CREATE CATALOG IF NOT EXISTS {catalog}') + cursor.execute(f'USE CATALOG {catalog}') + cursor.execute(f'CREATE SCHEMA IF NOT EXISTS {schema}') + cursor.execute(f'USE SCHEMA {schema}') + + print(f'Creating table and loading data from {parquet_location}...', file=sys.stderr) + + # Drop table if exists + cursor.execute(f'DROP TABLE IF EXISTS {catalog}.{schema}.hits') + + # Create table with explicit schema (EventTime as TIMESTAMP) + create_query = f""" + CREATE TABLE {catalog}.{schema}.hits ( + WatchID BIGINT NOT NULL, + JavaEnable SMALLINT NOT NULL, + Title STRING, + GoodEvent SMALLINT NOT NULL, + EventTime TIMESTAMP NOT NULL, + EventDate DATE NOT NULL, + CounterID INT NOT NULL, + ClientIP INT NOT NULL, + RegionID INT NOT NULL, + UserID BIGINT NOT NULL, + CounterClass SMALLINT NOT NULL, + OS SMALLINT NOT NULL, + UserAgent SMALLINT NOT NULL, + URL STRING, + Referer STRING, + IsRefresh SMALLINT NOT NULL, + RefererCategoryID SMALLINT NOT NULL, + RefererRegionID INT NOT NULL, + 
URLCategoryID SMALLINT NOT NULL, + URLRegionID INT NOT NULL, + ResolutionWidth SMALLINT NOT NULL, + ResolutionHeight SMALLINT NOT NULL, + ResolutionDepth SMALLINT NOT NULL, + FlashMajor SMALLINT NOT NULL, + FlashMinor SMALLINT NOT NULL, + FlashMinor2 STRING, + NetMajor SMALLINT NOT NULL, + NetMinor SMALLINT NOT NULL, + UserAgentMajor SMALLINT NOT NULL, + UserAgentMinor STRING NOT NULL, + CookieEnable SMALLINT NOT NULL, + JavascriptEnable SMALLINT NOT NULL, + IsMobile SMALLINT NOT NULL, + MobilePhone SMALLINT NOT NULL, + MobilePhoneModel STRING, + Params STRING, + IPNetworkID INT NOT NULL, + TraficSourceID SMALLINT NOT NULL, + SearchEngineID SMALLINT NOT NULL, + SearchPhrase STRING, + AdvEngineID SMALLINT NOT NULL, + IsArtifical SMALLINT NOT NULL, + WindowClientWidth SMALLINT NOT NULL, + WindowClientHeight SMALLINT NOT NULL, + ClientTimeZone SMALLINT NOT NULL, + ClientEventTime TIMESTAMP NOT NULL, + SilverlightVersion1 SMALLINT NOT NULL, + SilverlightVersion2 SMALLINT NOT NULL, + SilverlightVersion3 INT NOT NULL, + SilverlightVersion4 SMALLINT NOT NULL, + PageCharset STRING, + CodeVersion INT NOT NULL, + IsLink SMALLINT NOT NULL, + IsDownload SMALLINT NOT NULL, + IsNotBounce SMALLINT NOT NULL, + FUniqID BIGINT NOT NULL, + OriginalURL STRING, + HID INT NOT NULL, + IsOldCounter SMALLINT NOT NULL, + IsEvent SMALLINT NOT NULL, + IsParameter SMALLINT NOT NULL, + DontCountHits SMALLINT NOT NULL, + WithHash SMALLINT NOT NULL, + HitColor STRING NOT NULL, + LocalEventTime TIMESTAMP NOT NULL, + Age SMALLINT NOT NULL, + Sex SMALLINT NOT NULL, + Income SMALLINT NOT NULL, + Interests SMALLINT NOT NULL, + Robotness SMALLINT NOT NULL, + RemoteIP INT NOT NULL, + WindowName INT NOT NULL, + OpenerName INT NOT NULL, + HistoryLength SMALLINT NOT NULL, + BrowserLanguage STRING, + BrowserCountry STRING, + SocialNetwork STRING, + SocialAction STRING, + HTTPError SMALLINT NOT NULL, + SendTiming INT NOT NULL, + DNSTiming INT NOT NULL, + ConnectTiming INT NOT NULL, + ResponseStartTiming INT NOT NULL, + ResponseEndTiming INT NOT NULL, + FetchTiming INT NOT NULL, + SocialSourceNetworkID SMALLINT NOT NULL, + SocialSourcePage STRING, + ParamPrice BIGINT NOT NULL, + ParamOrderID STRING, + ParamCurrency STRING, + ParamCurrencyID SMALLINT NOT NULL, + OpenstatServiceName STRING, + OpenstatCampaignID STRING, + OpenstatAdID STRING, + OpenstatSourceID STRING, + UTMSource STRING, + UTMMedium STRING, + UTMCampaign STRING, + UTMContent STRING, + UTMTerm STRING, + FromTag STRING, + HasGCLID SMALLINT NOT NULL, + RefererHash BIGINT NOT NULL, + URLHash BIGINT NOT NULL, + CLID INT NOT NULL + ) + """ + cursor.execute(create_query) + + # Insert data from parquet file with type conversions + load_query = f""" + INSERT INTO {catalog}.{schema}.hits + SELECT + WatchID, + JavaEnable, + Title, + GoodEvent, + CAST(FROM_UNIXTIME(EventTime) AS TIMESTAMP) AS EventTime, + DATE_FROM_UNIX_DATE(EventDate) AS EventDate, + CounterID, + ClientIP, + RegionID, + UserID, + CounterClass, + OS, + UserAgent, + URL, + Referer, + IsRefresh, + RefererCategoryID, + RefererRegionID, + URLCategoryID, + URLRegionID, + ResolutionWidth, + ResolutionHeight, + ResolutionDepth, + FlashMajor, + FlashMinor, + FlashMinor2, + NetMajor, + NetMinor, + UserAgentMajor, + UserAgentMinor, + CookieEnable, + JavascriptEnable, + IsMobile, + MobilePhone, + MobilePhoneModel, + Params, + IPNetworkID, + TraficSourceID, + SearchEngineID, + SearchPhrase, + AdvEngineID, + IsArtifical, + WindowClientWidth, + WindowClientHeight, + ClientTimeZone, + CAST(FROM_UNIXTIME(ClientEventTime) AS 
TIMESTAMP) AS ClientEventTime, + SilverlightVersion1, + SilverlightVersion2, + SilverlightVersion3, + SilverlightVersion4, + PageCharset, + CodeVersion, + IsLink, + IsDownload, + IsNotBounce, + FUniqID, + OriginalURL, + HID, + IsOldCounter, + IsEvent, + IsParameter, + DontCountHits, + WithHash, + HitColor, + CAST(FROM_UNIXTIME(LocalEventTime) AS TIMESTAMP) AS LocalEventTime, + Age, + Sex, + Income, + Interests, + Robotness, + RemoteIP, + WindowName, + OpenerName, + HistoryLength, + BrowserLanguage, + BrowserCountry, + SocialNetwork, + SocialAction, + HTTPError, + SendTiming, + DNSTiming, + ConnectTiming, + ResponseStartTiming, + ResponseEndTiming, + FetchTiming, + SocialSourceNetworkID, + SocialSourcePage, + ParamPrice, + ParamOrderID, + ParamCurrency, + ParamCurrencyID, + OpenstatServiceName, + OpenstatCampaignID, + OpenstatAdID, + OpenstatSourceID, + UTMSource, + UTMMedium, + UTMCampaign, + UTMContent, + UTMTerm, + FromTag, + HasGCLID, + RefererHash, + URLHash, + CLID + FROM parquet.`{parquet_location}` + """ + + cursor.execute(load_query) + load_query_id = cursor.query_id + + # Get load time from REST API + print(f"Getting load time for query {load_query_id}...", file=sys.stderr) + max_retries = 3 + + for retry in range(max_retries): + time.sleep(2) + + url = f"https://{server_hostname}/api/2.0/sql/history/queries/{load_query_id}" + headers = { + "Authorization": f"Bearer {access_token}", + "Content-Type": "application/json" + } + + try: + response = requests.get(url, headers=headers, timeout=10) + if response.status_code == 200: + data = response.json() + if 'duration' in data: + load_duration = round(data['duration'] / 1000.0, 3) + run_metadata["load_time"] = load_duration + print(f"Table created successfully in {load_duration}s", file=sys.stderr) + break + except Exception as api_error: + print(f"API error on retry {retry + 1}: {api_error}", file=sys.stderr) + + # Get table size from DESCRIBE DETAIL + cursor.execute(f"DESCRIBE DETAIL {catalog}.{schema}.hits") + result = cursor.fetchone() + run_metadata["data_size"] = result[10] # sizeInBytes column + print(f"Table size: {run_metadata['data_size']} bytes", file=sys.stderr) + + print(f'Finished loading the data in {run_metadata["load_time"]}s; data size = {run_metadata["data_size"]} bytes', file=sys.stderr) + + cursor.close() + connection.close() + +def run_queries(): + # Run the benchmark script + result = subprocess.run( + ["./run.sh"], + stdout=subprocess.PIPE, + text=True, + timeout=3600, # 1 hour timeout + ) + + if result.returncode != 0: + raise Exception(f"Benchmark failed with return code {result.returncode}") + + return result.stdout + +if __name__ == "__main__": + instance_type = os.getenv('databricks_instance_type') + if not instance_type: + raise Exception("Missing required environment variable: databricks_instance_type") + + run_metadata = { + "system": "Databricks", + "date": time.strftime("%Y-%m-%d"), + "machine": f"Databricks: {instance_type}", + "cluster_size": "", + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + } + + load_data(run_metadata) + + query_output = run_queries() + + write_result_to_file(run_metadata, query_output.strip().split('\n')) diff --git a/databricks/benchmark.sh b/databricks/benchmark.sh new file mode 100755 index 000000000..6928ba071 --- /dev/null +++ b/databricks/benchmark.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Load environment variables +if [ -f .env ]; then + set -a + source .env + set +a +else + echo "Error: .env file not found. 
Please copy .env.example to .env and fill in your credentials." + exit 1 +fi + +# Create virtual environment if it doesn't exist +if [ ! -d ".venv" ]; then + uv venv +fi + +# Install dependencies +uv pip install databricks-sql-connector + +# Run benchmark +uv run python ./benchmark.py 2>&1 diff --git a/databricks/create.sql b/databricks/create.sql new file mode 100644 index 000000000..729e40370 --- /dev/null +++ b/databricks/create.sql @@ -0,0 +1,109 @@ +-- This is not used in the setup script, but is included here for reference. +-- The actual table is created in benchmark.py +CREATE OR REPLACE TABLE hits ( + WatchID BIGINT NOT NULL, + JavaEnable SMALLINT NOT NULL, + Title STRING, + GoodEvent SMALLINT NOT NULL, + EventTime TIMESTAMP NOT NULL, + EventDate DATE NOT NULL, + CounterID INT NOT NULL, + ClientIP INT NOT NULL, + RegionID INT NOT NULL, + UserID BIGINT NOT NULL, + CounterClass SMALLINT NOT NULL, + OS SMALLINT NOT NULL, + UserAgent SMALLINT NOT NULL, + URL STRING, + Referer STRING, + IsRefresh SMALLINT NOT NULL, + RefererCategoryID SMALLINT NOT NULL, + RefererRegionID INT NOT NULL, + URLCategoryID SMALLINT NOT NULL, + URLRegionID INT NOT NULL, + ResolutionWidth SMALLINT NOT NULL, + ResolutionHeight SMALLINT NOT NULL, + ResolutionDepth SMALLINT NOT NULL, + FlashMajor SMALLINT NOT NULL, + FlashMinor SMALLINT NOT NULL, + FlashMinor2 STRING, + NetMajor SMALLINT NOT NULL, + NetMinor SMALLINT NOT NULL, + UserAgentMajor SMALLINT NOT NULL, + UserAgentMinor STRING NOT NULL, + CookieEnable SMALLINT NOT NULL, + JavascriptEnable SMALLINT NOT NULL, + IsMobile SMALLINT NOT NULL, + MobilePhone SMALLINT NOT NULL, + MobilePhoneModel STRING, + Params STRING, + IPNetworkID INT NOT NULL, + TraficSourceID SMALLINT NOT NULL, + SearchEngineID SMALLINT NOT NULL, + SearchPhrase STRING, + AdvEngineID SMALLINT NOT NULL, + IsArtifical SMALLINT NOT NULL, + WindowClientWidth SMALLINT NOT NULL, + WindowClientHeight SMALLINT NOT NULL, + ClientTimeZone SMALLINT NOT NULL, + ClientEventTime TIMESTAMP NOT NULL, + SilverlightVersion1 SMALLINT NOT NULL, + SilverlightVersion2 SMALLINT NOT NULL, + SilverlightVersion3 INT NOT NULL, + SilverlightVersion4 SMALLINT NOT NULL, + PageCharset STRING, + CodeVersion INT NOT NULL, + IsLink SMALLINT NOT NULL, + IsDownload SMALLINT NOT NULL, + IsNotBounce SMALLINT NOT NULL, + FUniqID BIGINT NOT NULL, + OriginalURL STRING, + HID INT NOT NULL, + IsOldCounter SMALLINT NOT NULL, + IsEvent SMALLINT NOT NULL, + IsParameter SMALLINT NOT NULL, + DontCountHits SMALLINT NOT NULL, + WithHash SMALLINT NOT NULL, + HitColor STRING NOT NULL, + LocalEventTime TIMESTAMP NOT NULL, + Age SMALLINT NOT NULL, + Sex SMALLINT NOT NULL, + Income SMALLINT NOT NULL, + Interests SMALLINT NOT NULL, + Robotness SMALLINT NOT NULL, + RemoteIP INT NOT NULL, + WindowName INT NOT NULL, + OpenerName INT NOT NULL, + HistoryLength SMALLINT NOT NULL, + BrowserLanguage STRING, + BrowserCountry STRING, + SocialNetwork STRING, + SocialAction STRING, + HTTPError SMALLINT NOT NULL, + SendTiming INT NOT NULL, + DNSTiming INT NOT NULL, + ConnectTiming INT NOT NULL, + ResponseStartTiming INT NOT NULL, + ResponseEndTiming INT NOT NULL, + FetchTiming INT NOT NULL, + SocialSourceNetworkID SMALLINT NOT NULL, + SocialSourcePage STRING, + ParamPrice BIGINT NOT NULL, + ParamOrderID STRING, + ParamCurrency STRING, + ParamCurrencyID SMALLINT NOT NULL, + OpenstatServiceName STRING, + OpenstatCampaignID STRING, + OpenstatAdID STRING, + OpenstatSourceID STRING, + UTMSource STRING, + UTMMedium STRING, + UTMCampaign STRING, + UTMContent STRING, 
+ UTMTerm STRING, + FromTag STRING, + HasGCLID SMALLINT NOT NULL, + RefererHash BIGINT NOT NULL, + URLHash BIGINT NOT NULL, + CLID INT NOT NULL +); \ No newline at end of file diff --git a/databricks/queries.sql b/databricks/queries.sql new file mode 100644 index 000000000..8fafcbcf9 --- /dev/null +++ b/databricks/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '$1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), 
SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; 
+SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/databricks/query.py b/databricks/query.py new file mode 100755 index 000000000..0ec552c36 --- /dev/null +++ b/databricks/query.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 + +from databricks import sql +import os +import sys +import time +import requests + +query = sys.stdin.read() +print(f"running {query}", file=sys.stderr) + +# Get connection parameters from environment variables +server_hostname = os.getenv('DATABRICKS_SERVER_HOSTNAME') +http_path = os.getenv('DATABRICKS_HTTP_PATH') +access_token = os.getenv('DATABRICKS_TOKEN') +catalog = os.getenv('DATABRICKS_CATALOG', 'main') +schema = os.getenv('DATABRICKS_SCHEMA', 'clickbench') + +if not all([server_hostname, http_path, access_token]): + print("Error: Missing required environment variables:", file=sys.stderr) + print(" DATABRICKS_SERVER_HOSTNAME", file=sys.stderr) + print(" DATABRICKS_HTTP_PATH", file=sys.stderr) + print(" DATABRICKS_TOKEN", file=sys.stderr) + sys.exit(1) + +connection = sql.connect( + server_hostname=server_hostname, + http_path=http_path, + access_token=access_token, + catalog=catalog, + schema=schema +) + +print('[', end='') + +for try_num in range(3): + if try_num > 0: + print(',', end='') + + try: + cursor = connection.cursor() + + # Execute the query + cursor.execute(query) + results = cursor.fetchall() + query_id = cursor.query_id + + # Get execution time from REST API + duration = None + max_retries = 3 + + for retry in range(max_retries): + # Wait a moment for query to complete and be available + time.sleep(1 if retry == 0 else 2) + + # Call the query history API + url = f"https://{server_hostname}/api/2.0/sql/history/queries/{query_id}" + headers = { + "Authorization": f"Bearer {access_token}", + "Content-Type": "application/json" + } + + try: + response = requests.get(url, headers=headers, timeout=10) + if response.status_code == 200: + data = response.json() + if 'duration' in data: + # Duration is in milliseconds, convert to seconds + duration = 
round(data['duration'] / 1000.0, 3) + break + except Exception as api_error: + print(f"API error on retry {retry + 1}: {api_error}", file=sys.stderr) + + if duration is None: + # Fallback: if metrics aren't available after retries, use null + duration = 'null' + print(f"Could not retrieve metrics for query_id {query_id} after {max_retries} retries", file=sys.stderr) + + print(duration if isinstance(duration, str) else duration, end='') + + cursor.close() + except Exception as e: + print('null', end='') + print(f"query <{query.strip()}> errored out on attempt <{try_num+1}>: {e}", file=sys.stderr) + +print(']') + +connection.close() diff --git a/databricks/results/2x-large.json b/databricks/results/2x-large.json new file mode 100644 index 000000000..0c4709633 --- /dev/null +++ b/databricks/results/2x-large.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-06", + "machine": "Databricks: 2X-Large", + "cluster_size": 64, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 25.978, + "data_size": 10219802927, + "result": [ + [0.747, 0.108, 0.098], + [2.372, 0.107, 0.109], + [0.601, 0.095, 0.459], + [0.606, 0.11, 0.123], + [0.806, 0.115, 0.108], + [0.934, 0.108, 0.112], + [0.72, 0.107, 0.114], + [0.444, 0.099, 0.111], + [1.023, 0.103, 0.115], + [1.094, 0.108, 0.113], + [0.674, 0.097, 0.101], + [0.688, 0.103, 0.104], + [0.755, 0.104, 0.098], + [0.874, 0.104, 0.102], + [0.844, 0.096, 0.097], + [0.669, 0.101, 0.112], + [0.894, 0.096, 0.102], + [0.614, 0.095, 0.094], + [1.254, 0.121, 0.111], + [0.382, 0.092, 0.094], + [0.908, 0.095, 0.091], + [0.617, 0.113, 0.1], + [1.029, 0.115, 0.104], + [1.392, 0.1, 0.114], + [0.389, 0.108, 0.1], + [0.359, 0.1, 0.09], + [0.386, 0.1, 0.096], + [0.616, 0.102, 0.099], + [3.37, 0.103, 0.108], + [0.639, 0.15, 0.136], + [0.643, 0.095, 0.096], + [0.688, 0.104, 0.098], + [0.914, 0.094, 0.109], + [1.573, 0.107, 0.098], + [1.605, 0.108, 0.109], + [0.646, 0.108, 0.099], + [0.66, 0.112, 0.104], + [0.426, 0.093, 0.095], + [0.546, 0.108, 0.101], + [1.081, 0.102, 0.129], + [0.53, 0.103, 0.101], + [0.418, 0.131, 0.103], + [0.443, 0.105, 0.107] + ] +} diff --git a/databricks/results/2x-small.json b/databricks/results/2x-small.json new file mode 100644 index 000000000..9de0aa0a3 --- /dev/null +++ b/databricks/results/2x-small.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-06", + "machine": "Databricks: 2X-Small", + "cluster_size": 1, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 125.99, + "data_size": 10219802927, + "result": [ + [0.714, 0.128, 0.13], + [1.382, 0.129, 0.128], + [0.883, 0.129, 0.117], + [0.984, 0.12, 0.127], + [1.447, 0.119, 0.121], + [2.362, 0.117, 0.101], + [0.848, 0.113, 0.103], + [0.509, 0.104, 0.108], + [2.435, 0.108, 0.107], + [3.127, 0.118, 0.108], + [1.063, 0.104, 0.109], + [1.015, 0.12, 0.103], + [2.953, 0.121, 0.098], + [3.39, 0.125, 0.115], + [3.569, 0.101, 0.135], + [2.122, 0.12, 0.102], + [5.774, 0.098, 0.132], + [3.587, 0.104, 0.103], + [9.929, 0.147, 0.101], + [0.389, 0.104, 0.099], + [3.301, 0.102, 0.099], + [2.708, 0.104, 0.099], + [5.225, 0.113, 0.133], + [13.016, 0.105, 0.106], + [1.329, 0.103, 0.107], + [0.903, 0.099, 0.097], + [1.249, 0.13, 0.093], + [2.484, 0.108, 0.109], + [20.366, 0.119, 0.104], + [1.298, 0.154, 0.147], + [1.915, 0.096, 0.11], + [2.117, 0.095, 0.104], + [7.453, 0.119, 0.092], + [11.578, 0.095, 0.093], + [10.26, 0.113, 0.097], + [2.217, 0.101, 0.095], + [0.664, 0.11, 0.101], + [0.441, 
0.092, 0.106], + [0.683, 0.111, 0.106], + [1.11, 0.106, 0.114], + [0.539, 0.101, 0.095], + [0.434, 0.361, 0.096], + [0.483, 0.097, 0.105] + ] +} diff --git a/databricks/results/4x-large.json b/databricks/results/4x-large.json new file mode 100644 index 000000000..62224ed5c --- /dev/null +++ b/databricks/results/4x-large.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-06", + "machine": "Databricks: 4X-Large", + "cluster_size": 256, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 38.981, + "data_size": 10219802927, + "result": [ + [0.597, 0.158, 0.116], + [3.236, 0.113, 0.103], + [0.619, 0.105, 0.101], + [0.681, 0.132, 0.107], + [1.405, 0.108, 0.099], + [1.105, 0.103, 0.095], + [0.719, 0.101, 0.1], + [3.267, 0.104, 0.104], + [4.144, 0.101, 0.115], + [1.431, 0.108, 0.114], + [2.427, 0.101, 0.119], + [6.409, 0.11, 0.101], + [4.826, 0.096, 0.103], + [1.097, 0.102, 0.108], + [1.116, 0.103, 0.103], + [0.794, 0.109, 0.14], + [0.962, 0.095, 0.096], + [0.626, 0.096, 0.096], + [1.732, 0.125, 0.102], + [0.319, 0.092, 0.105], + [0.981, 0.096, 0.123], + [0.614, 0.103, 0.111], + [1.069, 0.1, 0.104], + [1.419, 0.105, 0.107], + [0.382, 0.091, 0.105], + [0.321, 0.104, 0.099], + [0.368, 0.108, 0.09], + [0.71, 0.099, 0.095], + [3.437, 0.109, 0.112], + [0.685, 0.157, 0.166], + [0.743, 0.099, 0.159], + [1.045, 0.094, 0.097], + [1.117, 0.095, 0.102], + [1.796, 0.093, 0.099], + [1.855, 0.097, 0.112], + [0.746, 0.104, 0.097], + [0.67, 0.104, 0.099], + [0.432, 0.094, 0.094], + [0.596, 0.095, 0.125], + [1.126, 0.105, 0.104], + [0.522, 0.103, 0.103], + [0.453, 0.126, 0.107], + [0.454, 0.106, 0.106] + ] +} diff --git a/databricks/results/large.json b/databricks/results/large.json new file mode 100644 index 000000000..afc2cabdc --- /dev/null +++ b/databricks/results/large.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-06", + "machine": "Databricks: Large", + "cluster_size": 16, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 20.207, + "data_size": 10219802927, + "result": [ + [0.606, 0.113, 0.122], + [2.986, 0.111, 0.108], + [0.592, 0.108, 0.105], + [0.704, 0.103, 0.103], + [0.814, 0.105, 0.108], + [0.897, 0.1, 0.115], + [0.776, 0.1, 0.104], + [0.437, 0.105, 0.105], + [1.137, 0.104, 0.1], + [1.104, 0.111, 0.115], + [0.796, 0.105, 0.105], + [0.702, 0.113, 0.104], + [0.669, 0.099, 0.104], + [0.935, 0.102, 0.101], + [0.816, 0.106, 0.11], + [0.602, 0.103, 0.092], + [0.86, 0.11, 0.107], + [0.875, 0.1, 0.095], + [1.851, 0.116, 0.107], + [0.328, 0.096, 0.093], + [1.007, 0.11, 0.11], + [0.615, 0.108, 0.1], + [1.385, 0.109, 0.124], + [1.913, 0.108, 0.102], + [0.403, 0.092, 0.098], + [0.336, 0.09, 0.097], + [0.392, 0.271, 0.107], + [0.724, 0.103, 0.105], + [3.35, 0.1, 0.103], + [0.621, 0.147, 0.137], + [0.695, 0.094, 0.09], + [0.744, 0.092, 0.093], + [1.463, 0.106, 0.095], + [1.776, 0.107, 0.098], + [1.821, 0.112, 0.115], + [0.597, 0.106, 0.102], + [0.843, 0.118, 0.114], + [0.47, 0.095, 0.18], + [0.544, 0.109, 0.109], + [1.071, 0.119, 0.102], + [0.501, 0.114, 0.095], + [0.424, 0.12, 0.095], + [0.43, 0.101, 0.112] + ] +} diff --git a/databricks/results/medium.json b/databricks/results/medium.json new file mode 100644 index 000000000..f2fb57124 --- /dev/null +++ b/databricks/results/medium.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-06", + "machine": "Databricks: Medium", + "cluster_size": 1, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", 
"column-oriented"], + "load_time": 58.818, + "data_size": 10219802927, + "result": [ + [0.655, 0.125, 0.12], + [2.588, 0.11, 0.114], + [0.872, 0.122, 0.109], + [0.895, 0.103, 0.117], + [0.829, 0.11, 0.108], + [1.106, 0.106, 0.108], + [0.934, 0.106, 0.101], + [0.607, 0.103, 0.11], + [1.303, 0.124, 0.109], + [1.28, 0.119, 0.143], + [0.836, 0.109, 0.103], + [0.818, 0.112, 0.103], + [0.796, 0.108, 0.105], + [0.941, 0.104, 0.111], + [1.004, 0.221, 0.105], + [0.72, 0.101, 0.107], + [1.32, 0.097, 0.106], + [0.872, 0.108, 0.1], + [2.514, 0.12, 0.097], + [0.374, 0.096, 0.096], + [1.302, 0.102, 0.095], + [0.784, 0.113, 0.106], + [1.676, 0.114, 0.103], + [2.723, 0.105, 0.117], + [0.499, 0.098, 0.095], + [0.407, 0.094, 0.108], + [0.482, 0.107, 0.102], + [0.792, 0.115, 0.111], + [3.748, 0.105, 0.113], + [0.695, 0.158, 0.15], + [0.702, 0.169, 0.12], + [0.818, 0.098, 0.11], + [2.375, 0.147, 0.099], + [2.13, 0.101, 0.099], + [2.07, 0.104, 0.108], + [0.729, 0.099, 0.101], + [0.699, 0.112, 0.112], + [0.488, 0.095, 0.097], + [0.62, 0.103, 0.106], + [1.167, 0.106, 0.109], + [0.525, 0.107, 0.109], + [0.454, 0.141, 0.119], + [0.477, 0.122, 0.101] + ] +} diff --git a/databricks/results/small.json b/databricks/results/small.json new file mode 100644 index 000000000..7a9721053 --- /dev/null +++ b/databricks/results/small.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-06", + "machine": "Databricks: Small", + "cluster_size": 4, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 57.949, + "data_size": 10219802927, + "result": [ + [0.61, 0.104, 0.114], + [1.338, 0.1, 0.108], + [0.627, 0.098, 0.112], + [0.752, 0.106, 0.097], + [0.9, 0.109, 0.095], + [1.183, 0.102, 0.099], + [0.782, 0.098, 0.098], + [0.427, 0.108, 0.116], + [1.279, 0.102, 0.106], + [1.459, 0.112, 0.113], + [0.799, 0.11, 0.108], + [0.795, 0.109, 0.097], + [1.054, 0.101, 0.102], + [1.199, 0.102, 0.108], + [1.138, 0.101, 0.1], + [0.841, 0.098, 0.097], + [3.005, 0.12, 0.096], + [1.282, 0.094, 0.09], + [4.636, 0.105, 0.131], + [0.385, 0.087, 0.099], + [1.598, 0.092, 0.096], + [1.028, 0.102, 0.105], + [1.797, 0.108, 0.101], + [4.426, 0.102, 0.105], + [0.746, 0.096, 0.114], + [0.452, 0.103, 0.095], + [0.576, 0.122, 0.103], + [0.984, 0.102, 0.107], + [6.114, 0.105, 0.112], + [0.751, 0.204, 0.13], + [0.827, 0.104, 0.093], + [1.169, 0.101, 0.091], + [4.099, 0.095, 0.092], + [3.929, 0.106, 0.09], + [3.93, 0.095, 0.099], + [0.971, 0.1, 0.096], + [0.692, 0.106, 0.097], + [0.457, 0.1, 0.099], + [0.571, 0.097, 0.107], + [1.181, 0.113, 0.099], + [0.498, 0.156, 0.11], + [0.467, 0.104, 0.1], + [0.436, 0.104, 0.115] + ] +} diff --git a/databricks/results/x-large.json b/databricks/results/x-large.json new file mode 100644 index 000000000..c2207e129 --- /dev/null +++ b/databricks/results/x-large.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-06", + "machine": "Databricks: X-Large", + "cluster_size": 32, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 30.201, + "data_size": 10219802927, + "result": [ + [0.625, 0.126, 0.119], + [1.28, 0.115, 0.11], + [0.765, 0.107, 0.111], + [0.678, 0.11, 0.128], + [0.771, 0.109, 0.106], + [0.879, 0.107, 0.114], + [0.714, 0.113, 0.105], + [2.3, 0.108, 0.111], + [3.976, 0.114, 0.106], + [1.284, 0.116, 0.114], + [0.716, 0.107, 0.103], + [0.738, 0.113, 0.106], + [0.778, 0.12, 0.108], + [0.818, 0.105, 0.116], + [0.839, 0.102, 0.124], + [2.784, 0.124, 0.099], + [1.041, 0.097, 0.099], + [2.169, 0.11, 0.094], 
+ [3.134, 0.122, 0.098], + [0.492, 0.093, 0.093], + [1.983, 0.102, 0.092], + [0.629, 0.107, 0.103], + [1.217, 0.1, 0.101], + [1.853, 0.11, 0.123], + [0.455, 0.105, 0.09], + [0.335, 0.099, 0.1], + [0.431, 0.094, 0.098], + [0.668, 0.096, 0.105], + [3.355, 0.109, 0.113], + [0.679, 0.157, 0.143], + [0.629, 0.097, 0.106], + [0.827, 0.11, 0.111], + [1.064, 0.102, 0.116], + [1.913, 0.106, 0.094], + [1.906, 0.111, 0.106], + [0.665, 0.108, 0.106], + [0.653, 0.1, 0.111], + [0.422, 0.102, 0.097], + [0.555, 0.113, 0.108], + [1.054, 0.103, 0.114], + [0.497, 0.105, 0.103], + [0.402, 0.139, 0.102], + [0.446, 0.108, 0.106] + ] +} diff --git a/databricks/results/x-small.json b/databricks/results/x-small.json new file mode 100644 index 000000000..65fe38aad --- /dev/null +++ b/databricks/results/x-small.json @@ -0,0 +1,56 @@ +{ + "system": "Databricks", + "date": "2025-11-06", + "machine": "Databricks: X-Small", + "cluster_size": 2, + "proprietary": "yes", + "tuned": "no", + "tags": ["managed", "column-oriented"], + "load_time": 78.857, + "data_size": 10219802927, + "result": [ + [0.737, 0.228, 0.15], + [1.412, 0.131, 0.131], + [0.807, 0.123, 0.116], + [0.932, 0.115, 0.125], + [1.119, 0.116, 0.116], + [1.743, 0.158, 0.114], + [0.887, 0.152, 0.105], + [0.515, 0.102, 0.124], + [1.793, 0.104, 0.108], + [1.905, 0.132, 0.113], + [0.884, 0.104, 0.103], + [0.867, 0.125, 0.106], + [1.453, 0.117, 0.107], + [1.863, 0.108, 0.122], + [2.464, 0.126, 0.101], + [1.315, 0.103, 0.102], + [3.63, 0.102, 0.125], + [2.14, 0.114, 0.099], + [6.03, 0.11, 0.1], + [0.374, 0.106, 0.098], + [1.982, 0.105, 0.101], + [1.621, 0.121, 0.1], + [2.877, 0.107, 0.102], + [6.877, 0.112, 0.108], + [0.833, 0.099, 0.097], + [0.573, 0.097, 0.114], + [0.851, 0.105, 0.107], + [1.522, 0.11, 0.11], + [11.223, 0.126, 0.11], + [0.95, 0.142, 0.129], + [1.263, 0.102, 0.097], + [1.902, 0.103, 0.111], + [5.577, 0.113, 0.099], + [5.908, 0.103, 0.095], + [6.459, 0.122, 0.121], + [1.433, 0.111, 0.104], + [0.837, 0.098, 0.1], + [0.475, 0.102, 0.103], + [0.708, 0.109, 0.108], + [1.196, 0.109, 0.113], + [0.473, 0.114, 0.101], + [0.409, 0.128, 0.113], + [0.443, 0.11, 0.212] + ] +} diff --git a/databricks/run.sh b/databricks/run.sh new file mode 100755 index 000000000..b335f912b --- /dev/null +++ b/databricks/run.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Determine the directory of this script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Use venv python if available, otherwise system python +if [ -f "$SCRIPT_DIR/.venv/bin/python3" ]; then + PYTHON="$SCRIPT_DIR/.venv/bin/python3" +else + PYTHON="python3" +fi + +cat queries.sql | while read -r query; do + $PYTHON ./query.py <<< "${query}" +done