-
Notifications
You must be signed in to change notification settings - Fork 608
85 lines (65 loc) · 2.42 KB
/
ibis-benchmarks.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
name: Benchmarks

# Triggered by pushes to `main` or to any branch matching `*.x.x`, and by
# merge-group (merge queue) events.
on:
  push:
    branches: [main, "*.x.x"]
  merge_group:

# Results are written to cloud storage, so runs sharing this group (one group
# per repository) must not execute at the same time.
concurrency:
  group: benchmarks-${{ github.repository }}
jobs:
  # Runs the benchmark suite, converts the JSON results to Parquet with the
  # DuckDB CLI, and uploads the Parquet file to a GCS bucket.
  benchmarks:
    runs-on: ubuntu-latest
    # Declaring permissions explicitly limits GITHUB_TOKEN to the scopes listed
    # here. The same token is what `gh release download` authenticates with in
    # the duckdb download step below.
    permissions:
      contents: "read"
      # required for GCP workload identity federation
      id-token: "write"
    steps:
      - name: checkout
        uses: actions/checkout@v4

      - name: install python
        uses: actions/setup-python@v5
        id: install_python
        with:
          python-version: "3.11"

      - name: install uv
        uses: astral-sh/setup-uv@v5.2.1

      - name: install system dependencies
        run: |
          set -euo pipefail

          # refresh the package index first; the index baked into the runner
          # image can be stale, which makes `apt-get install` fail to fetch
          sudo apt-get update -qq
          sudo apt-get install -qq -y build-essential libgeos-dev freetds-dev unixodbc-dev

      - name: make benchmark output dir
        # -p: do not fail if the directory already exists
        run: mkdir -p .benchmarks

      - name: benchmark
        # folded scalar: the lines below are joined with single spaces into one
        # command
        run: >-
          uv run --all-extras --group tests
          pytest --benchmark-enable
          --benchmark-json .benchmarks/output.json
          ibis/tests/benchmarks

      - name: authenticate to google cloud
        uses: google-github-actions/auth@v2
        with:
          project_id: "ibis-gbq"
          workload_identity_provider: "${{ vars.WIF_PROVIDER_NAME }}"

      - name: set up gcloud
        uses: google-github-actions/setup-gcloud@v2

      - name: show gcloud info
        run: gcloud info

      - name: download the latest duckdb release
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail

          gh release download -R duckdb/duckdb --pattern 'duckdb_cli-linux-amd64.zip'
          unzip duckdb_cli-linux-amd64.zip

      - name: convert json data to parquet
        run: |
          set -euo pipefail

          # sort json keys; `.` is the identity filter, -c writes the whole
          # document compactly so the newline-delimited JSON reader below can
          # consume it
          jq --sort-keys -rcM . < "$PWD/.benchmarks/output.json" > output.json

          # connect to a file to allow spilling to disk
          # the delimiter is quoted so the shell passes the SQL through verbatim
          ./duckdb json2parquet.ddb <<'EOF'
            COPY (
              SELECT * FROM read_ndjson_auto('output.json', maximum_object_size=2**27)
            ) TO 'output.parquet' (FORMAT PARQUET, COMPRESSION ZSTD)
          EOF

      - name: copy data to gcs
        run: |
          set -euo pipefail

          # nanosecond-resolution UTC timestamp used as the object name; any
          # comma decimal separator emitted by `date` is normalised to a dot
          timestamp="$(date --iso-8601=ns --utc | tr ',' '.')"
          gsutil cp output.parquet "gs://ibis-benchmark-data/ci/${timestamp}.parquet"