diff --git a/.github/workflows/export-parquet.yml b/.github/workflows/export-parquet.yml
new file mode 100644
index 0000000..6f659a6
--- /dev/null
+++ b/.github/workflows/export-parquet.yml
@@ -0,0 +1,23 @@
+name: export-parquet
+on:
+  workflow_dispatch: {}
+jobs:
+  export:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install deps
+        run: |
+          python -m pip install --upgrade pip
+          pip install pandas pyarrow
+      - name: Export parquet to JSONL
+        run: |
+          python scripts/export_parquet.py
+      - name: Upload artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: rows-jsonl
+          path: rows.jsonl
diff --git a/scripts/export_parquet.py b/scripts/export_parquet.py
new file mode 100644
index 0000000..691d68e
--- /dev/null
+++ b/scripts/export_parquet.py
@@ -0,0 +1,14 @@
+"""Export three columns of the SWE-bench_Verified test parquet file to rows.jsonl."""
+import json
+from pathlib import Path
+
+import pandas as pd
+
+COLUMNS = ['repo', 'instance_id', 'environment_setup_commit']
+p = Path('SWE-bench_Verified/test-00000-of-00001.parquet')
+df = pd.read_parquet(p, columns=COLUMNS)
+with open('rows.jsonl', 'w', encoding='utf-8') as f:
+    # Serialize with json.dumps rather than patching quotes in repr() output:
+    # repr-based text is not valid JSON when a value contains quotes or backslashes.
+    for record in df.to_dict(orient='records'):
+        f.write(json.dumps(record, ensure_ascii=False) + '\n')