diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..7f40213 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,7 @@ +.github +dist +docs +examples +experiments +result +*.tar diff --git a/.gitignore b/.gitignore index c4e9286..28c1f7f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,15 @@ .* !.gitignore !.github +!.dockerignore # Python *.py[cod] *.egg-info # Output +*.zip +*.tar /dist /venv +/result diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..1c8a651 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,17 @@ +FROM ubuntu:24.10 + +# Install Python and Poetry +RUN apt update &&\ + apt install -y python3 curl git &&\ + curl -sSL https://install.python-poetry.org | python3 - + +# Install Lean +RUN curl -sSfL https://github.com/leanprover/elan/releases/download/v3.1.1/elan-x86_64-unknown-linux-gnu.tar.gz | tar xz &&\ + ./elan-init -y --default-toolchain none +ENV PATH="$PATH:/root/.local/bin:/root/.elan/bin" + +COPY . /root/pantograph +WORKDIR /root/pantograph + +RUN poetry build &&\ + poetry install diff --git a/experiments/ARTEFACT.md b/experiments/ARTEFACT.md new file mode 100644 index 0000000..2622d7d --- /dev/null +++ b/experiments/ARTEFACT.md @@ -0,0 +1,28 @@ +# Artefact Packing + +This is about how to pack the artefacts described in `README.md`. + +## Execute the DSP experiments + +Run the DSP experiments from the project root, +```sh +python experiments/dsp/main.py eval \ + --dataset experiments/minif2f/valid.jsonl --format minif2f --output result/ref-4o/valid1 +python experiments/dsp/main.py eval \ + --dataset experiments/minif2f/valid.jsonl --format minif2f --n-samples 3 --output result/ref-4o/valid3 +python experiments/dsp/main.py eval \ + --dataset experiments/minif2f/test.jsonl --format minif2f --output result/ref-4o/test1 +python experiments/dsp/main.py eval \ + --dataset experiments/minif2f/test.jsonl --format minif2f --n-samples 3 --output result/ref-4o/test3 + +python experiments/dsp/main.py eval \ + --dataset experiments/minif2f/valid.jsonl --format minif2f --model o1-preview --output result/ref-o1/valid1 +python experiments/dsp/main.py eval \ + --dataset experiments/minif2f/test.jsonl --format minif2f --model o1-preview --output result/ref-o1/test1 +``` +Then, pack it with the script + +```sh +bash experiments/dsp/pack-artefact.sh +``` + diff --git a/experiments/README.md b/experiments/README.md new file mode 100644 index 0000000..434a7f8 --- /dev/null +++ b/experiments/README.md @@ -0,0 +1,85 @@ +# Experiments + +Unpack the `artefact.zip` file. Then cd into the unpacked directory and execute +the commands below. + +## Environment + +The experiments require +- Internet access +- At least 15G of disk space on the drive hosting this repository +- An [OpenAI API](https://openai.com/index/openai-api/) key, for invoking the models. + +Set the `OPENAI_API_KEY` environment variable to the API key. + +### Building/Loading the container + +There are 3 ways to load the container. Choose one. +- (Artefact reviewer) If you **have** the docker image: +``` sh +docker load --input pantograph.tar +``` +- Use the docker repository +``` sh +docker pull chrysoberyl/pantograph +``` +- Build the docker image from scratch +``` sh +docker build . --tag chrysoberyl/pantograph +``` + +The docker image's name **must be** `chrysoberyl/pantograph` for the shell +scripts below to work. + +## Experiments + +The experiments are bound by limitations of OpenAI. Since OpenAI as a commercial +company cannot indefinitely store all snapshots of their models, the experiments +rely on OpenAI's provided version of `o1-preview` and `gpt-4o` models. This may +impact the reproducibility of the experiments in the future. + +### Early Evaluation + +Due to the nature of this project and how Lean's build system works, the first +pass needs to build the Mathlib library. This building of the library will take +about 30 minutes to run. + +Execute in the project root directory +``` sh +experiments/dsp/early.sh +``` +the results will be written to `result/` + +All experiments are cached, so if they crash halfway (which really should not +happen) you can continue to run them and they'll pick up from where they left +off. Hence you can delete the relevant folders in `result/` to force an +experiment to restart from the beginning. + +### Plots for the paper + +To generate the plots for the paper, execute + +``` sh +bash experiments/dsp/ref-plots.sh +``` + +which will output the plots in `result/ref-{o1,4o}/plot` based on the provided +experiment result data in `result/ref-*` + +### GPT-4o experiment + +``` sh +bash experiments/dsp/all-4o.sh +bash experiments/dsp/plot-4o.sh +``` + +which will output the plots in `result/4o/plot` + +### o1-preview experiment + +``` sh +bash experiments/dsp/all-o1.sh +bash experiments/dsp/plot-o1.sh +``` + +which will output the plots in `result/o1/plot` diff --git a/experiments/dsp/all-4o.sh b/experiments/dsp/all-4o.sh new file mode 100755 index 0000000..93689ea --- /dev/null +++ b/experiments/dsp/all-4o.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +IMAGE=chrysoberyl/pantograph +BASE_DIR=/data/result/4o +main() { + docker run --rm -it \ + --volume $PWD:/data \ + --entrypoint poetry \ + --env OPENAI_API_KEY=$OPENAI_API_KEY \ + $IMAGE run \ + python /data/experiments/dsp/main.py eval $@ +} +main --dataset /data/experiments/dsp/ --dataset /data/experiments/minif2f/valid.jsonl --format minif2f --output $BASE_DIR/valid1 +main --dataset /data/experiments/dsp/ --dataset /data/experiments/minif2f/valid.jsonl --n-samples 3 --format minif2f --output $BASE_DIR/valid3 +main --dataset /data/experiments/dsp/ --dataset /data/experiments/minif2f/test.jsonl --format minif2f --output $BASE_DIR/test1 +main --dataset /data/experiments/dsp/ --dataset /data/experiments/minif2f/test.jsonl --n-samples 3 --format minif2f --output $BASE_DIR/test3 diff --git a/experiments/dsp/all-o1.sh b/experiments/dsp/all-o1.sh new file mode 100755 index 0000000..2c3c9af --- /dev/null +++ b/experiments/dsp/all-o1.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +IMAGE=chrysoberyl/pantograph +BASE_DIR=/data/result/o1 + +main() { + docker run --rm -it \ + --volume $PWD:/data \ + --entrypoint poetry \ + --env OPENAI_API_KEY=$OPENAI_API_KEY \ + $IMAGE run \ + python /data/experiments/dsp/main.py eval $@ +} +main --dataset /data/experiments/dsp/ --dataset /data/experiments/minif2f/valid.jsonl --format minif2f --model o1-preview --output $BASE_DIR/valid1 +main --dataset /data/experiments/dsp/ --dataset /data/experiments/minif2f/test.jsonl --format minif2f --model o1-preview --output $BASE_DIR/test1 diff --git a/experiments/dsp/early.sh b/experiments/dsp/early.sh new file mode 100755 index 0000000..0c87185 --- /dev/null +++ b/experiments/dsp/early.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +IMAGE=chrysoberyl/pantograph + +docker run --rm -it \ + --volume $PWD:/data \ + --entrypoint poetry \ + --env OPENAI_API_KEY=$OPENAI_API_KEY \ + $IMAGE run \ + python /data/experiments/dsp/main.py eval --output /data/result/debug diff --git a/experiments/dsp/lean_src_proj/LeanSrcProj.lean b/experiments/dsp/lean_src_proj/LeanSrcProj.lean index e99d3a6..399825c 100644 --- a/experiments/dsp/lean_src_proj/LeanSrcProj.lean +++ b/experiments/dsp/lean_src_proj/LeanSrcProj.lean @@ -1 +1,2 @@ -def hello := "world" \ No newline at end of file +import Mathlib +import Aesop diff --git a/experiments/dsp/main.py b/experiments/dsp/main.py index 8437b85..57407b6 100644 --- a/experiments/dsp/main.py +++ b/experiments/dsp/main.py @@ -1,4 +1,5 @@ import sys, os, json, subprocess, time, datetime +from abc import abstractmethod from pathlib import Path from dataclasses import asdict from typing import Union, Any, Tuple, Optional @@ -39,6 +40,13 @@ def __init__(self): def __call__(self, *args, **kwards): pass + @abstractmethod + def sample_draft(self, prompt: str): + pass + @abstractmethod + def sample_sketch(self, prompt: str): + pass + class OpenAI_DSP_Engine(Engine): def __init__( self, @@ -88,6 +96,7 @@ def role_prompt(self) -> str: def sample_draft(self, prompt: str): extra = {} if self.model.startswith("o1") else dict( + seed=0, temperature=self.draft_sampling_params.temperature, top_p=self.draft_sampling_params.top_p, stop=self.draft_sampling_params.stop[:3], @@ -103,6 +112,7 @@ def sample_draft(self, prompt: str): ) def sample_sketch(self, prompt: str): extra = {} if self.model.startswith("o1") else dict( + seed=0, temperature=self.sketch_sampling_params.temperature, top_p=self.sketch_sampling_params.top_p, ) @@ -255,14 +265,20 @@ def single_proof_search_dsp_lean( ) -> DatumResult: start_time = time.time() - # -- Draft: [y_nl_pred_draft]_n ~ draft(eng, x_nl_prob, P_draft) - y_nl_pred_drafts = step_draft(eng, datum) - - # -- Sketch: z_fl_pred_sketch ~ sketch(eng, x_nl_prob, [y_nl_pred_draft]_n, x_fl_prob, P_sketch) - z_fl_pred_sketches, x_fl_prob = step_sketch(eng, datum, y_nl_pred_drafts) + try: + # -- Draft: [y_nl_pred_draft]_n ~ draft(eng, x_nl_prob, P_draft) + y_nl_pred_drafts = step_draft(eng, datum) - assert len(z_fl_pred_sketches) == eng.sketch_sampling_params.n + # -- Sketch: z_fl_pred_sketch ~ sketch(eng, x_nl_prob, [y_nl_pred_draft]_n, x_fl_prob, P_sketch) + z_fl_pred_sketches, x_fl_prob = step_sketch(eng, datum, y_nl_pred_drafts) + assert len(z_fl_pred_sketches) == eng.sketch_sampling_params.n + except Exception as e: + print(colored(f"Failed to create sketch/draft: {e}", "red")) + return DatumResult( + name=str(datum), + error=str(e), + ) results = [] success = False @@ -273,17 +289,25 @@ def single_proof_search_dsp_lean( try: server = server_func() except Exception as e: - print(colored("Failed to create server: {e}", "red")) + print(colored(f"Failed to create server: {e}", "red")) return DatumResult( name=str(datum), error=str(e), ) - # -- Prove: y_fl = prove(eng, x_fl_prob, z_fl_pred_sketches) - prove_result = step_prove(eng, server, x_fl_prob, sketch) - results.append(prove_result) - if isinstance(prove_result, SearchResult) and prove_result.success: - success = True - break + try: + # -- Prove: y_fl = prove(eng, x_fl_prob, z_fl_pred_sketches) + prove_result = step_prove(eng, server, x_fl_prob, sketch) + results.append(prove_result) + if isinstance(prove_result, SearchResult) and prove_result.success: + success = True + break + except Exception as e: + print(colored(f"Search failed: {e}", "red")) + return DatumResult( + name=str(datum), + error=str(e), + ) + duration = time.time() - start_time return DatumResult( @@ -329,11 +353,6 @@ def full_proof_search_dsp_lean( experiment_dir = Path(__file__).resolve().parent -def get_project_and_lean_path(): - cwd = experiment_dir / 'lean_src_proj' - p = subprocess.check_output(['lake', 'env', 'printenv', 'LEAN_PATH'], cwd=cwd) - return cwd, p - def load_data(args) -> list[Datum]: p = Path(args.dataset).expanduser() data = None @@ -363,13 +382,21 @@ def main(args): path_output = Path(args.output) path_output.mkdir(exist_ok=True, parents=True) + project_path = experiment_dir / 'lean_src_proj' + + print("Building src project ...") + while True: + # Lean sometimes fails to build. Try until it succeeds. + completion = subprocess.run(['lake', 'build'], cwd=project_path, check=False) + if completion.returncode == 0: + break; + print("Built src project") + # Start server - project_path, lean_path = get_project_and_lean_path() def server_func(): return Server( imports=["Mathlib", "Aesop"], project_path=project_path, - lean_path=lean_path, core_options=DEFAULT_CORE_OPTIONS, ) diff --git a/experiments/dsp/pack-artefact.sh b/experiments/dsp/pack-artefact.sh new file mode 100644 index 0000000..ac7db5c --- /dev/null +++ b/experiments/dsp/pack-artefact.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# Packs the artefact +# Execute this at the project root directory to produce `artefact.zip` + +echo "Packing Git archive" +git-archive-all artefact.zip +echo "Packing docker image" +docker image save chrysoberyl/pantograph > pantograph.tar +echo "Adding experimental results" +zip -ur artefact.zip result/ref-4o/{valid1,valid3,test1,test3} result/ref-o1/{valid1,valid3} pantograph.tar diff --git a/experiments/dsp/plot-4o.sh b/experiments/dsp/plot-4o.sh new file mode 100644 index 0000000..0ea3869 --- /dev/null +++ b/experiments/dsp/plot-4o.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env sh + +IMAGE=chrysoberyl/pantograph +BASE_DIR=/data/result/4o + +plot() { + docker run --rm -it \ + --volume $PWD:/data \ + --entrypoint poetry \ + --env OPENAI_API_KEY=$OPENAI_API_KEY \ + $IMAGE run \ + python /data/experiments/dsp/plot.py $@ +} +plot --result $BASE_DIR/{valid1,valid3,test1,test3} --names valid1 valid3 test1 test3 --plot-output $BASE_DIR/plot diff --git a/experiments/dsp/plot-o1.sh b/experiments/dsp/plot-o1.sh new file mode 100644 index 0000000..212a562 --- /dev/null +++ b/experiments/dsp/plot-o1.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env sh + +IMAGE=chrysoberyl/pantograph +BASE_DIR=/data/result/o1 + +plot() { + docker run --rm -it \ + --volume $PWD:/data \ + --entrypoint poetry \ + --env OPENAI_API_KEY=$OPENAI_API_KEY \ + $IMAGE run \ + python /data/experiments/dsp/plot.py $@ +} +plot --result $BASE_DIR/{valid1,test1} --names valid1 test1 --plot-output $BASE_DIR/plot diff --git a/experiments/dsp/ref-plots.sh b/experiments/dsp/ref-plots.sh new file mode 100644 index 0000000..d17a1da --- /dev/null +++ b/experiments/dsp/ref-plots.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env sh + +IMAGE=chrysoberyl/pantograph +plot() { + docker run --rm -it \ + --volume $PWD:/data \ + --entrypoint poetry \ + --env OPENAI_API_KEY=$OPENAI_API_KEY \ + $IMAGE run \ + python /data/experiments/dsp/plot.py $@ +} + +BASE_DIR=/data/result/ref-4o +plot --result $BASE_DIR/{valid1,test1,valid3,test3} --names valid1 test1 valid3 test3 --plot-output $BASE_DIR/plot +BASE_DIR=/data/result/ref-o1 +plot --result $BASE_DIR/{valid1,test1} --names valid1 test1 --plot-output $BASE_DIR/plot diff --git a/poetry.lock b/poetry.lock index 2a7ad49..20e0217 100644 --- a/poetry.lock +++ b/poetry.lock @@ -851,6 +851,17 @@ test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask-expr", "dask[dataframe, test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"] tqdm = ["tqdm"] +[[package]] +name = "git-archive-all" +version = "1.23.1" +description = "Archive git repository with its submodules." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.6" +files = [ + {file = "git-archive-all-1.23.1.tar.gz", hash = "sha256:a42fe0cedd2e0361250a4f3130f3d3543f640d4f1fe28530771df5972108729f"}, + {file = "git_archive_all-1.23.1-py2.py3-none-any.whl", hash = "sha256:d9f9611f2df629de3df1d6f134955ced4c704cbc0e423d769a7c0e55a8484242"}, +] + [[package]] name = "gitdb" version = "4.0.11" @@ -4760,4 +4771,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "12935bf20e5ac4351734a47b1a71486daccbc0eed1239abe67858b3b17df8b37" +content-hash = "1ede574e6f55d5ed07f6064fd08c745bc6f3b0d8a22d87a9f7ad7ef4418ccb47" diff --git a/pyproject.toml b/pyproject.toml index ef55b3c..84e3f50 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ termcolor = "^2.4.0" matplotlib = "^3.9.2" seaborn = "^0.13.2" pandas = "^2.2.3" +git-archive-all = "^1.23.1" [tool.poetry.group.doc] optional = true