stanford-centaur · lenianiva · Oct 21, 2024 · Oct 21, 2024 · Oct 21, 2024 · Oct 21, 2024
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,7 @@
+.github
+dist
+docs
+examples
+experiments
+result
+*.tar
diff --git a/.gitignore b/.gitignore
@@ -1,11 +1,15 @@
 .*
 !.gitignore
 !.github
+!.dockerignore
 
 # Python
 *.py[cod]
 *.egg-info
 
 # Output
+*.zip
+*.tar
 /dist
 /venv
+/result
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,17 @@
+FROM ubuntu:24.10
+
+# Install Python and Poetry
+RUN apt update &&\
+	apt install -y python3 curl git &&\
+	curl -sSL https://install.python-poetry.org | python3 -
+
+# Install Lean
+RUN curl -sSfL https://github.com/leanprover/elan/releases/download/v3.1.1/elan-x86_64-unknown-linux-gnu.tar.gz | tar xz &&\
+	./elan-init -y --default-toolchain none
+ENV PATH="$PATH:/root/.local/bin:/root/.elan/bin"
+
+COPY . /root/pantograph
+WORKDIR /root/pantograph
+
+RUN poetry build &&\
+	poetry install
diff --git a/experiments/ARTEFACT.md b/experiments/ARTEFACT.md
@@ -0,0 +1,28 @@
+# Artefact Packing
+
+This is about how to pack the artefacts described in `README.md`.
+
+## Execute the DSP experiments
+
+Run the DSP experiments from the project root,
+```sh
+python experiments/dsp/main.py eval \
+	--dataset experiments/minif2f/valid.jsonl --format minif2f --output result/ref-4o/valid1
+python experiments/dsp/main.py eval \
+	--dataset experiments/minif2f/valid.jsonl --format minif2f --n-samples 3 --output result/ref-4o/valid3
+python experiments/dsp/main.py eval \
+	--dataset experiments/minif2f/test.jsonl --format minif2f --output result/ref-4o/test1
+python experiments/dsp/main.py eval \
+	--dataset experiments/minif2f/test.jsonl --format minif2f --n-samples 3 --output result/ref-4o/test3
+
+python experiments/dsp/main.py eval \
+	--dataset experiments/minif2f/valid.jsonl --format minif2f --model o1-preview --output result/ref-o1/valid1
+python experiments/dsp/main.py eval \
+	--dataset experiments/minif2f/test.jsonl --format minif2f --model o1-preview --output result/ref-o1/test1
+```
+Then, pack it with the script
+
+```sh
+bash experiments/dsp/pack-artefact.sh
+```
+
diff --git a/experiments/README.md b/experiments/README.md
@@ -0,0 +1,85 @@
+# Experiments
+
+Unpack the `artefact.zip` file. Then cd into the unpacked directory and execute
+the commands below.
+
+## Environment
+
+The experiments require
+- Internet access
+- At least 15G of disk space on the drive hosting this repository
+- An [OpenAI API](https://openai.com/index/openai-api/) key, for invoking the models.
+
+Set the `OPENAI_API_KEY` environment variable to the API key.
+
+### Building/Loading the container
+
+There are 3 ways to load the container. Choose one.
+- (Artefact reviewer) If you **have** the docker image:
+``` sh
+docker load --input pantograph.tar
+```
+- Use the docker repository
+``` sh
+docker pull chrysoberyl/pantograph
+```
+- Build the docker image from scratch
+``` sh
+docker build . --tag chrysoberyl/pantograph
+```
+
+The docker image's name **must be** `chrysoberyl/pantograph` for the shell
+scripts below to work.
+
+## Experiments
+
+The experiments are bound by limitations of OpenAI. Since OpenAI as a commercial
+company cannot indefinitely store all snapshots of their models, the experiments
+rely on OpenAI's provided version of `o1-preview` and `gpt-4o` models. This may
+impact the reproducibility of the experiments in the future.
+
+### Early Evaluation
+
+Due to the nature of this project and how Lean's build system works, the first
+pass needs to build the Mathlib library. This building of the library will take
+about 30 minutes to run.
+
+Execute in the project root directory
+``` sh
+experiments/dsp/early.sh
+```
+the results will be written to `result/`
+
+All experiments are cached, so if they crash halfway (which really should not
+happen) you can continue to run them and they'll pick up from where they left
+off. Hence you can delete the relevant folders in `result/` to force an
+experiment to restart from the beginning.
+
+### Plots for the paper
+
+To generate the plots for the paper, execute
+
+``` sh
+bash experiments/dsp/ref-plots.sh
+```
+
+which will output the plots in `result/ref-{o1,4o}/plot` based on the provided
+experiment result data in `result/ref-*`
+
+### GPT-4o experiment
+
+``` sh
+bash experiments/dsp/all-4o.sh
+bash experiments/dsp/plot-4o.sh
+```
+
+which will output the plots in `result/4o/plot`
+
+### o1-preview experiment
+
+``` sh
+bash experiments/dsp/all-o1.sh
+bash experiments/dsp/plot-o1.sh
+```
+
+which will output the plots in `result/o1/plot`
diff --git a/experiments/dsp/all-4o.sh b/experiments/dsp/all-4o.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+IMAGE=chrysoberyl/pantograph
+BASE_DIR=/data/result/4o
+main() {
+	docker run --rm -it \
+		--volume $PWD:/data \
+		--entrypoint poetry \
+		--env OPENAI_API_KEY=$OPENAI_API_KEY \
+		$IMAGE run \
+		python /data/experiments/dsp/main.py eval $@
+}
+main --dataset /data/experiments/dsp/ --dataset /data/experiments/minif2f/valid.jsonl --format minif2f --output $BASE_DIR/valid1
+main --dataset /data/experiments/dsp/ --dataset /data/experiments/minif2f/valid.jsonl --n-samples 3 --format minif2f --output $BASE_DIR/valid3
+main --dataset /data/experiments/dsp/ --dataset /data/experiments/minif2f/test.jsonl --format minif2f --output $BASE_DIR/test1
+main --dataset /data/experiments/dsp/ --dataset /data/experiments/minif2f/test.jsonl --n-samples 3 --format minif2f --output $BASE_DIR/test3
diff --git a/experiments/dsp/all-o1.sh b/experiments/dsp/all-o1.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+IMAGE=chrysoberyl/pantograph
+BASE_DIR=/data/result/o1
+
+main() {
+	docker run --rm -it \
+		--volume $PWD:/data \
+		--entrypoint poetry \
+		--env OPENAI_API_KEY=$OPENAI_API_KEY \
+		$IMAGE run \
+		python /data/experiments/dsp/main.py eval $@
+}
+main --dataset /data/experiments/dsp/ --dataset /data/experiments/minif2f/valid.jsonl --format minif2f --model o1-preview --output $BASE_DIR/valid1
+main --dataset /data/experiments/dsp/ --dataset /data/experiments/minif2f/test.jsonl --format minif2f --model o1-preview --output $BASE_DIR/test1
diff --git a/experiments/dsp/early.sh b/experiments/dsp/early.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+IMAGE=chrysoberyl/pantograph
+
+docker run --rm -it \
+	--volume $PWD:/data \
+	--entrypoint poetry \
+	--env OPENAI_API_KEY=$OPENAI_API_KEY \
+	$IMAGE run \
+	python /data/experiments/dsp/main.py eval --output /data/result/debug
diff --git a/experiments/dsp/lean_src_proj/LeanSrcProj.lean b/experiments/dsp/lean_src_proj/LeanSrcProj.lean
@@ -1 +1,2 @@
-def hello := "world"
+import Mathlib
+import Aesop
diff --git a/experiments/dsp/main.py b/experiments/dsp/main.py
@@ -1,4 +1,5 @@
 import sys, os, json, subprocess, time, datetime
+from abc import abstractmethod
 from pathlib import Path
 from dataclasses import asdict
 from typing import Union, Any, Tuple, Optional
@@ -39,6 +40,13 @@ def __init__(self):
     def __call__(self, *args, **kwards):
         pass
 
+    @abstractmethod
+    def sample_draft(self, prompt: str):
+        pass
+    @abstractmethod
+    def sample_sketch(self, prompt: str):
+        pass
+
 class OpenAI_DSP_Engine(Engine):
     def __init__(
             self,
@@ -88,6 +96,7 @@ def role_prompt(self) -> str:
 
     def sample_draft(self, prompt: str):
         extra = {} if self.model.startswith("o1") else dict(
+            seed=0,
             temperature=self.draft_sampling_params.temperature,
             top_p=self.draft_sampling_params.top_p,
             stop=self.draft_sampling_params.stop[:3],
@@ -103,6 +112,7 @@ def sample_draft(self, prompt: str):
         )
     def sample_sketch(self, prompt: str):
         extra = {} if self.model.startswith("o1") else dict(
+            seed=0,
             temperature=self.sketch_sampling_params.temperature,
             top_p=self.sketch_sampling_params.top_p,
         )
@@ -255,14 +265,20 @@ def single_proof_search_dsp_lean(
     ) -> DatumResult:
 
     start_time = time.time()
-    # -- Draft: [y_nl_pred_draft]_n ~ draft(eng, x_nl_prob, P_draft)
-    y_nl_pred_drafts = step_draft(eng, datum)
-
-    # -- Sketch: z_fl_pred_sketch ~ sketch(eng, x_nl_prob, [y_nl_pred_draft]_n, x_fl_prob, P_sketch)
-    z_fl_pred_sketches, x_fl_prob = step_sketch(eng, datum, y_nl_pred_drafts)
+    try:
+        # -- Draft: [y_nl_pred_draft]_n ~ draft(eng, x_nl_prob, P_draft)
+        y_nl_pred_drafts = step_draft(eng, datum)
 
-    assert len(z_fl_pred_sketches) == eng.sketch_sampling_params.n
+        # -- Sketch: z_fl_pred_sketch ~ sketch(eng, x_nl_prob, [y_nl_pred_draft]_n, x_fl_prob, P_sketch)
+        z_fl_pred_sketches, x_fl_prob = step_sketch(eng, datum, y_nl_pred_drafts)
 
+        assert len(z_fl_pred_sketches) == eng.sketch_sampling_params.n
+    except Exception as e:
+        print(colored(f"Failed to create sketch/draft: {e}", "red"))
+        return DatumResult(
+            name=str(datum),
+            error=str(e),
+        )
 
     results = []
     success = False
@@ -273,17 +289,25 @@ def single_proof_search_dsp_lean(
         try:
             server = server_func()
         except Exception as e:
-            print(colored("Failed to create server: {e}", "red"))
+            print(colored(f"Failed to create server: {e}", "red"))
             return DatumResult(
                 name=str(datum),
                 error=str(e),
             )
-        # -- Prove: y_fl = prove(eng, x_fl_prob, z_fl_pred_sketches)
-        prove_result = step_prove(eng, server, x_fl_prob, sketch)
-        results.append(prove_result)
-        if isinstance(prove_result, SearchResult) and prove_result.success:
-            success = True
-            break
+        try:
+            # -- Prove: y_fl = prove(eng, x_fl_prob, z_fl_pred_sketches)
+            prove_result = step_prove(eng, server, x_fl_prob, sketch)
+            results.append(prove_result)
+            if isinstance(prove_result, SearchResult) and prove_result.success:
+                success = True
+                break
+        except Exception as e:
+            print(colored(f"Search failed: {e}", "red"))
+            return DatumResult(
+                name=str(datum),
+                error=str(e),
+            )
+
     duration = time.time() - start_time
 
     return DatumResult(
@@ -329,11 +353,6 @@ def full_proof_search_dsp_lean(
 
 experiment_dir = Path(__file__).resolve().parent
 
-def get_project_and_lean_path():
-    cwd = experiment_dir / 'lean_src_proj'
-    p = subprocess.check_output(['lake', 'env', 'printenv', 'LEAN_PATH'], cwd=cwd)
-    return cwd, p
-
 def load_data(args) -> list[Datum]:
     p = Path(args.dataset).expanduser()
     data = None
@@ -363,13 +382,21 @@ def main(args):
     path_output = Path(args.output)
     path_output.mkdir(exist_ok=True, parents=True)
 
+    project_path = experiment_dir / 'lean_src_proj'
+
+    print("Building src project ...")
+    while True:
+        # Lean sometimes fails to build. Try until it succeeds.
+        completion = subprocess.run(['lake', 'build'], cwd=project_path, check=False)
+        if completion.returncode == 0:
+            break;
+    print("Built src project")
+
     # Start server
-    project_path, lean_path = get_project_and_lean_path()
     def server_func():
         return Server(
             imports=["Mathlib", "Aesop"],
             project_path=project_path,
-            lean_path=lean_path,
             core_options=DEFAULT_CORE_OPTIONS,
         )
 

diff --git a/experiments/dsp/pack-artefact.sh b/experiments/dsp/pack-artefact.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Packs the artefact
+# Execute this at the project root directory to produce `artefact.zip`
+
+echo "Packing Git archive"
+git-archive-all artefact.zip
+echo "Packing docker image"
+docker image save chrysoberyl/pantograph > pantograph.tar
+echo "Adding experimental results"
+zip -ur artefact.zip result/ref-4o/{valid1,valid3,test1,test3} result/ref-o1/{valid1,valid3} pantograph.tar
diff --git a/experiments/dsp/plot-4o.sh b/experiments/dsp/plot-4o.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env sh
+
+IMAGE=chrysoberyl/pantograph
+BASE_DIR=/data/result/4o
+
+plot() {
+	docker run --rm -it \
+		--volume $PWD:/data \
+		--entrypoint poetry \
+		--env OPENAI_API_KEY=$OPENAI_API_KEY \
+		$IMAGE run \
+		python /data/experiments/dsp/plot.py $@
+}
+plot --result $BASE_DIR/{valid1,valid3,test1,test3} --names valid1 valid3 test1 test3 --plot-output $BASE_DIR/plot
diff --git a/experiments/dsp/plot-o1.sh b/experiments/dsp/plot-o1.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env sh
+
+IMAGE=chrysoberyl/pantograph
+BASE_DIR=/data/result/o1
+
+plot() {
+	docker run --rm -it \
+		--volume $PWD:/data \
+		--entrypoint poetry \
+		--env OPENAI_API_KEY=$OPENAI_API_KEY \
+		$IMAGE run \
+		python /data/experiments/dsp/plot.py $@
+}
+plot --result $BASE_DIR/{valid1,test1} --names valid1 test1 --plot-output $BASE_DIR/plot
diff --git a/experiments/dsp/ref-plots.sh b/experiments/dsp/ref-plots.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env sh
+
+IMAGE=chrysoberyl/pantograph
+plot() {
+	docker run --rm -it \
+		--volume $PWD:/data \
+		--entrypoint poetry \
+		--env OPENAI_API_KEY=$OPENAI_API_KEY \
+		$IMAGE run \
+		python /data/experiments/dsp/plot.py $@
+}
+
+BASE_DIR=/data/result/ref-4o
+plot --result $BASE_DIR/{valid1,test1,valid3,test3} --names valid1 test1 valid3 test3 --plot-output $BASE_DIR/plot
+BASE_DIR=/data/result/ref-o1
+plot --result $BASE_DIR/{valid1,test1} --names valid1 test1 --plot-output $BASE_DIR/plot