
Commit d99c1c4

Merge pull request SWE-bench#178 from princeton-nlp/add-schema-version
Add schema version to report card
2 parents: 7a5729d + 44482e2

File tree

2 files changed: +30 -1 lines changed

swebench/harness/run_evaluation.py
tests/test_evaluation.py


swebench/harness/run_evaluation.py

Lines changed: 6 additions & 1 deletion
@@ -357,7 +357,7 @@ def make_run_report(
         full_dataset: list,
         client: docker.DockerClient,
         run_id: str
-    ):
+    ) -> Path:
     """
     Make a final evaluation and run report of the instances that have been run.
     Also reports on images and containers that may still running!
@@ -367,6 +367,9 @@ def make_run_report(
         full_dataset (list): List of all instances
         client (docker.DockerClient): Docker client
         run_id (str): Run ID
+
+    Returns:
+        Path to report file
     """
     # instantiate sets to store IDs of different outcomes
     completed_ids = set()
@@ -453,6 +456,7 @@ def make_run_report(
         "error_ids": list(sorted(error_ids)),
         "unstopped_containers": list(sorted(unstopped_containers)),
         "unremoved_images": list(sorted(unremoved_images)),
+        "schema_version": 2,
     }
     report_file = Path(
         list(predictions.values())[0]["model_name_or_path"].replace("/", "__")
@@ -462,6 +466,7 @@ def make_run_report(
     with open(report_file, "w") as f:
         print(json.dumps(report, indent=4), file=f)
     print(f"Report written to {report_file}")
+    return report_file
 
 
 def get_gold_predictions(dataset_name: str, split: str):
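
For context, a minimal sketch of how downstream code might consume the Path now returned by make_run_report together with the new schema_version field. The helper name load_run_report and the fallback to version 1 for reports written before this change are illustrative assumptions, not part of this commit.

import json
from pathlib import Path

EXPECTED_SCHEMA_VERSION = 2  # the value written by make_run_report in this commit

def load_run_report(report_file: Path) -> dict:
    # Hypothetical consumer-side check: read the report produced by
    # make_run_report and verify its schema version before using it.
    report = json.loads(report_file.read_text())
    # Reports written before this commit carry no "schema_version" key,
    # so a missing field is treated as version 1 (an assumption, not in the source).
    version = report.get("schema_version", 1)
    if version != EXPECTED_SCHEMA_VERSION:
        raise ValueError(f"Unsupported run report schema version: {version}")
    return report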

tests/test_evaluation.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+import collections
+import json
+import docker
+
+from swebench.harness.run_evaluation import make_run_report
+
+TEST_INSTANCE = collections.defaultdict(lambda: "test")
+TEST_INSTANCE["PASS_TO_PASS"] = '[]'
+TEST_INSTANCE["repo"] = 'pvlib/pvlib-python'
+TEST_INSTANCE["version"] = '0.1'
+TEST_INSTANCE["FAIL_TO_PASS"] = '[]'
+
+def test_make_run_report(tmpdir) -> None:
+    client = docker.from_env()
+    with tmpdir.as_cwd():
+        output_path = make_run_report(
+            {"test": {"instance_id": "test", "model_name_or_path": "test"}},
+            [TEST_INSTANCE],
+            client,
+            "test"
+        )
+        assert output_path.is_file()
+        report = json.loads(output_path.read_text())
+        assert report["schema_version"] == 2
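
A note on the fixture style used above: collections.defaultdict(lambda: "test") hands the harness an instance in which any field the code looks up but that is not set explicitly falls back to the placeholder string "test". A small sketch of that behavior (variable names here are illustrative only):

import collections

instance = collections.defaultdict(lambda: "test")
instance["repo"] = "pvlib/pvlib-python"

assert instance["repo"] == "pvlib/pvlib-python"  # explicitly set field
assert instance["instance_id"] == "test"         # any unset key falls back to the placeholder

The test can then be run with pytest from the repository root, provided a local Docker daemon is available for docker.from_env().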
