From 6dba8b022feb57296216764f0407a5d97a8da993 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:02:12 -0800 Subject: [PATCH 01/20] validate_patched_files_syntax --- utils/diff.py | 90 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 32 deletions(-) diff --git a/utils/diff.py b/utils/diff.py index 216d92175..8599437df 100644 --- a/utils/diff.py +++ b/utils/diff.py @@ -1,5 +1,6 @@ """Utilities for computing diffs between files.""" +import ast import os import tempfile import subprocess @@ -8,15 +9,14 @@ from typing import Tuple, Optional - def get_file_diff(old_path, new_path) -> str: """ Gets the diff between two files. - + Args: old_path: The path to the old file new_path: The path to the new file - + Returns: The diff between the two files, expressed as a diff of the old file, as a string. """ @@ -28,13 +28,9 @@ def get_file_diff(old_path, new_path) -> str: missing.append(new_path) if missing: logger.fatal(f"File(s) not found for diff: {', '.join(missing)}") - + # Use diff command - result = subprocess.run( - ["diff", "-u", old_path, new_path], - capture_output=True, - text=True - ) + result = subprocess.run(["diff", "-u", old_path, new_path], capture_output=True, text=True) # Check if the diff was generated successfully # `diff -u` return codes: @@ -53,39 +49,33 @@ def get_file_diff(old_path, new_path) -> str: filename = os.path.basename(old_path) lines[0] = f"--- {filename}" lines[1] = f"+++ {filename}" - - return "\n".join(lines) + return "\n".join(lines) def validate_diff_for_local_repo(diff, local_repo_dir) -> Tuple[bool, Optional[str]]: """ Validates if a diff string is valid and can be applied to a local repository. - + Args: diff: The diff string to validate local_repo_dir: The local repository directory - + Returns: (is_valid: bool, error_message: Optional[str]) """ - + # Write diff to temp file with tempfile.NamedTemporaryFile(mode="w", suffix=".diff", delete=False) as f: f.write(diff) diff_file = f.name - + # Use `git apply --check` to validate without applying - result = subprocess.run( - ["git", "apply", "--check", diff_file], - cwd=local_repo_dir, - capture_output=True, - text=True - ) + result = subprocess.run(["git", "apply", "--check", diff_file], cwd=local_repo_dir, capture_output=True, text=True) # Delete the temp file os.unlink(diff_file) - + # Check if the diff was applied successfully if result.returncode == 0: return True, None @@ -93,11 +83,10 @@ def validate_diff_for_local_repo(diff, local_repo_dir) -> Tuple[bool, Optional[s return False, result.stderr.strip() - def apply_diff_to_local_repo(diff, local_repo_dir) -> None: """ Applies a diff string to files in the source directory. 
- + Args: diff: The diff string to apply local_repo_dir: The local repository directory @@ -107,18 +96,55 @@ def apply_diff_to_local_repo(diff, local_repo_dir) -> None: with tempfile.NamedTemporaryFile(mode="w", suffix=".diff", delete=False) as f: f.write(diff) diff_file = f.name - + # Use `git apply` to apply the diff - result = subprocess.run( - ["git", "apply", diff_file], - cwd=local_repo_dir, - capture_output=True, - text=True - ) + result = subprocess.run(["git", "apply", diff_file], cwd=local_repo_dir, capture_output=True, text=True) # Delete the temp file os.unlink(diff_file) # Check if the diff was applied successfully if result.returncode != 0: - logger.fatal(f"Failed to apply diff to {local_repo_dir}: {result.stderr.strip()}") \ No newline at end of file + logger.fatal(f"Failed to apply diff to {local_repo_dir}: {result.stderr.strip()}") + + +def validate_patched_files_syntax(repo_dir: str) -> Tuple[bool, Optional[str]]: + """ + After a patch has been applied, check that modified files have valid syntax. + Supports Python (.py) and JavaScript (.js, .mjs) files. + + Args: + repo_dir: The repository directory where the patch was applied + + Returns: + (is_valid: bool, error_message: Optional[str]) + """ + result = subprocess.run(["git", "diff", "--name-only"], cwd=repo_dir, capture_output=True, text=True) + modified_files = [f.strip() for f in result.stdout.strip().splitlines() if f.strip()] + + errors = [] + for filepath in modified_files: + full_path = os.path.join(repo_dir, filepath) + if not os.path.exists(full_path): + continue + + if filepath.endswith(".py"): + try: + with open(full_path, "r") as f: + source = f.read() + ast.parse(source, filename=filepath) + except SyntaxError as e: + errors.append(f"{filepath}:{e.lineno}: {e.msg}") + + elif filepath.endswith((".js", ".mjs")): + with open(full_path, "r") as f: + source = f.read() + result = subprocess.run( + ["node", "--input-type=module", "--check"], input=source, capture_output=True, text=True + ) + if result.returncode != 0: + errors.append(f"{filepath}: {result.stderr.strip()}") + + if errors: + return False, "Patched files have syntax errors:\n" + "\n".join(errors) + return True, None From 3e881467e943b6858417d97bfbd1625f1dddfef9 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:02:39 -0800 Subject: [PATCH 02/20] syntax check in polyglot suite --- evaluator/problem_suites/polyglot/polyglot_suite.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/evaluator/problem_suites/polyglot/polyglot_suite.py b/evaluator/problem_suites/polyglot/polyglot_suite.py index bd9db98ac..1c8fa9d85 100644 --- a/evaluator/problem_suites/polyglot/polyglot_suite.py +++ b/evaluator/problem_suites/polyglot/polyglot_suite.py @@ -18,7 +18,7 @@ from utils.git import init_local_repo_with_initial_commit from evaluator.sandbox.sandbox_manager import SandboxManager from evaluator.problem_suites.problem_suite import ProblemSuite, ProblemSuiteName -from utils.diff import get_file_diff, apply_diff_to_local_repo, validate_diff_for_local_repo +from utils.diff import get_file_diff, apply_diff_to_local_repo, validate_diff_for_local_repo, validate_patched_files_syntax @@ -147,7 +147,13 @@ def _on_mount(temp_dir: str): # Apply the patch apply_diff_to_local_repo(patch, sandbox_repo_dir) - + # Syntax-check the patched files + is_valid, error_message = validate_patched_files_syntax(sandbox_repo_dir) + if not is_valid: + raise EvaluationRunException( + EvaluationRunErrorCode.AGENT_INVALID_PATCH, + 
f"{EvaluationRunErrorCode.AGENT_INVALID_PATCH.get_error_message()}: {error_message}" + ) return sandbox_manager.initialize_sandbox( name=f"eval-sandbox-{problem.name}-{evaluation_run_id}", From f5d004f60f1693912e12e5b192ee184d10719d71 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:02:56 -0800 Subject: [PATCH 03/20] syntax check in swebench --- .../swebench_verified/swebench_verified_suite.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/evaluator/problem_suites/swebench_verified/swebench_verified_suite.py b/evaluator/problem_suites/swebench_verified/swebench_verified_suite.py index 8d3e4742c..900a210c5 100644 --- a/evaluator/problem_suites/swebench_verified/swebench_verified_suite.py +++ b/evaluator/problem_suites/swebench_verified/swebench_verified_suite.py @@ -11,7 +11,7 @@ from pydantic import BaseModel from utils.docker import get_docker_client from typing import Any, Dict, List, Tuple, Optional -from utils.diff import validate_diff_for_local_repo +from utils.diff import validate_diff_for_local_repo, apply_diff_to_local_repo, validate_patched_files_syntax from evaluator.models import EvaluationRunException from swebench.harness.constants import SWEbenchInstance from utils.temp import create_temp_dir, delete_temp_dir @@ -184,7 +184,14 @@ def initialize_eval_sandbox( f"{EvaluationRunErrorCode.AGENT_INVALID_PATCH.get_error_message()}: {error_message}" ) - + # Syntax-check the patched files + apply_diff_to_local_repo(patch, temp_dir) + is_valid, error_message = validate_patched_files_syntax(temp_dir) + if not is_valid: + raise EvaluationRunException( + EvaluationRunErrorCode.AGENT_INVALID_PATCH, + f"{EvaluationRunErrorCode.AGENT_INVALID_PATCH.get_error_message()}: {error_message}" + ) swebench_instance = problem.userdata From bc7f837236b4769fad7063515122f7801c2fb194 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:03:29 -0800 Subject: [PATCH 04/20] add dev validator --- utils/validator_hotkeys.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/validator_hotkeys.py b/utils/validator_hotkeys.py index 79600af18..e2ee28191 100644 --- a/utils/validator_hotkeys.py +++ b/utils/validator_hotkeys.py @@ -17,7 +17,8 @@ {"name": "Alex's Validator (1)", "hotkey": "5HpMvcM593HmizCA3ARLNifxjPSLbN3M5RHYy4GiEqmB3x9n"}, {"name": "Alex's Validator (2)", "hotkey": "5HNpAXVzWaW4yD9UqH5sXFPt1gPFqNTViDy61NdiViyDQiTQ"}, {"name": "Alex's Validator (3)", "hotkey": "5GgqnYQ3QwnCcmxiGatXS3rrHGmkqU3cMSjQFSdLKHDmxyB6"}, - {"name": "Shak's Validator", "hotkey": "5F26aNVC3rZVNbH36DWdZzxPVH17iBNGD14Wtb4nQem742Q7"} + {"name": "Shak's Validator", "hotkey": "5F26aNVC3rZVNbH36DWdZzxPVH17iBNGD14Wtb4nQem742Q7"}, + {"name": "Abe's Validator", "hotkey": "5G699LghHWA18yEPq8NpX9gYi8ZDM3fy2BJvSvYWqtt2DHGE"} ] def is_validator_hotkey_whitelisted(validator_hotkey: str) -> bool: From 9d59eee18da8d576602e3184460e7e115fc140b1 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:03:40 -0800 Subject: [PATCH 05/20] add skipped status --- models/evaluation_run.py | 1 + 1 file changed, 1 insertion(+) diff --git a/models/evaluation_run.py b/models/evaluation_run.py index 891c28f7c..923408f39 100644 --- a/models/evaluation_run.py +++ b/models/evaluation_run.py @@ -60,6 +60,7 @@ class EvaluationRunStatus(str, Enum): running_eval = 'running_eval' finished = 'finished' error = 'error' + skipped = 'skipped' From e2c4b15cbed0de7a817257b28b7af340d00b79e6 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 
18:04:10 -0800 Subject: [PATCH 06/20] cleanup_sandbox --- evaluator/sandbox/sandbox_manager.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/evaluator/sandbox/sandbox_manager.py b/evaluator/sandbox/sandbox_manager.py index cbc5ee639..2e641dd87 100644 --- a/evaluator/sandbox/sandbox_manager.py +++ b/evaluator/sandbox/sandbox_manager.py @@ -160,6 +160,20 @@ def initialize_sandbox( + def cleanup_sandbox(self, sandbox: Sandbox): + """Clean up a sandbox's container and temp directory.""" + try: + sandbox.container.stop() + sandbox.container.remove() + except Exception: + pass + try: + delete_temp_dir(sandbox.temp_dir) + except Exception: + pass + + + def run_sandbox( self, sandbox: Sandbox From e74052337a85ce43fe2a434d2fbf376a544f0dd0 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:05:21 -0800 Subject: [PATCH 07/20] update EvaluationRunStatus with skipped enum --- api/src/backend/postgres_schema.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/api/src/backend/postgres_schema.sql b/api/src/backend/postgres_schema.sql index ba39ffa29..34bba24d0 100644 --- a/api/src/backend/postgres_schema.sql +++ b/api/src/backend/postgres_schema.sql @@ -27,7 +27,8 @@ BEGIN 'initializing_eval', 'running_eval', 'finished', - 'error' + 'error', + 'skipped' ); END IF; From 96a42b59f82a4834d1576fc907f9ac1e83d30261 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:06:11 -0800 Subject: [PATCH 08/20] handle skipped status for existing dbs --- api/src/backend/postgres_schema.sql | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/api/src/backend/postgres_schema.sql b/api/src/backend/postgres_schema.sql index 34bba24d0..9ff68d305 100644 --- a/api/src/backend/postgres_schema.sql +++ b/api/src/backend/postgres_schema.sql @@ -532,3 +532,14 @@ CREATE TRIGGER tr_refresh_agent_scores_unapproved_agent_ids AFTER INSERT OR UPDATE OR DELETE ON unapproved_agent_ids FOR EACH ROW EXECUTE PROCEDURE refresh_agent_scores(); + +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum + WHERE enumlabel = 'skipped' + AND enumtypid = (SELECT oid FROM pg_type WHERE typname = 'evaluationrunstatus') + ) THEN + ALTER TYPE EvaluationRunStatus ADD VALUE 'skipped'; + END IF; +END $$; From 28cb5cfc640fd0040b84f09c5e756af43d43d3be Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:13:20 -0800 Subject: [PATCH 09/20] update evaluations_hydrated view --- api/src/backend/postgres_schema.sql | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/api/src/backend/postgres_schema.sql b/api/src/backend/postgres_schema.sql index 9ff68d305..8d87496eb 100644 --- a/api/src/backend/postgres_schema.sql +++ b/api/src/backend/postgres_schema.sql @@ -229,8 +229,9 @@ CREATE OR REPLACE VIEW evaluations_hydrated AS SELECT evaluations.*, (CASE - WHEN EVERY(erh.status = 'finished' OR (erh.status = 'error' AND erh.error_code BETWEEN 1000 AND 1999)) THEN 'success' - WHEN EVERY(erh.status IN ('finished', 'error')) THEN 'failure' + WHEN evaluations.evaluation_set_group IN ('screener_1', 'screener_2') AND bool_or(erh.error_code = 1040) THEN 'failure' + WHEN EVERY(erh.status IN ('finished', 'skipped') OR (erh.status = 'error' AND erh.error_code BETWEEN 1000 AND 1999)) THEN 'success' + WHEN EVERY(erh.status IN ('finished', 'error', 'skipped')) THEN 'failure' ELSE 'running' END)::EvaluationStatus AS status, COUNT(*) FILTER (WHERE erh.solved)::float / COUNT(*) AS score From a042097ec5fe689cdf05e54fe8125071a02f6c44 Mon Sep 17 00:00:00 2001 From: 
ibraheem-latent Date: Thu, 12 Feb 2026 18:13:52 -0800 Subject: [PATCH 10/20] add status evaluation logic in comments --- api/src/backend/postgres_schema.sql | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/api/src/backend/postgres_schema.sql b/api/src/backend/postgres_schema.sql index 8d87496eb..69baf2165 100644 --- a/api/src/backend/postgres_schema.sql +++ b/api/src/backend/postgres_schema.sql @@ -225,6 +225,13 @@ FROM evaluation_runs; -- Second view: Evaluations hydrated view -- Evaluations with aggregated status and average score +-- +-- Status logic: +-- 1. 'failure' — Syntax penalty: screener evaluation where any run hit AGENT_INVALID_PATCH (1040). Agent is penalized. +-- 2. 'success' — Clean completion: every run finished, was skipped, or errored with an agent-level error (1000-1999). +-- The evaluation infra worked; score the agent normally. +-- 3. 'failure' — Infra failure: all runs are done but at least one had a non-agent error (2000+). Re-queue. +-- 4. 'running' — Some runs are still in progress. CREATE OR REPLACE VIEW evaluations_hydrated AS SELECT evaluations.*, From 1795b6ed16664bd8589c2cd1fcba760b85d0e7b5 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:15:56 -0800 Subject: [PATCH 11/20] ValidatorRequestEvaluationResponse now has pass_threshold for screeners --- api/endpoints/validator_models.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/api/endpoints/validator_models.py b/api/endpoints/validator_models.py index 2ddda50bb..5a4398c4e 100644 --- a/api/endpoints/validator_models.py +++ b/api/endpoints/validator_models.py @@ -47,6 +47,7 @@ class ValidatorRequestEvaluationResponseEvaluationRun(BaseModel): # :( class ValidatorRequestEvaluationResponse(BaseModel): agent_code: str evaluation_runs: List[ValidatorRequestEvaluationResponseEvaluationRun] + pass_threshold: Optional[float] = None # None for validators @@ -74,6 +75,13 @@ class ValidatorUpdateEvaluationRunRequest(BaseModel): class ValidatorUpdateEvaluationRunResponse(BaseModel): pass +# Models for new endpoint that mark a run as skipped when screener (vali) cancels +class ValidatorSkipEvaluationRunRequest(BaseModel): + evaluation_run_id: UUID + +class ValidatorSkipEvaluationRunResponse(BaseModel): + pass + class ValidatorDisconnectRequest(BaseModel): From de02f7e57c1e65b84fbf9feeb0d776962de6480f Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:16:23 -0800 Subject: [PATCH 12/20] validator_request_evaluation should pass threshold to screeners --- api/endpoints/validator.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/api/endpoints/validator.py b/api/endpoints/validator.py index 6e1adc87c..62c0bcf74 100644 --- a/api/endpoints/validator.py +++ b/api/endpoints/validator.py @@ -352,7 +352,14 @@ async def validator_request_evaluation( agent_code = await download_text_file_from_s3(f"{agent_id}/agent.py") evaluation_runs = [ValidatorRequestEvaluationResponseEvaluationRun(evaluation_run_id=evaluation_run.evaluation_run_id, problem_name=evaluation_run.problem_name) for evaluation_run in evaluation_runs] - return ValidatorRequestEvaluationResponse(agent_code=agent_code, evaluation_runs=evaluation_runs) + # Determine pass threshold for screeners (None for validators) + pass_threshold = None + if validator.current_agent.status == AgentStatus.screening_1: + pass_threshold = config.SCREENER_1_THRESHOLD + elif validator.current_agent.status == AgentStatus.screening_2: + pass_threshold = config.SCREENER_2_THRESHOLD + + return 
ValidatorRequestEvaluationResponse(agent_code=agent_code, evaluation_runs=evaluation_runs, pass_threshold=pass_threshold) From d3838d962a79a9961a9a87429b312767f0a04462 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:19:11 -0800 Subject: [PATCH 13/20] add /skip-evaluation-run endpoint --- api/endpoints/validator.py | 46 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/api/endpoints/validator.py b/api/endpoints/validator.py index 62c0bcf74..a5546fb75 100644 --- a/api/endpoints/validator.py +++ b/api/endpoints/validator.py @@ -613,6 +613,52 @@ async def validator_disconnect( +# /validator/skip-evaluation-run +# Used to mark an evaluation run as skipped when a screener (validator) cancels the evaluation +@router.post("/skip-evaluation-run") +@handle_validator_http_exceptions +async def validator_skip_evaluation_run( + request: ValidatorSkipEvaluationRunRequest, + validator: Validator = Depends(get_request_validator_with_lock) +) -> ValidatorSkipEvaluationRunResponse: + """Mark an evaluation run as skipped (early termination).""" + + if validator.current_evaluation_id is None: + raise HTTPException( + status_code=409, + detail="This validator is not currently running an evaluation, and therefore cannot skip an evaluation run." + ) + + evaluation_run = await get_evaluation_run_by_id(request.evaluation_run_id) + + if evaluation_run is None: + raise HTTPException( + status_code=404, + detail=f"Evaluation run with ID {request.evaluation_run_id} does not exist." + ) + + if evaluation_run.evaluation_id != validator.current_evaluation_id: + raise HTTPException( + status_code=403, + detail=f"The evaluation run with ID {request.evaluation_run_id} is not associated with the validator's current evaluation." + ) + + # Ensure evaluation is not terminal + if evaluation_run.status in (EvaluationRunStatus.finished, EvaluationRunStatus.error, EvaluationRunStatus.skipped): + logger.info(f"Validator '{validator.name}' skip-evaluation-run called on terminal run (status={evaluation_run.status})") + return ValidatorSkipEvaluationRunResponse() + + evaluation_run.status = EvaluationRunStatus.skipped + evaluation_run.finished_or_errored_at = datetime.now(timezone.utc) + await update_evaluation_run_by_id(evaluation_run) + + logger.info(f"Validator '{validator.name}' skipped an evaluation run") + logger.info(f" Evaluation run ID: {request.evaluation_run_id}") + + return ValidatorSkipEvaluationRunResponse() + + + # /validator/finish-evaluation @router.post("/finish-evaluation") @handle_validator_http_exceptions From df3d881288c244719688084bebfd32886abad14f Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:20:41 -0800 Subject: [PATCH 14/20] handle skipped in /finish-evaluation --- api/endpoints/validator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/api/endpoints/validator.py b/api/endpoints/validator.py index a5546fb75..02997975a 100644 --- a/api/endpoints/validator.py +++ b/api/endpoints/validator.py @@ -674,12 +674,12 @@ async def validator_finish_evaluation( detail="This validator is not currently running an evaluation, and therefore cannot request to finish an evaluation." 
) - # Make sure that all evaluation runs have either finished or errored + # Make sure that all evaluation runs have either finished, errored, or been skipped evaluation_runs = await get_all_evaluation_runs_in_evaluation_id(validator.current_evaluation_id) - if any(evaluation_run.status not in [EvaluationRunStatus.finished, EvaluationRunStatus.error] for evaluation_run in evaluation_runs): + if any(evaluation_run.status not in [EvaluationRunStatus.finished, EvaluationRunStatus.error, EvaluationRunStatus.skipped] for evaluation_run in evaluation_runs): raise HTTPException( status_code=409, - detail="Not all evaluation runs associated with the evaluation that this validator is currently running have either finished or errored. Did you forget to send an update-evaluation-run?" + detail="Not all evaluation runs associated with the evaluation that this validator is currently running have finished, errored, or been skipped. Did you forget to send an update-evaluation-run?" ) From 5b6dff5b7c50ddb4c3bb460c96dc897aa868ddf3 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:22:55 -0800 Subject: [PATCH 15/20] update handle_evaluation_if_finished --- api/endpoints/validator.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/api/endpoints/validator.py b/api/endpoints/validator.py index 02997975a..d1e64e38f 100644 --- a/api/endpoints/validator.py +++ b/api/endpoints/validator.py @@ -22,7 +22,8 @@ get_all_evaluation_runs_in_evaluation_id, create_evaluation_run_log, check_if_evaluation_run_logs_exist from models.agent import Agent, AgentStatus from models.evaluation import Evaluation, EvaluationStatus -from models.evaluation_run import EvaluationRunStatus, EvaluationRunLogType +from models.evaluation_run import EvaluationRunStatus, EvaluationRunLogType, EvaluationRunErrorCode +from models.evaluation_set import EvaluationSetGroup from models.problem import ProblemTestResult from utils.bittensor import validate_signed_timestamp from utils.s3 import download_text_file_from_s3 @@ -772,4 +773,19 @@ async def handle_evaluation_if_finished(evaluation_id: UUID) -> None: # raise ValueError(f"Invalid agent status: {agent.status}, this should never happen") return - await update_agent_status(hydrated_evaluation.agent_id, new_agent_status) \ No newline at end of file + await update_agent_status(hydrated_evaluation.agent_id, new_agent_status) + + elif hydrated_evaluation.status == EvaluationStatus.failure: + if hydrated_evaluation.evaluation_set_group in (EvaluationSetGroup.screener_1, EvaluationSetGroup.screener_2): + evaluation_runs = await get_all_evaluation_runs_in_evaluation_id(evaluation_id) + has_syntax_penalty = any( + run.error_code is not None and run.error_code == EvaluationRunErrorCode.AGENT_INVALID_PATCH + for run in evaluation_runs + ) + + if has_syntax_penalty: + agent = await get_agent_by_id(hydrated_evaluation.agent_id) + if agent.status == AgentStatus.screening_1: + await update_agent_status(hydrated_evaluation.agent_id, AgentStatus.failed_screening_1) + elif agent.status == AgentStatus.screening_2: + await update_agent_status(hydrated_evaluation.agent_id, AgentStatus.failed_screening_2) From b9bc7a0d23f9a7eaa8b4934035476b6e06a1f0fd Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:31:14 -0800 Subject: [PATCH 16/20] adds RunOutcome response struct --- validator/main.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/validator/main.py b/validator/main.py index 1a542bcc5..436694b62 100644 --- a/validator/main.py +++ 
b/validator/main.py @@ -12,6 +12,7 @@ import utils.logger as logger import validator.config as config +from dataclasses import dataclass from typing import Any, Dict from api.endpoints.validator_models import * from models.problem import ProblemTestResultStatus @@ -39,6 +40,13 @@ sandbox_manager = None problem_suites = [] +# Result from a single evaluation run, +# Used to decide whether to cancel remaining runs +@dataclass +class RunOutcome: + solved: bool = False # Did all tests pass? + had_syntax_error: bool = False # Was there an AGENT_INVALID_PATCH error? + # Disconnect from the Ridges platform (called when the program exits) From 402ef925a09a08409cf6e77d10f0f4c0982d0dc9 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:42:56 -0800 Subject: [PATCH 17/20] add skip_evaluation_run in orchaestrator --- validator/main.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/validator/main.py b/validator/main.py index 436694b62..1c5d1c172 100644 --- a/validator/main.py +++ b/validator/main.py @@ -107,6 +107,13 @@ async def update_evaluation_run(evaluation_run_id: UUID, problem_name: str, upda ), bearer_token=session_id, quiet=2) +async def skip_evaluation_run(evaluation_run_id: UUID, problem_name: str): + logger.info(f"Skipping evaluation run {evaluation_run_id} for problem {problem_name} (early termination)...") + + await post_ridges_platform("/validator/skip-evaluation-run", ValidatorSkipEvaluationRunRequest( + evaluation_run_id=evaluation_run_id + ), bearer_token=session_id, quiet=2) + # Truncates a log if required def truncate_logs_if_required(log: str) -> str: From 9e02718564a165b5df91fb0b3337265859f69bfb Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:43:41 -0800 Subject: [PATCH 18/20] sim run eval update --- validator/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/validator/main.py b/validator/main.py index 1c5d1c172..f0265568c 100644 --- a/validator/main.py +++ b/validator/main.py @@ -124,7 +124,7 @@ def truncate_logs_if_required(log: str) -> str: # Simulate a run of an evaluation run, useful for testing, set SIMULATE_EVALUATION_RUNS=True in .env -async def _simulate_run_evaluation_run(evaluation_run_id: UUID, problem_name: str): +async def _simulate_run_evaluation_run(evaluation_run_id: UUID, problem_name: str) -> RunOutcome: logger.info(f"Starting simulated evaluation run {evaluation_run_id} for problem {problem_name}...") @@ -158,6 +158,7 @@ async def _simulate_run_evaluation_run(evaluation_run_id: UUID, problem_name: st logger.info(f"Finished simulated evaluation run {evaluation_run_id} for problem {problem_name}") + return RunOutcome(solved=True) From 8de715bbcbdd6683c1c2c4cd11c8f6ac33c9bb64 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:47:57 -0800 Subject: [PATCH 19/20] update _run_evaluation_run - track syntax error outcome - Track pass_threshold - track tasks as they run - Stop eval if threshold to pass is impossible + syntax error penalty --- validator/main.py | 95 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 86 insertions(+), 9 deletions(-) diff --git a/validator/main.py b/validator/main.py index f0265568c..5401afd1a 100644 --- a/validator/main.py +++ b/validator/main.py @@ -163,7 +163,7 @@ async def _simulate_run_evaluation_run(evaluation_run_id: UUID, problem_name: st # Run an evaluation run -async def _run_evaluation_run(evaluation_run_id: UUID, problem_name: str, agent_code: str): +async def _run_evaluation_run(evaluation_run_id: UUID, 
problem_name: str, agent_code: str) -> RunOutcome: try: # Figure out what problem suite this problem belongs to problem_suite = next((suite for suite in problem_suites if suite.has_problem_name(problem_name)), None) @@ -174,7 +174,7 @@ async def _run_evaluation_run(evaluation_run_id: UUID, problem_name: str, agent_ "error_code": EvaluationRunErrorCode.VALIDATOR_UNKNOWN_PROBLEM.value, "error_message": f"The problem '{problem_name}' was not found in any problem suite" }) - return + return RunOutcome() # Get the problem problem = problem_suite.get_problem(problem_name) @@ -183,7 +183,9 @@ async def _run_evaluation_run(evaluation_run_id: UUID, problem_name: str, agent_ logger.info(f"Starting evaluation run {evaluation_run_id} for problem {problem_name}...") - + outcome = RunOutcome() + agent_sandbox = None + eval_sandbox = None try: # Move from pending -> initializing_agent @@ -247,6 +249,19 @@ async def _run_evaluation_run(evaluation_run_id: UUID, problem_name: str, agent_ "eval_logs": truncate_logs_if_required(eval_logs) }) + outcome.solved = num_passed == len(test_results) and len(test_results) > 0 + + except asyncio.CancelledError: + logger.info(f"Evaluation run {evaluation_run_id} for problem {problem_name} cancelled; cleaning up sandboxes") + + if sandbox_manager is not None: + if agent_sandbox is not None: + await asyncio.shield(asyncio.to_thread(sandbox_manager.cleanup_sandbox, agent_sandbox)) + if eval_sandbox is not None: + await asyncio.shield(asyncio.to_thread(sandbox_manager.cleanup_sandbox, eval_sandbox)) + + raise + except EvaluationRunException as e: logger.error(f"Evaluation run {evaluation_run_id} for problem {problem_name} errored: {e}") @@ -255,6 +270,8 @@ async def _run_evaluation_run(evaluation_run_id: UUID, problem_name: str, agent_ "error_message": e.error_message }) + outcome.had_syntax_error = (e.error_code == EvaluationRunErrorCode.AGENT_INVALID_PATCH) + except Exception as e: logger.error(f"Evaluation run {evaluation_run_id} for problem {problem_name} errored: {EvaluationRunErrorCode.VALIDATOR_INTERNAL_ERROR.get_error_message()}: {e}") logger.error(traceback.format_exc()) @@ -267,15 +284,23 @@ async def _run_evaluation_run(evaluation_run_id: UUID, problem_name: str, agent_ logger.info(f"Finished evaluation run {evaluation_run_id} for problem {problem_name}") + return outcome + except asyncio.CancelledError: + raise except Exception as e: logger.error(f"Error in _run_evaluation_run(): {type(e).__name__}: {e}") logger.error(traceback.format_exc()) os._exit(1) + + return RunOutcome() # Run an evaluation, automatically dispatches all runs to either _simulate_run_evaluation_run or _run_evaluation_run +# Terminate early when: +# - Threshold is impossible +# - Syntax error is detected async def _run_evaluation(request_evaluation_response: ValidatorRequestEvaluationResponse): logger.info("Received evaluation:") logger.info(f" # of evaluation runs: {len(request_evaluation_response.evaluation_runs)}") @@ -283,23 +308,75 @@ async def _run_evaluation(request_evaluation_response: ValidatorRequestEvaluatio for evaluation_run in request_evaluation_response.evaluation_runs: logger.info(f" {evaluation_run.problem_name}") - + pass_threshold = request_evaluation_response.pass_threshold + logger.info(f" Pass threshold: {pass_threshold}") logger.info("Starting evaluation...") - tasks = [] + task_to_run_info: Dict[asyncio.Task, ValidatorRequestEvaluationResponseEvaluationRun] = {} + for evaluation_run in request_evaluation_response.evaluation_runs: evaluation_run_id = 
evaluation_run.evaluation_run_id problem_name = evaluation_run.problem_name if config.SIMULATE_EVALUATION_RUNS: - tasks.append(asyncio.create_task(_simulate_run_evaluation_run(evaluation_run_id, problem_name))) + task = asyncio.create_task(_simulate_run_evaluation_run(evaluation_run_id, problem_name)) else: - tasks.append(asyncio.create_task(_run_evaluation_run(evaluation_run_id, problem_name, request_evaluation_response.agent_code))) + task = asyncio.create_task(_run_evaluation_run(evaluation_run_id, problem_name, request_evaluation_response.agent_code)) + + task_to_run_info[task] = evaluation_run + + # Process tasks as they complete (check for early termination) + total = len(task_to_run_info) + solved_count = 0 + completed_count = 0 + pending = set(task_to_run_info.keys()) + skip_reason = None + + while pending: + done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED) + + for task in done: + try: + outcome: RunOutcome = task.result() + completed_count += 1 + if outcome.solved: + solved_count += 1 + + # Only screeners have non-zero pass threshold + if pass_threshold is not None and skip_reason is None: + remaining = total - completed_count + + if outcome.had_syntax_error: + skip_reason = "syntax error penalty" + logger.info(f"Early termination triggered - syntax error detected, skipping remaining {len(pending)} runs") + + elif remaining > 0 and (solved_count + remaining) / total < pass_threshold: + skip_reason = "threshold impossible" + logger.info(f"Early termination triggered - threshold impossible ({solved_count + remaining}/{total} < {pass_threshold}), skipping remaining {len(pending)} runs") + + except asyncio.CancelledError: + pass + except Exception as e: + logger.error(f"Error getting result from task: {e}") + completed_count += 1 + + if skip_reason is not None and pending: + for pending_task in pending: + pending_task.cancel() + + await asyncio.wait(pending) + + for pending_task in pending: + run_info = task_to_run_info[pending_task] + try: + await skip_evaluation_run(run_info.evaluation_run_id, run_info.problem_name) + except Exception as e: + logger.error(f"Error skipping evaluation run {run_info.evaluation_run_id}: {e}") - await asyncio.gather(*tasks) + pending = set() - logger.info("Finished evaluation") + logger.info(f"Finished evaluation (solved={solved_count}/{total}, skip_reason={skip_reason})") await post_ridges_platform("/validator/finish-evaluation", ValidatorFinishEvaluationRequest(), bearer_token=session_id, quiet=1) From 5fe205894857497bc452fac8102312d49061fc00 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 19 Feb 2026 10:11:48 -0800 Subject: [PATCH 20/20] use semaphore --- validator/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/main.py b/validator/main.py index 64fc50be2..86512d866 100644 --- a/validator/main.py +++ b/validator/main.py @@ -328,7 +328,7 @@ async def _run_evaluation(request_evaluation_response: ValidatorRequestEvaluatio problem_name = evaluation_run.problem_name if config.SIMULATE_EVALUATION_RUNS: - task = asyncio.create_task(_simulate_run_evaluation_run(evaluation_run_id, problem_name)) + task = asyncio.create_task(_simulate_run_evaluation_run_with_semaphore(evaluation_run_id, problem_name, semaphore)) else: task = asyncio.create_task(_run_evaluation_run_with_semaphore(evaluation_run_id, problem_name, request_evaluation_response.agent_code, semaphore))
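The early-termination rule added in PATCH 19 depends only on the per-run outcomes and the screener pass threshold, so it can be read in isolation. Below is a minimal, self-contained sketch of that decision logic; should_stop_early and the example threshold of 0.6 are illustrative names and values rather than part of the validator code, while RunOutcome mirrors the dataclass added in PATCH 16.

from dataclasses import dataclass

@dataclass
class RunOutcome:
    solved: bool = False            # did every test in the run pass?
    had_syntax_error: bool = False  # did the patch hit AGENT_INVALID_PATCH?

def should_stop_early(outcomes, total_runs, pass_threshold):
    """Return a skip reason once the remaining runs can no longer change the verdict.

    pass_threshold is None for validators, so only screeners terminate early:
    a syntax error penalty fails the screening outright, and an evaluation whose
    best possible score is already below the threshold cannot pass.
    """
    if pass_threshold is None:
        return None
    if any(o.had_syntax_error for o in outcomes):
        return "syntax error penalty"
    solved = sum(1 for o in outcomes if o.solved)
    remaining = total_runs - len(outcomes)
    # Best case: every run that has not completed yet ends up solved.
    if remaining > 0 and (solved + remaining) / total_runs < pass_threshold:
        return "threshold impossible"
    return None

# Worked example with 10 runs and an assumed screener threshold of 0.6: after 5
# completed runs with 1 solved, the best possible score is (1 + 5) / 10 = 0.6,
# so the evaluation continues; one more unsolved run drops the ceiling to 0.5
# and the remaining runs would be cancelled and reported via
# /validator/skip-evaluation-run.
outcomes = [RunOutcome(solved=(i == 0)) for i in range(5)]
assert should_stop_early(outcomes, total_runs=10, pass_threshold=0.6) is None
outcomes.append(RunOutcome(solved=False))
assert should_stop_early(outcomes, total_runs=10, pass_threshold=0.6) == "threshold impossible"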