From 6dba8b022feb57296216764f0407a5d97a8da993 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:02:12 -0800 Subject: [PATCH 01/20] validate_patched_files_syntax --- utils/diff.py | 90 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 32 deletions(-) diff --git a/utils/diff.py b/utils/diff.py index 216d92175..8599437df 100644 --- a/utils/diff.py +++ b/utils/diff.py @@ -1,5 +1,6 @@ """Utilities for computing diffs between files.""" +import ast import os import tempfile import subprocess @@ -8,15 +9,14 @@ from typing import Tuple, Optional - def get_file_diff(old_path, new_path) -> str: """ Gets the diff between two files. - + Args: old_path: The path to the old file new_path: The path to the new file - + Returns: The diff between the two files, expressed as a diff of the old file, as a string. """ @@ -28,13 +28,9 @@ def get_file_diff(old_path, new_path) -> str: missing.append(new_path) if missing: logger.fatal(f"File(s) not found for diff: {', '.join(missing)}") - + # Use diff command - result = subprocess.run( - ["diff", "-u", old_path, new_path], - capture_output=True, - text=True - ) + result = subprocess.run(["diff", "-u", old_path, new_path], capture_output=True, text=True) # Check if the diff was generated successfully # `diff -u` return codes: @@ -53,39 +49,33 @@ def get_file_diff(old_path, new_path) -> str: filename = os.path.basename(old_path) lines[0] = f"--- {filename}" lines[1] = f"+++ {filename}" - - return "\n".join(lines) + return "\n".join(lines) def validate_diff_for_local_repo(diff, local_repo_dir) -> Tuple[bool, Optional[str]]: """ Validates if a diff string is valid and can be applied to a local repository. - + Args: diff: The diff string to validate local_repo_dir: The local repository directory - + Returns: (is_valid: bool, error_message: Optional[str]) """ - + # Write diff to temp file with tempfile.NamedTemporaryFile(mode="w", suffix=".diff", delete=False) as f: f.write(diff) diff_file = f.name - + # Use `git apply --check` to validate without applying - result = subprocess.run( - ["git", "apply", "--check", diff_file], - cwd=local_repo_dir, - capture_output=True, - text=True - ) + result = subprocess.run(["git", "apply", "--check", diff_file], cwd=local_repo_dir, capture_output=True, text=True) # Delete the temp file os.unlink(diff_file) - + # Check if the diff was applied successfully if result.returncode == 0: return True, None @@ -93,11 +83,10 @@ def validate_diff_for_local_repo(diff, local_repo_dir) -> Tuple[bool, Optional[s return False, result.stderr.strip() - def apply_diff_to_local_repo(diff, local_repo_dir) -> None: """ Applies a diff string to files in the source directory. 
- + Args: diff: The diff string to apply local_repo_dir: The local repository directory @@ -107,18 +96,55 @@ def apply_diff_to_local_repo(diff, local_repo_dir) -> None: with tempfile.NamedTemporaryFile(mode="w", suffix=".diff", delete=False) as f: f.write(diff) diff_file = f.name - + # Use `git apply` to apply the diff - result = subprocess.run( - ["git", "apply", diff_file], - cwd=local_repo_dir, - capture_output=True, - text=True - ) + result = subprocess.run(["git", "apply", diff_file], cwd=local_repo_dir, capture_output=True, text=True) # Delete the temp file os.unlink(diff_file) # Check if the diff was applied successfully if result.returncode != 0: - logger.fatal(f"Failed to apply diff to {local_repo_dir}: {result.stderr.strip()}") \ No newline at end of file + logger.fatal(f"Failed to apply diff to {local_repo_dir}: {result.stderr.strip()}") + + +def validate_patched_files_syntax(repo_dir: str) -> Tuple[bool, Optional[str]]: + """ + After a patch has been applied, check that modified files have valid syntax. + Supports Python (.py) and JavaScript (.js, .mjs) files. + + Args: + repo_dir: The repository directory where the patch was applied + + Returns: + (is_valid: bool, error_message: Optional[str]) + """ + result = subprocess.run(["git", "diff", "--name-only"], cwd=repo_dir, capture_output=True, text=True) + modified_files = [f.strip() for f in result.stdout.strip().splitlines() if f.strip()] + + errors = [] + for filepath in modified_files: + full_path = os.path.join(repo_dir, filepath) + if not os.path.exists(full_path): + continue + + if filepath.endswith(".py"): + try: + with open(full_path, "r") as f: + source = f.read() + ast.parse(source, filename=filepath) + except SyntaxError as e: + errors.append(f"{filepath}:{e.lineno}: {e.msg}") + + elif filepath.endswith((".js", ".mjs")): + with open(full_path, "r") as f: + source = f.read() + result = subprocess.run( + ["node", "--input-type=module", "--check"], input=source, capture_output=True, text=True + ) + if result.returncode != 0: + errors.append(f"{filepath}: {result.stderr.strip()}") + + if errors: + return False, "Patched files have syntax errors:\n" + "\n".join(errors) + return True, None From 3e881467e943b6858417d97bfbd1625f1dddfef9 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:02:39 -0800 Subject: [PATCH 02/20] syntax check in polyglot suite --- evaluator/problem_suites/polyglot/polyglot_suite.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/evaluator/problem_suites/polyglot/polyglot_suite.py b/evaluator/problem_suites/polyglot/polyglot_suite.py index bd9db98ac..1c8fa9d85 100644 --- a/evaluator/problem_suites/polyglot/polyglot_suite.py +++ b/evaluator/problem_suites/polyglot/polyglot_suite.py @@ -18,7 +18,7 @@ from utils.git import init_local_repo_with_initial_commit from evaluator.sandbox.sandbox_manager import SandboxManager from evaluator.problem_suites.problem_suite import ProblemSuite, ProblemSuiteName -from utils.diff import get_file_diff, apply_diff_to_local_repo, validate_diff_for_local_repo +from utils.diff import get_file_diff, apply_diff_to_local_repo, validate_diff_for_local_repo, validate_patched_files_syntax @@ -147,7 +147,13 @@ def _on_mount(temp_dir: str): # Apply the patch apply_diff_to_local_repo(patch, sandbox_repo_dir) - + # Syntax-check the patched files + is_valid, error_message = validate_patched_files_syntax(sandbox_repo_dir) + if not is_valid: + raise EvaluationRunException( + EvaluationRunErrorCode.AGENT_INVALID_PATCH, + 
f"{EvaluationRunErrorCode.AGENT_INVALID_PATCH.get_error_message()}: {error_message}" + ) return sandbox_manager.initialize_sandbox( name=f"eval-sandbox-{problem.name}-{evaluation_run_id}", From f5d004f60f1693912e12e5b192ee184d10719d71 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:02:56 -0800 Subject: [PATCH 03/20] syntax check in swebench --- .../swebench_verified/swebench_verified_suite.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/evaluator/problem_suites/swebench_verified/swebench_verified_suite.py b/evaluator/problem_suites/swebench_verified/swebench_verified_suite.py index 8d3e4742c..900a210c5 100644 --- a/evaluator/problem_suites/swebench_verified/swebench_verified_suite.py +++ b/evaluator/problem_suites/swebench_verified/swebench_verified_suite.py @@ -11,7 +11,7 @@ from pydantic import BaseModel from utils.docker import get_docker_client from typing import Any, Dict, List, Tuple, Optional -from utils.diff import validate_diff_for_local_repo +from utils.diff import validate_diff_for_local_repo, apply_diff_to_local_repo, validate_patched_files_syntax from evaluator.models import EvaluationRunException from swebench.harness.constants import SWEbenchInstance from utils.temp import create_temp_dir, delete_temp_dir @@ -184,7 +184,14 @@ def initialize_eval_sandbox( f"{EvaluationRunErrorCode.AGENT_INVALID_PATCH.get_error_message()}: {error_message}" ) - + # Syntax-check the patched files + apply_diff_to_local_repo(patch, temp_dir) + is_valid, error_message = validate_patched_files_syntax(temp_dir) + if not is_valid: + raise EvaluationRunException( + EvaluationRunErrorCode.AGENT_INVALID_PATCH, + f"{EvaluationRunErrorCode.AGENT_INVALID_PATCH.get_error_message()}: {error_message}" + ) swebench_instance = problem.userdata From bc7f837236b4769fad7063515122f7801c2fb194 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:03:29 -0800 Subject: [PATCH 04/20] add dev validator --- utils/validator_hotkeys.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/validator_hotkeys.py b/utils/validator_hotkeys.py index 79600af18..e2ee28191 100644 --- a/utils/validator_hotkeys.py +++ b/utils/validator_hotkeys.py @@ -17,7 +17,8 @@ {"name": "Alex's Validator (1)", "hotkey": "5HpMvcM593HmizCA3ARLNifxjPSLbN3M5RHYy4GiEqmB3x9n"}, {"name": "Alex's Validator (2)", "hotkey": "5HNpAXVzWaW4yD9UqH5sXFPt1gPFqNTViDy61NdiViyDQiTQ"}, {"name": "Alex's Validator (3)", "hotkey": "5GgqnYQ3QwnCcmxiGatXS3rrHGmkqU3cMSjQFSdLKHDmxyB6"}, - {"name": "Shak's Validator", "hotkey": "5F26aNVC3rZVNbH36DWdZzxPVH17iBNGD14Wtb4nQem742Q7"} + {"name": "Shak's Validator", "hotkey": "5F26aNVC3rZVNbH36DWdZzxPVH17iBNGD14Wtb4nQem742Q7"}, + {"name": "Abe's Validator", "hotkey": "5G699LghHWA18yEPq8NpX9gYi8ZDM3fy2BJvSvYWqtt2DHGE"} ] def is_validator_hotkey_whitelisted(validator_hotkey: str) -> bool: From 9d59eee18da8d576602e3184460e7e115fc140b1 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:03:40 -0800 Subject: [PATCH 05/20] add skipped status --- models/evaluation_run.py | 1 + 1 file changed, 1 insertion(+) diff --git a/models/evaluation_run.py b/models/evaluation_run.py index 891c28f7c..923408f39 100644 --- a/models/evaluation_run.py +++ b/models/evaluation_run.py @@ -60,6 +60,7 @@ class EvaluationRunStatus(str, Enum): running_eval = 'running_eval' finished = 'finished' error = 'error' + skipped = 'skipped' From e2c4b15cbed0de7a817257b28b7af340d00b79e6 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 
18:04:10 -0800 Subject: [PATCH 06/20] cleanup_sandbox --- evaluator/sandbox/sandbox_manager.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/evaluator/sandbox/sandbox_manager.py b/evaluator/sandbox/sandbox_manager.py index cbc5ee639..2e641dd87 100644 --- a/evaluator/sandbox/sandbox_manager.py +++ b/evaluator/sandbox/sandbox_manager.py @@ -160,6 +160,20 @@ def initialize_sandbox( + def cleanup_sandbox(self, sandbox: Sandbox): + """Clean up a sandbox's container and temp directory.""" + try: + sandbox.container.stop() + sandbox.container.remove() + except Exception: + pass + try: + delete_temp_dir(sandbox.temp_dir) + except Exception: + pass + + + def run_sandbox( self, sandbox: Sandbox From e74052337a85ce43fe2a434d2fbf376a544f0dd0 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:05:21 -0800 Subject: [PATCH 07/20] update EvaluationRunStatus with skipped enum --- api/src/backend/postgres_schema.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/api/src/backend/postgres_schema.sql b/api/src/backend/postgres_schema.sql index ba39ffa29..34bba24d0 100644 --- a/api/src/backend/postgres_schema.sql +++ b/api/src/backend/postgres_schema.sql @@ -27,7 +27,8 @@ BEGIN 'initializing_eval', 'running_eval', 'finished', - 'error' + 'error', + 'skipped' ); END IF; From 96a42b59f82a4834d1576fc907f9ac1e83d30261 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:06:11 -0800 Subject: [PATCH 08/20] handle skipped status for existing dbs --- api/src/backend/postgres_schema.sql | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/api/src/backend/postgres_schema.sql b/api/src/backend/postgres_schema.sql index 34bba24d0..9ff68d305 100644 --- a/api/src/backend/postgres_schema.sql +++ b/api/src/backend/postgres_schema.sql @@ -532,3 +532,14 @@ CREATE TRIGGER tr_refresh_agent_scores_unapproved_agent_ids AFTER INSERT OR UPDATE OR DELETE ON unapproved_agent_ids FOR EACH ROW EXECUTE PROCEDURE refresh_agent_scores(); + +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_enum + WHERE enumlabel = 'skipped' + AND enumtypid = (SELECT oid FROM pg_type WHERE typname = 'evaluationrunstatus') + ) THEN + ALTER TYPE EvaluationRunStatus ADD VALUE 'skipped'; + END IF; +END $$; From 28cb5cfc640fd0040b84f09c5e756af43d43d3be Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:13:20 -0800 Subject: [PATCH 09/20] update evaluations_hydrated view --- api/src/backend/postgres_schema.sql | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/api/src/backend/postgres_schema.sql b/api/src/backend/postgres_schema.sql index 9ff68d305..8d87496eb 100644 --- a/api/src/backend/postgres_schema.sql +++ b/api/src/backend/postgres_schema.sql @@ -229,8 +229,9 @@ CREATE OR REPLACE VIEW evaluations_hydrated AS SELECT evaluations.*, (CASE - WHEN EVERY(erh.status = 'finished' OR (erh.status = 'error' AND erh.error_code BETWEEN 1000 AND 1999)) THEN 'success' - WHEN EVERY(erh.status IN ('finished', 'error')) THEN 'failure' + WHEN evaluations.evaluation_set_group IN ('screener_1', 'screener_2') AND bool_or(erh.error_code = 1040) THEN 'failure' + WHEN EVERY(erh.status IN ('finished', 'skipped') OR (erh.status = 'error' AND erh.error_code BETWEEN 1000 AND 1999)) THEN 'success' + WHEN EVERY(erh.status IN ('finished', 'error', 'skipped')) THEN 'failure' ELSE 'running' END)::EvaluationStatus AS status, COUNT(*) FILTER (WHERE erh.solved)::float / COUNT(*) AS score From a042097ec5fe689cdf05e54fe8125071a02f6c44 Mon Sep 17 00:00:00 2001 From: 
ibraheem-latent Date: Thu, 12 Feb 2026 18:13:52 -0800 Subject: [PATCH 10/20] add status evaluation logic in comments --- api/src/backend/postgres_schema.sql | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/api/src/backend/postgres_schema.sql b/api/src/backend/postgres_schema.sql index 8d87496eb..69baf2165 100644 --- a/api/src/backend/postgres_schema.sql +++ b/api/src/backend/postgres_schema.sql @@ -225,6 +225,13 @@ FROM evaluation_runs; -- Second view: Evaluations hydrated view -- Evaluations with aggregated status and average score +-- +-- Status logic: +-- 1. 'failure' — Syntax penalty: screener evaluation where any run hit AGENT_INVALID_PATCH (1040). Agent is penalized. +-- 2. 'success' — Clean completion: every run finished, was skipped, or errored with an agent-level error (1000-1999). +-- The evaluation infra worked; score the agent normally. +-- 3. 'failure' — Infra failure: all runs are done but at least one had a non-agent error (2000+). Re-queue. +-- 4. 'running' — Some runs are still in progress. CREATE OR REPLACE VIEW evaluations_hydrated AS SELECT evaluations.*, From 1795b6ed16664bd8589c2cd1fcba760b85d0e7b5 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:15:56 -0800 Subject: [PATCH 11/20] ValidatorRequestEvaluationResponse now has pass_threshold for screeners --- api/endpoints/validator_models.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/api/endpoints/validator_models.py b/api/endpoints/validator_models.py index 2ddda50bb..5a4398c4e 100644 --- a/api/endpoints/validator_models.py +++ b/api/endpoints/validator_models.py @@ -47,6 +47,7 @@ class ValidatorRequestEvaluationResponseEvaluationRun(BaseModel): # :( class ValidatorRequestEvaluationResponse(BaseModel): agent_code: str evaluation_runs: List[ValidatorRequestEvaluationResponseEvaluationRun] + pass_threshold: Optional[float] = None # None for validators @@ -74,6 +75,13 @@ class ValidatorUpdateEvaluationRunRequest(BaseModel): class ValidatorUpdateEvaluationRunResponse(BaseModel): pass +# Models for new endpoint that mark a run as skipped when screener (vali) cancels +class ValidatorSkipEvaluationRunRequest(BaseModel): + evaluation_run_id: UUID + +class ValidatorSkipEvaluationRunResponse(BaseModel): + pass + class ValidatorDisconnectRequest(BaseModel): From de02f7e57c1e65b84fbf9feeb0d776962de6480f Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:16:23 -0800 Subject: [PATCH 12/20] validator_request_evaluation should pass threshold to screeners --- api/endpoints/validator.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/api/endpoints/validator.py b/api/endpoints/validator.py index 6e1adc87c..62c0bcf74 100644 --- a/api/endpoints/validator.py +++ b/api/endpoints/validator.py @@ -352,7 +352,14 @@ async def validator_request_evaluation( agent_code = await download_text_file_from_s3(f"{agent_id}/agent.py") evaluation_runs = [ValidatorRequestEvaluationResponseEvaluationRun(evaluation_run_id=evaluation_run.evaluation_run_id, problem_name=evaluation_run.problem_name) for evaluation_run in evaluation_runs] - return ValidatorRequestEvaluationResponse(agent_code=agent_code, evaluation_runs=evaluation_runs) + # Determine pass threshold for screeners (None for validators) + pass_threshold = None + if validator.current_agent.status == AgentStatus.screening_1: + pass_threshold = config.SCREENER_1_THRESHOLD + elif validator.current_agent.status == AgentStatus.screening_2: + pass_threshold = config.SCREENER_2_THRESHOLD + + return 
ValidatorRequestEvaluationResponse(agent_code=agent_code, evaluation_runs=evaluation_runs, pass_threshold=pass_threshold) From d3838d962a79a9961a9a87429b312767f0a04462 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:19:11 -0800 Subject: [PATCH 13/20] add /skip-evaluation-run endpoint --- api/endpoints/validator.py | 46 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/api/endpoints/validator.py b/api/endpoints/validator.py index 62c0bcf74..a5546fb75 100644 --- a/api/endpoints/validator.py +++ b/api/endpoints/validator.py @@ -613,6 +613,52 @@ async def validator_disconnect( +# /validator/skip-evaluation-run +# Used to mark an evaluation run as skipped when a screener (validator) cancels the evaluation +@router.post("/skip-evaluation-run") +@handle_validator_http_exceptions +async def validator_skip_evaluation_run( + request: ValidatorSkipEvaluationRunRequest, + validator: Validator = Depends(get_request_validator_with_lock) +) -> ValidatorSkipEvaluationRunResponse: + """Mark an evaluation run as skipped (early termination).""" + + if validator.current_evaluation_id is None: + raise HTTPException( + status_code=409, + detail="This validator is not currently running an evaluation, and therefore cannot skip an evaluation run." + ) + + evaluation_run = await get_evaluation_run_by_id(request.evaluation_run_id) + + if evaluation_run is None: + raise HTTPException( + status_code=404, + detail=f"Evaluation run with ID {request.evaluation_run_id} does not exist." + ) + + if evaluation_run.evaluation_id != validator.current_evaluation_id: + raise HTTPException( + status_code=403, + detail=f"The evaluation run with ID {request.evaluation_run_id} is not associated with the validator's current evaluation." + ) + + # Ensure evaluation is not terminal + if evaluation_run.status in (EvaluationRunStatus.finished, EvaluationRunStatus.error, EvaluationRunStatus.skipped): + logger.info(f"Validator '{validator.name}' skip-evaluation-run called on terminal run (status={evaluation_run.status})") + return ValidatorSkipEvaluationRunResponse() + + evaluation_run.status = EvaluationRunStatus.skipped + evaluation_run.finished_or_errored_at = datetime.now(timezone.utc) + await update_evaluation_run_by_id(evaluation_run) + + logger.info(f"Validator '{validator.name}' skipped an evaluation run") + logger.info(f" Evaluation run ID: {request.evaluation_run_id}") + + return ValidatorSkipEvaluationRunResponse() + + + # /validator/finish-evaluation @router.post("/finish-evaluation") @handle_validator_http_exceptions From df3d881288c244719688084bebfd32886abad14f Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:20:41 -0800 Subject: [PATCH 14/20] handle skipped in /finish-evaluation --- api/endpoints/validator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/api/endpoints/validator.py b/api/endpoints/validator.py index a5546fb75..02997975a 100644 --- a/api/endpoints/validator.py +++ b/api/endpoints/validator.py @@ -674,12 +674,12 @@ async def validator_finish_evaluation( detail="This validator is not currently running an evaluation, and therefore cannot request to finish an evaluation." 
) - # Make sure that all evaluation runs have either finished or errored + # Make sure that all evaluation runs have either finished, errored, or been skipped evaluation_runs = await get_all_evaluation_runs_in_evaluation_id(validator.current_evaluation_id) - if any(evaluation_run.status not in [EvaluationRunStatus.finished, EvaluationRunStatus.error] for evaluation_run in evaluation_runs): + if any(evaluation_run.status not in [EvaluationRunStatus.finished, EvaluationRunStatus.error, EvaluationRunStatus.skipped] for evaluation_run in evaluation_runs): raise HTTPException( status_code=409, - detail="Not all evaluation runs associated with the evaluation that this validator is currently running have either finished or errored. Did you forget to send an update-evaluation-run?" + detail="Not all evaluation runs associated with the evaluation that this validator is currently running have finished, errored, or been skipped. Did you forget to send an update-evaluation-run?" ) From 5b6dff5b7c50ddb4c3bb460c96dc897aa868ddf3 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:22:55 -0800 Subject: [PATCH 15/20] update handle_evaluation_if_finished --- api/endpoints/validator.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/api/endpoints/validator.py b/api/endpoints/validator.py index 02997975a..d1e64e38f 100644 --- a/api/endpoints/validator.py +++ b/api/endpoints/validator.py @@ -22,7 +22,8 @@ get_all_evaluation_runs_in_evaluation_id, create_evaluation_run_log, check_if_evaluation_run_logs_exist from models.agent import Agent, AgentStatus from models.evaluation import Evaluation, EvaluationStatus -from models.evaluation_run import EvaluationRunStatus, EvaluationRunLogType +from models.evaluation_run import EvaluationRunStatus, EvaluationRunLogType, EvaluationRunErrorCode +from models.evaluation_set import EvaluationSetGroup from models.problem import ProblemTestResult from utils.bittensor import validate_signed_timestamp from utils.s3 import download_text_file_from_s3 @@ -772,4 +773,19 @@ async def handle_evaluation_if_finished(evaluation_id: UUID) -> None: # raise ValueError(f"Invalid agent status: {agent.status}, this should never happen") return - await update_agent_status(hydrated_evaluation.agent_id, new_agent_status) \ No newline at end of file + await update_agent_status(hydrated_evaluation.agent_id, new_agent_status) + + elif hydrated_evaluation.status == EvaluationStatus.failure: + if hydrated_evaluation.evaluation_set_group in (EvaluationSetGroup.screener_1, EvaluationSetGroup.screener_2): + evaluation_runs = await get_all_evaluation_runs_in_evaluation_id(evaluation_id) + has_syntax_penalty = any( + run.error_code is not None and run.error_code == EvaluationRunErrorCode.AGENT_INVALID_PATCH + for run in evaluation_runs + ) + + if has_syntax_penalty: + agent = await get_agent_by_id(hydrated_evaluation.agent_id) + if agent.status == AgentStatus.screening_1: + await update_agent_status(hydrated_evaluation.agent_id, AgentStatus.failed_screening_1) + elif agent.status == AgentStatus.screening_2: + await update_agent_status(hydrated_evaluation.agent_id, AgentStatus.failed_screening_2) From b9bc7a0d23f9a7eaa8b4934035476b6e06a1f0fd Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:31:14 -0800 Subject: [PATCH 16/20] adds RunOutcome response struct --- validator/main.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/validator/main.py b/validator/main.py index 1a542bcc5..436694b62 100644 --- a/validator/main.py +++ 
b/validator/main.py @@ -12,6 +12,7 @@ import utils.logger as logger import validator.config as config +from dataclasses import dataclass from typing import Any, Dict from api.endpoints.validator_models import * from models.problem import ProblemTestResultStatus @@ -39,6 +40,13 @@ sandbox_manager = None problem_suites = [] +# Result from a single evaluation run, +# Used to decide whether to cancel remaining runs +@dataclass +class RunOutcome: + solved: bool = False # Did all tests pass? + had_syntax_error: bool = False # Was there an AGENT_INVALID_PATCH error? + # Disconnect from the Ridges platform (called when the program exits) From 402ef925a09a08409cf6e77d10f0f4c0982d0dc9 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:42:56 -0800 Subject: [PATCH 17/20] add skip_evaluation_run in orchaestrator --- validator/main.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/validator/main.py b/validator/main.py index 436694b62..1c5d1c172 100644 --- a/validator/main.py +++ b/validator/main.py @@ -107,6 +107,13 @@ async def update_evaluation_run(evaluation_run_id: UUID, problem_name: str, upda ), bearer_token=session_id, quiet=2) +async def skip_evaluation_run(evaluation_run_id: UUID, problem_name: str): + logger.info(f"Skipping evaluation run {evaluation_run_id} for problem {problem_name} (early termination)...") + + await post_ridges_platform("/validator/skip-evaluation-run", ValidatorSkipEvaluationRunRequest( + evaluation_run_id=evaluation_run_id + ), bearer_token=session_id, quiet=2) + # Truncates a log if required def truncate_logs_if_required(log: str) -> str: From 9e02718564a165b5df91fb0b3337265859f69bfb Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:43:41 -0800 Subject: [PATCH 18/20] sim run eval update --- validator/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/validator/main.py b/validator/main.py index 1c5d1c172..f0265568c 100644 --- a/validator/main.py +++ b/validator/main.py @@ -124,7 +124,7 @@ def truncate_logs_if_required(log: str) -> str: # Simulate a run of an evaluation run, useful for testing, set SIMULATE_EVALUATION_RUNS=True in .env -async def _simulate_run_evaluation_run(evaluation_run_id: UUID, problem_name: str): +async def _simulate_run_evaluation_run(evaluation_run_id: UUID, problem_name: str) -> RunOutcome: logger.info(f"Starting simulated evaluation run {evaluation_run_id} for problem {problem_name}...") @@ -158,6 +158,7 @@ async def _simulate_run_evaluation_run(evaluation_run_id: UUID, problem_name: st logger.info(f"Finished simulated evaluation run {evaluation_run_id} for problem {problem_name}") + return RunOutcome(solved=True) From 8de715bbcbdd6683c1c2c4cd11c8f6ac33c9bb64 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 12 Feb 2026 18:47:57 -0800 Subject: [PATCH 19/20] update _run_evaluation_run - track syntax error outcome - Track pass_threshold - track tasks as they run - Stop eval if threshold to pass is impossible + syntax error penalty --- validator/main.py | 95 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 86 insertions(+), 9 deletions(-) diff --git a/validator/main.py b/validator/main.py index f0265568c..5401afd1a 100644 --- a/validator/main.py +++ b/validator/main.py @@ -163,7 +163,7 @@ async def _simulate_run_evaluation_run(evaluation_run_id: UUID, problem_name: st # Run an evaluation run -async def _run_evaluation_run(evaluation_run_id: UUID, problem_name: str, agent_code: str): +async def _run_evaluation_run(evaluation_run_id: UUID, 
problem_name: str, agent_code: str) -> RunOutcome: try: # Figure out what problem suite this problem belongs to problem_suite = next((suite for suite in problem_suites if suite.has_problem_name(problem_name)), None) @@ -174,7 +174,7 @@ async def _run_evaluation_run(evaluation_run_id: UUID, problem_name: str, agent_ "error_code": EvaluationRunErrorCode.VALIDATOR_UNKNOWN_PROBLEM.value, "error_message": f"The problem '{problem_name}' was not found in any problem suite" }) - return + return RunOutcome() # Get the problem problem = problem_suite.get_problem(problem_name) @@ -183,7 +183,9 @@ async def _run_evaluation_run(evaluation_run_id: UUID, problem_name: str, agent_ logger.info(f"Starting evaluation run {evaluation_run_id} for problem {problem_name}...") - + outcome = RunOutcome() + agent_sandbox = None + eval_sandbox = None try: # Move from pending -> initializing_agent @@ -247,6 +249,19 @@ async def _run_evaluation_run(evaluation_run_id: UUID, problem_name: str, agent_ "eval_logs": truncate_logs_if_required(eval_logs) }) + outcome.solved = num_passed == len(test_results) and len(test_results) > 0 + + except asyncio.CancelledError: + logger.info(f"Evaluation run {evaluation_run_id} for problem {problem_name} cancelled; cleaning up sandboxes") + + if sandbox_manager is not None: + if agent_sandbox is not None: + await asyncio.shield(asyncio.to_thread(sandbox_manager.cleanup_sandbox, agent_sandbox)) + if eval_sandbox is not None: + await asyncio.shield(asyncio.to_thread(sandbox_manager.cleanup_sandbox, eval_sandbox)) + + raise + except EvaluationRunException as e: logger.error(f"Evaluation run {evaluation_run_id} for problem {problem_name} errored: {e}") @@ -255,6 +270,8 @@ async def _run_evaluation_run(evaluation_run_id: UUID, problem_name: str, agent_ "error_message": e.error_message }) + outcome.had_syntax_error = (e.error_code == EvaluationRunErrorCode.AGENT_INVALID_PATCH) + except Exception as e: logger.error(f"Evaluation run {evaluation_run_id} for problem {problem_name} errored: {EvaluationRunErrorCode.VALIDATOR_INTERNAL_ERROR.get_error_message()}: {e}") logger.error(traceback.format_exc()) @@ -267,15 +284,23 @@ async def _run_evaluation_run(evaluation_run_id: UUID, problem_name: str, agent_ logger.info(f"Finished evaluation run {evaluation_run_id} for problem {problem_name}") + return outcome + except asyncio.CancelledError: + raise except Exception as e: logger.error(f"Error in _run_evaluation_run(): {type(e).__name__}: {e}") logger.error(traceback.format_exc()) os._exit(1) + + return RunOutcome() # Run an evaluation, automatically dispatches all runs to either _simulate_run_evaluation_run or _run_evaluation_run +# Terminate early when: +# - Threshold is impossible +# - Syntax error is detected async def _run_evaluation(request_evaluation_response: ValidatorRequestEvaluationResponse): logger.info("Received evaluation:") logger.info(f" # of evaluation runs: {len(request_evaluation_response.evaluation_runs)}") @@ -283,23 +308,75 @@ async def _run_evaluation(request_evaluation_response: ValidatorRequestEvaluatio for evaluation_run in request_evaluation_response.evaluation_runs: logger.info(f" {evaluation_run.problem_name}") - + pass_threshold = request_evaluation_response.pass_threshold + logger.info(f" Pass threshold: {pass_threshold}") logger.info("Starting evaluation...") - tasks = [] + task_to_run_info: Dict[asyncio.Task, ValidatorRequestEvaluationResponseEvaluationRun] = {} + for evaluation_run in request_evaluation_response.evaluation_runs: evaluation_run_id = 
evaluation_run.evaluation_run_id problem_name = evaluation_run.problem_name if config.SIMULATE_EVALUATION_RUNS: - tasks.append(asyncio.create_task(_simulate_run_evaluation_run(evaluation_run_id, problem_name))) + task = asyncio.create_task(_simulate_run_evaluation_run(evaluation_run_id, problem_name)) else: - tasks.append(asyncio.create_task(_run_evaluation_run(evaluation_run_id, problem_name, request_evaluation_response.agent_code))) + task = asyncio.create_task(_run_evaluation_run(evaluation_run_id, problem_name, request_evaluation_response.agent_code)) + + task_to_run_info[task] = evaluation_run + + # Process tasks as they complete (check for early termination) + total = len(task_to_run_info) + solved_count = 0 + completed_count = 0 + pending = set(task_to_run_info.keys()) + skip_reason = None + + while pending: + done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED) + + for task in done: + try: + outcome: RunOutcome = task.result() + completed_count += 1 + if outcome.solved: + solved_count += 1 + + # Only screeners have non-zero pass threshold + if pass_threshold is not None and skip_reason is None: + remaining = total - completed_count + + if outcome.had_syntax_error: + skip_reason = "syntax error penalty" + logger.info(f"Early termination triggered - syntax error detected, skipping remaining {len(pending)} runs") + + elif remaining > 0 and (solved_count + remaining) / total < pass_threshold: + skip_reason = "threshold impossible" + logger.info(f"Early termination triggered - threshold impossible ({solved_count + remaining}/{total} < {pass_threshold}), skipping remaining {len(pending)} runs") + + except asyncio.CancelledError: + pass + except Exception as e: + logger.error(f"Error getting result from task: {e}") + completed_count += 1 + + if skip_reason is not None and pending: + for pending_task in pending: + pending_task.cancel() + + await asyncio.wait(pending) + + for pending_task in pending: + run_info = task_to_run_info[pending_task] + try: + await skip_evaluation_run(run_info.evaluation_run_id, run_info.problem_name) + except Exception as e: + logger.error(f"Error skipping evaluation run {run_info.evaluation_run_id}: {e}") - await asyncio.gather(*tasks) + pending = set() - logger.info("Finished evaluation") + logger.info(f"Finished evaluation (solved={solved_count}/{total}, skip_reason={skip_reason})") await post_ridges_platform("/validator/finish-evaluation", ValidatorFinishEvaluationRequest(), bearer_token=session_id, quiet=1) From 5fe205894857497bc452fac8102312d49061fc00 Mon Sep 17 00:00:00 2001 From: ibraheem-latent Date: Thu, 19 Feb 2026 10:11:48 -0800 Subject: [PATCH 20/20] use semaphore --- validator/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/main.py b/validator/main.py index 64fc50be2..86512d866 100644 --- a/validator/main.py +++ b/validator/main.py @@ -328,7 +328,7 @@ async def _run_evaluation(request_evaluation_response: ValidatorRequestEvaluatio problem_name = evaluation_run.problem_name if config.SIMULATE_EVALUATION_RUNS: - task = asyncio.create_task(_simulate_run_evaluation_run(evaluation_run_id, problem_name)) + task = asyncio.create_task(_simulate_run_evaluation_run_with_semaphore(evaluation_run_id, problem_name, semaphore)) else: task = asyncio.create_task(_run_evaluation_run_with_semaphore(evaluation_run_id, problem_name, request_evaluation_response.agent_code, semaphore))
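The early-termination rule added in PATCH 19 depends only on the per-run outcomes and the screener pass threshold, so it can be read in isolation. Below is a minimal, self-contained sketch of that decision logic; should_stop_early and the example threshold of 0.6 are illustrative names and values rather than part of the validator code, while RunOutcome mirrors the dataclass added in PATCH 16.

from dataclasses import dataclass

@dataclass
class RunOutcome:
    solved: bool = False            # did every test in the run pass?
    had_syntax_error: bool = False  # did the patch hit AGENT_INVALID_PATCH?

def should_stop_early(outcomes, total_runs, pass_threshold):
    """Return a skip reason once the remaining runs can no longer change the verdict.

    pass_threshold is None for validators, so only screeners terminate early:
    a syntax error penalty fails the screening outright, and an evaluation whose
    best possible score is already below the threshold cannot pass.
    """
    if pass_threshold is None:
        return None
    if any(o.had_syntax_error for o in outcomes):
        return "syntax error penalty"
    solved = sum(1 for o in outcomes if o.solved)
    remaining = total_runs - len(outcomes)
    # Best case: every run that has not completed yet ends up solved.
    if remaining > 0 and (solved + remaining) / total_runs < pass_threshold:
        return "threshold impossible"
    return None

# Worked example with 10 runs and an assumed screener threshold of 0.6: after 5
# completed runs with 1 solved, the best possible score is (1 + 5) / 10 = 0.6,
# so the evaluation continues; one more unsolved run drops the ceiling to 0.5
# and the remaining runs would be cancelled and reported via
# /validator/skip-evaluation-run.
outcomes = [RunOutcome(solved=(i == 0)) for i in range(5)]
assert should_stop_early(outcomes, total_runs=10, pass_threshold=0.6) is None
outcomes.append(RunOutcome(solved=False))
assert should_stop_early(outcomes, total_runs=10, pass_threshold=0.6) == "threshold impossible"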