23 commits
6dba8b0
validate_patched_files_syntax
ibraheem-abe Feb 13, 2026
3e88146
syntax check in polyglot suite
ibraheem-abe Feb 13, 2026
f5d004f
syntax check in swebench
ibraheem-abe Feb 13, 2026
bc7f837
add dev validator
ibraheem-abe Feb 13, 2026
9d59eee
add skipped status
ibraheem-abe Feb 13, 2026
e2c4b15
cleanup_sandbox
ibraheem-abe Feb 13, 2026
e740523
update EvaluationRunStatus with skipped enum
ibraheem-abe Feb 13, 2026
96a42b5
handle skipped status for existing dbs
ibraheem-abe Feb 13, 2026
28cb5cf
update evaluations_hydrated view
ibraheem-abe Feb 13, 2026
a042097
add status evaluation logic in comments
ibraheem-abe Feb 13, 2026
1795b6e
ValidatorRequestEvaluationResponse now has pass_threshold for screeners
ibraheem-abe Feb 13, 2026
de02f7e
validator_request_evaluation should pass threshold to screeners
ibraheem-abe Feb 13, 2026
d3838d9
add /skip-evaluation-run endpoint
ibraheem-abe Feb 13, 2026
df3d881
handle skipped in /finish-evaluation
ibraheem-abe Feb 13, 2026
5b6dff5
update handle_evaluation_if_finished
ibraheem-abe Feb 13, 2026
b9bc7a0
adds RunOutcome response struct
ibraheem-abe Feb 13, 2026
402ef92
add skip_evaluation_run in orchestrator
ibraheem-abe Feb 13, 2026
9e02718
sim run eval update
ibraheem-abe Feb 13, 2026
8de715b
update _run_evaluation_run
ibraheem-abe Feb 13, 2026
33a6ff8
Merge branch 'main' into feat/bad-syntax-cancels-eval
camfairchild Feb 18, 2026
97e07c4
Merge branch 'main' into feat/bad-syntax-cancels-eval
camfairchild Feb 18, 2026
08a9a7f
Merge branch 'main' into feat/bad-syntax-cancels-eval
ibraheem-abe Feb 19, 2026
5fe2058
use semaphore
ibraheem-abe Feb 19, 2026
80 changes: 75 additions & 5 deletions api/endpoints/validator.py
@@ -22,7 +22,8 @@
get_all_evaluation_runs_in_evaluation_id, create_evaluation_run_log, check_if_evaluation_run_logs_exist
from models.agent import Agent, AgentStatus
from models.evaluation import Evaluation, EvaluationStatus
from models.evaluation_run import EvaluationRunStatus, EvaluationRunLogType
from models.evaluation_run import EvaluationRunStatus, EvaluationRunLogType, EvaluationRunErrorCode
from models.evaluation_set import EvaluationSetGroup
from models.problem import ProblemTestResult
from utils.bittensor import validate_signed_timestamp
from utils.s3 import download_text_file_from_s3
@@ -355,7 +356,14 @@ async def validator_request_evaluation(
agent_code = await download_text_file_from_s3(f"{agent_id}/agent.py")
evaluation_runs = [ValidatorRequestEvaluationResponseEvaluationRun(evaluation_run_id=evaluation_run.evaluation_run_id, problem_name=evaluation_run.problem_name) for evaluation_run in evaluation_runs]

return ValidatorRequestEvaluationResponse(agent_code=agent_code, evaluation_runs=evaluation_runs)
# Determine pass threshold for screeners (None for validators)
pass_threshold = None
if validator.current_agent.status == AgentStatus.screening_1:
pass_threshold = config.SCREENER_1_THRESHOLD
elif validator.current_agent.status == AgentStatus.screening_2:
pass_threshold = config.SCREENER_2_THRESHOLD

return ValidatorRequestEvaluationResponse(agent_code=agent_code, evaluation_runs=evaluation_runs, pass_threshold=pass_threshold)

def record_validator_heartbeat(validator: Validator, system_metrics: SystemMetrics | None = None) -> None:
validator.time_last_heartbeat = datetime.now(timezone.utc)
@@ -615,6 +623,52 @@ async def validator_disconnect(



# /validator/skip-evaluation-run
# Used to mark an evaluation run as skipped when a screener (validator) cancels the evaluation
@router.post("/skip-evaluation-run")
@handle_validator_http_exceptions
async def validator_skip_evaluation_run(
request: ValidatorSkipEvaluationRunRequest,
validator: Validator = Depends(get_request_validator_with_lock)
) -> ValidatorSkipEvaluationRunResponse:
"""Mark an evaluation run as skipped (early termination)."""

if validator.current_evaluation_id is None:
raise HTTPException(
status_code=409,
detail="This validator is not currently running an evaluation, and therefore cannot skip an evaluation run."
)

evaluation_run = await get_evaluation_run_by_id(request.evaluation_run_id)

if evaluation_run is None:
raise HTTPException(
status_code=404,
detail=f"Evaluation run with ID {request.evaluation_run_id} does not exist."
)

if evaluation_run.evaluation_id != validator.current_evaluation_id:
raise HTTPException(
status_code=403,
detail=f"The evaluation run with ID {request.evaluation_run_id} is not associated with the validator's current evaluation."
)

# Ensure evaluation is not terminal
if evaluation_run.status in (EvaluationRunStatus.finished, EvaluationRunStatus.error, EvaluationRunStatus.skipped):
logger.info(f"Validator '{validator.name}' skip-evaluation-run called on terminal run (status={evaluation_run.status})")
return ValidatorSkipEvaluationRunResponse()

evaluation_run.status = EvaluationRunStatus.skipped
evaluation_run.finished_or_errored_at = datetime.now(timezone.utc)
await update_evaluation_run_by_id(evaluation_run)

logger.info(f"Validator '{validator.name}' skipped an evaluation run")
logger.info(f" Evaluation run ID: {request.evaluation_run_id}")

return ValidatorSkipEvaluationRunResponse()



# /validator/finish-evaluation
@router.post("/finish-evaluation")
@handle_validator_http_exceptions
@@ -630,15 +684,16 @@ async def validator_finish_evaluation(
detail="This validator is not currently running an evaluation, and therefore cannot request to finish an evaluation."
)

# Make sure that all evaluation runs have finished, errored, or been skipped
# Record a heartbeat for the validator
record_validator_heartbeat(validator)

# Make sure that all evaluation runs have either finished or errored
evaluation_runs = await get_all_evaluation_runs_in_evaluation_id(validator.current_evaluation_id)
if any(evaluation_run.status not in [EvaluationRunStatus.finished, EvaluationRunStatus.error] for evaluation_run in evaluation_runs):
if any(evaluation_run.status not in [EvaluationRunStatus.finished, EvaluationRunStatus.error, EvaluationRunStatus.skipped] for evaluation_run in evaluation_runs):
raise HTTPException(
status_code=409,
detail="Not all evaluation runs associated with the evaluation that this validator is currently running have either finished or errored. Did you forget to send an update-evaluation-run?"
detail="Not all evaluation runs associated with the evaluation that this validator is currently running have finished, errored, or been skipped. Did you forget to send an update-evaluation-run?"
)


@@ -731,4 +786,19 @@ async def handle_evaluation_if_finished(evaluation_id: UUID) -> None:
# raise ValueError(f"Invalid agent status: {agent.status}, this should never happen")
return

await update_agent_status(hydrated_evaluation.agent_id, new_agent_status)
await update_agent_status(hydrated_evaluation.agent_id, new_agent_status)

elif hydrated_evaluation.status == EvaluationStatus.failure:
if hydrated_evaluation.evaluation_set_group in (EvaluationSetGroup.screener_1, EvaluationSetGroup.screener_2):
evaluation_runs = await get_all_evaluation_runs_in_evaluation_id(evaluation_id)
has_syntax_penalty = any(
run.error_code is not None and run.error_code == EvaluationRunErrorCode.AGENT_INVALID_PATCH
for run in evaluation_runs
)

if has_syntax_penalty:
agent = await get_agent_by_id(hydrated_evaluation.agent_id)
if agent.status == AgentStatus.screening_1:
await update_agent_status(hydrated_evaluation.agent_id, AgentStatus.failed_screening_1)
elif agent.status == AgentStatus.screening_2:
await update_agent_status(hydrated_evaluation.agent_id, AgentStatus.failed_screening_2)
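Editor's note: the new /skip-evaluation-run endpoint above returns early (with a log line) when a run is already terminal, so a screener can safely fire it for every outstanding run once it decides to cancel. A minimal client-side sketch, not part of this PR, assuming `client` is an authenticated httpx.Client configured with the API's base_url and that the router is mounted under /validator:

```python
import httpx

def skip_remaining_runs(client: httpx.Client, remaining_run_ids: list[str]) -> None:
    """Hypothetical screener-side helper: mark every outstanding run as skipped."""
    for run_id in remaining_run_ids:
        # Safe to retry: the endpoint is a no-op for finished/errored/skipped runs.
        response = client.post(
            "/validator/skip-evaluation-run",
            json={"evaluation_run_id": run_id},
        )
        response.raise_for_status()
```

After skipping, the screener can call /finish-evaluation, which now also accepts the skipped status.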
8 changes: 8 additions & 0 deletions api/endpoints/validator_models.py
@@ -47,6 +47,7 @@ class ValidatorRequestEvaluationResponseEvaluationRun(BaseModel): # :(
class ValidatorRequestEvaluationResponse(BaseModel):
agent_code: str
evaluation_runs: List[ValidatorRequestEvaluationResponseEvaluationRun]
pass_threshold: Optional[float] = None # None for validators



@@ -74,6 +75,13 @@ class ValidatorUpdateEvaluationRunRequest(BaseModel):
class ValidatorUpdateEvaluationRunResponse(BaseModel):
pass

# Models for the new endpoint that marks a run as skipped when a screener (validator) cancels
class ValidatorSkipEvaluationRunRequest(BaseModel):
evaluation_run_id: UUID

class ValidatorSkipEvaluationRunResponse(BaseModel):
pass



class ValidatorDisconnectRequest(BaseModel):
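Editor's note: with pass_threshold now surfaced to screeners (and left as None for validators), a screener can stop early once the agent can no longer reach the threshold. A rough sketch of that check, assuming the threshold is compared against the final fraction of solved runs (mirroring the score column of evaluations_hydrated); the function name and the >= comparison are assumptions, not part of the PR:

```python
from typing import Optional

def can_still_reach_threshold(
    solved: int,
    remaining: int,
    total: int,
    pass_threshold: Optional[float],
) -> bool:
    """Return False once even a perfect finish cannot reach pass_threshold."""
    if pass_threshold is None:
        # Validators receive no threshold and always run everything.
        return True
    best_possible_score = (solved + remaining) / total
    return best_possible_score >= pass_threshold
```

When this returns False, the screener would skip the outstanding runs via /skip-evaluation-run and then call /finish-evaluation.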
26 changes: 23 additions & 3 deletions api/src/backend/postgres_schema.sql
@@ -27,7 +27,8 @@ BEGIN
'initializing_eval',
'running_eval',
'finished',
'error'
'error',
'skipped'
);
END IF;

@@ -224,12 +225,20 @@ FROM evaluation_runs;

-- Second view: Evaluations hydrated view
-- Evaluations with aggregated status and average score
--
-- Status logic:
-- 1. 'failure' — Syntax penalty: screener evaluation where any run hit AGENT_INVALID_PATCH (1040). Agent is penalized.
-- 2. 'success' — Clean completion: every run finished, was skipped, or errored with an agent-level error (1000-1999).
-- The evaluation infra worked; score the agent normally.
-- 3. 'failure' — Infra failure: all runs are done but at least one had a non-agent error (2000+). Re-queue.
-- 4. 'running' — Some runs are still in progress.
CREATE OR REPLACE VIEW evaluations_hydrated AS
SELECT
evaluations.*,
(CASE
WHEN EVERY(erh.status = 'finished' OR (erh.status = 'error' AND erh.error_code BETWEEN 1000 AND 1999)) THEN 'success'
WHEN EVERY(erh.status IN ('finished', 'error')) THEN 'failure'
WHEN evaluations.evaluation_set_group IN ('screener_1', 'screener_2') AND bool_or(erh.error_code = 1040) THEN 'failure'
WHEN EVERY(erh.status IN ('finished', 'skipped') OR (erh.status = 'error' AND erh.error_code BETWEEN 1000 AND 1999)) THEN 'success'
WHEN EVERY(erh.status IN ('finished', 'error', 'skipped')) THEN 'failure'
ELSE 'running'
END)::EvaluationStatus AS status,
COUNT(*) FILTER (WHERE erh.solved)::float / COUNT(*) AS score
@@ -531,3 +540,14 @@ CREATE TRIGGER tr_refresh_agent_scores_unapproved_agent_ids
AFTER INSERT OR UPDATE OR DELETE
ON unapproved_agent_ids FOR EACH ROW
EXECUTE PROCEDURE refresh_agent_scores();

DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_enum
WHERE enumlabel = 'skipped'
AND enumtypid = (SELECT oid FROM pg_type WHERE typname = 'evaluationrunstatus')
) THEN
ALTER TYPE EvaluationRunStatus ADD VALUE 'skipped';
END IF;
END $$;
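Editor's note: for readers who find the CASE expression dense, the status rules from the view comment above can be restated as a standalone Python function. This is illustrative only, not part of the PR; the run fields simply mirror the erh columns used by the view.

```python
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Run:
    status: str                      # 'finished' | 'error' | 'skipped' | ...
    error_code: Optional[int] = None

def derive_evaluation_status(runs: List[Run], evaluation_set_group: str) -> str:
    is_screener = evaluation_set_group in ("screener_1", "screener_2")
    # 1. Syntax penalty: any AGENT_INVALID_PATCH (1040) on a screener evaluation.
    if is_screener and any(r.error_code == 1040 for r in runs):
        return "failure"
    # 2. Clean completion: finished/skipped, or agent-level errors (1000-1999).
    if all(
        r.status in ("finished", "skipped")
        or (r.status == "error" and r.error_code is not None and 1000 <= r.error_code <= 1999)
        for r in runs
    ):
        return "success"
    # 3. Infra failure: every run is terminal but some error was non-agent (2000+).
    if all(r.status in ("finished", "error", "skipped") for r in runs):
        return "failure"
    # 4. Otherwise at least one run is still in progress.
    return "running"
```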
10 changes: 8 additions & 2 deletions evaluator/problem_suites/polyglot/polyglot_suite.py
@@ -18,7 +18,7 @@
from utils.git import init_local_repo_with_initial_commit
from evaluator.sandbox.sandbox_manager import SandboxManager
from evaluator.problem_suites.problem_suite import ProblemSuite, ProblemSuiteName
from utils.diff import get_file_diff, apply_diff_to_local_repo, validate_diff_for_local_repo
from utils.diff import get_file_diff, apply_diff_to_local_repo, validate_diff_for_local_repo, validate_patched_files_syntax



@@ -147,7 +147,13 @@ def _on_mount(temp_dir: str):
# Apply the patch
apply_diff_to_local_repo(patch, sandbox_repo_dir)


# Syntax-check the patched files
is_valid, error_message = validate_patched_files_syntax(sandbox_repo_dir)
if not is_valid:
raise EvaluationRunException(
EvaluationRunErrorCode.AGENT_INVALID_PATCH,
f"{EvaluationRunErrorCode.AGENT_INVALID_PATCH.get_error_message()}: {error_message}"
)

return sandbox_manager.initialize_sandbox(
name=f"eval-sandbox-{problem.name}-{evaluation_run_id}",
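Editor's note: validate_patched_files_syntax itself lives in utils/diff and is not shown in this diff; the SWE-bench suite hunk below applies the same check after applying the patch to a temporary checkout. As a rough mental model only (the real helper presumably covers more than Python, given the polyglot suite), a sketch restricted to .py files might look like this; the name and scope here are assumptions:

```python
import ast
import os
from typing import Optional, Tuple

def validate_patched_files_syntax_sketch(repo_dir: str) -> Tuple[bool, Optional[str]]:
    """Return (True, None) if every .py file under repo_dir parses, else (False, message)."""
    for root, _dirs, files in os.walk(repo_dir):
        if ".git" in root.split(os.sep):
            continue  # skip git internals
        for name in files:
            if not name.endswith(".py"):
                continue
            path = os.path.join(root, name)
            try:
                with open(path, "r", encoding="utf-8", errors="replace") as f:
                    ast.parse(f.read(), filename=path)
            except SyntaxError as e:
                return False, f"{path}: {e.msg} (line {e.lineno})"
    return True, None
```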
@@ -11,7 +11,7 @@
from pydantic import BaseModel
from utils.docker import get_docker_client
from typing import Any, Dict, List, Tuple, Optional
from utils.diff import validate_diff_for_local_repo
from utils.diff import validate_diff_for_local_repo, apply_diff_to_local_repo, validate_patched_files_syntax
from evaluator.models import EvaluationRunException
from swebench.harness.constants import SWEbenchInstance
from utils.temp import create_temp_dir, delete_temp_dir
@@ -186,7 +186,14 @@ def initialize_eval_sandbox(
f"{EvaluationRunErrorCode.AGENT_INVALID_PATCH.get_error_message()}: {error_message}"
)


# Syntax-check the patched files
apply_diff_to_local_repo(patch, temp_dir)
is_valid, error_message = validate_patched_files_syntax(temp_dir)
if not is_valid:
raise EvaluationRunException(
EvaluationRunErrorCode.AGENT_INVALID_PATCH,
f"{EvaluationRunErrorCode.AGENT_INVALID_PATCH.get_error_message()}: {error_message}"
)

swebench_instance = problem.userdata

14 changes: 14 additions & 0 deletions evaluator/sandbox/sandbox_manager.py
@@ -160,6 +160,20 @@ def initialize_sandbox(



def cleanup_sandbox(self, sandbox: Sandbox):
"""Clean up a sandbox's container and temp directory."""
try:
sandbox.container.stop()
sandbox.container.remove()
except Exception:
# Best-effort: the container may already be stopped or removed.
pass
try:
delete_temp_dir(sandbox.temp_dir)
except Exception:
# Best-effort: ignore temp-dir cleanup failures.
pass



def run_sandbox(
self,
sandbox: Sandbox
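Editor's note: because cleanup_sandbox deliberately swallows errors, it is safe to call unconditionally from a finally block in whatever orchestration code drives a run. That call site is not part of this diff, so the sketch below is only an assumed usage pattern:

```python
def run_and_always_cleanup(sandbox_manager, sandbox):
    """Assumed call-site pattern: always release the container and temp dir."""
    try:
        return sandbox_manager.run_sandbox(sandbox)
    finally:
        # Best-effort teardown, even when the run errors out or is skipped.
        sandbox_manager.cleanup_sandbox(sandbox)
```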
1 change: 1 addition & 0 deletions models/evaluation_run.py
@@ -60,6 +60,7 @@ class EvaluationRunStatus(str, Enum):
running_eval = 'running_eval'
finished = 'finished'
error = 'error'
skipped = 'skipped'


