23 commits
6dba8b0
validate_patched_files_syntax
ibraheem-abe Feb 13, 2026
3e88146
syntax check in polyglot suite
ibraheem-abe Feb 13, 2026
f5d004f
syntax check in swebench
ibraheem-abe Feb 13, 2026
bc7f837
add dev validator
ibraheem-abe Feb 13, 2026
9d59eee
add skipped status
ibraheem-abe Feb 13, 2026
e2c4b15
cleanup_sandbox
ibraheem-abe Feb 13, 2026
e740523
update EvaluationRunStatus with skipped enum
ibraheem-abe Feb 13, 2026
96a42b5
handle skipped status for existing dbs
ibraheem-abe Feb 13, 2026
28cb5cf
update evaluations_hydrated view
ibraheem-abe Feb 13, 2026
a042097
add status evaluation logic in comments
ibraheem-abe Feb 13, 2026
1795b6e
ValidatorRequestEvaluationResponse now has pass_threshold for screeners
ibraheem-abe Feb 13, 2026
de02f7e
validator_request_evaluation should pass threshold to screeners
ibraheem-abe Feb 13, 2026
d3838d9
add /skip-evaluation-run endpoint
ibraheem-abe Feb 13, 2026
df3d881
handle skipped in /finish-evaluation
ibraheem-abe Feb 13, 2026
5b6dff5
update handle_evaluation_if_finished
ibraheem-abe Feb 13, 2026
b9bc7a0
adds RunOutcome response struct
ibraheem-abe Feb 13, 2026
402ef92
add skip_evaluation_run in orchestrator
ibraheem-abe Feb 13, 2026
9e02718
sim run eval update
ibraheem-abe Feb 13, 2026
8de715b
update _run_evaluation_run
ibraheem-abe Feb 13, 2026
33a6ff8
Merge branch 'main' into feat/bad-syntax-cancels-eval
camfairchild Feb 18, 2026
97e07c4
Merge branch 'main' into feat/bad-syntax-cancels-eval
camfairchild Feb 18, 2026
08a9a7f
Merge branch 'main' into feat/bad-syntax-cancels-eval
ibraheem-abe Feb 19, 2026
5fe2058
use semaphore
ibraheem-abe Feb 19, 2026
80 changes: 75 additions & 5 deletions api/endpoints/validator.py
@@ -22,7 +22,8 @@
get_all_evaluation_runs_in_evaluation_id, create_evaluation_run_log, check_if_evaluation_run_logs_exist
from models.agent import Agent, AgentStatus
from models.evaluation import Evaluation, EvaluationStatus
from models.evaluation_run import EvaluationRunStatus, EvaluationRunLogType
from models.evaluation_run import EvaluationRunStatus, EvaluationRunLogType, EvaluationRunErrorCode
from models.evaluation_set import EvaluationSetGroup
from models.problem import ProblemTestResult
from utils.bittensor import validate_signed_timestamp
from utils.s3 import download_text_file_from_s3
@@ -355,7 +356,14 @@ async def validator_request_evaluation(
agent_code = await download_text_file_from_s3(f"{agent_id}/agent.py")
evaluation_runs = [ValidatorRequestEvaluationResponseEvaluationRun(evaluation_run_id=evaluation_run.evaluation_run_id, problem_name=evaluation_run.problem_name) for evaluation_run in evaluation_runs]

return ValidatorRequestEvaluationResponse(agent_code=agent_code, evaluation_runs=evaluation_runs)
# Determine pass threshold for screeners (None for validators)
pass_threshold = None
if validator.current_agent.status == AgentStatus.screening_1:
pass_threshold = config.SCREENER_1_THRESHOLD
elif validator.current_agent.status == AgentStatus.screening_2:
pass_threshold = config.SCREENER_2_THRESHOLD

return ValidatorRequestEvaluationResponse(agent_code=agent_code, evaluation_runs=evaluation_runs, pass_threshold=pass_threshold)

def record_validator_heartbeat(validator: Validator, system_metrics: SystemMetrics | None = None) -> None:
validator.time_last_heartbeat = datetime.now(timezone.utc)
@@ -615,6 +623,52 @@ async def validator_disconnect(



# /validator/skip-evaluation-run
# Used to mark an evaluation run as skipped when a screener (validator) cancels the evaluation
@router.post("/skip-evaluation-run")
@handle_validator_http_exceptions
async def validator_skip_evaluation_run(
request: ValidatorSkipEvaluationRunRequest,
validator: Validator = Depends(get_request_validator_with_lock)
) -> ValidatorSkipEvaluationRunResponse:
"""Mark an evaluation run as skipped (early termination)."""

if validator.current_evaluation_id is None:
raise HTTPException(
status_code=409,
detail="This validator is not currently running an evaluation, and therefore cannot skip an evaluation run."
)

evaluation_run = await get_evaluation_run_by_id(request.evaluation_run_id)

if evaluation_run is None:
raise HTTPException(
status_code=404,
detail=f"Evaluation run with ID {request.evaluation_run_id} does not exist."
)

if evaluation_run.evaluation_id != validator.current_evaluation_id:
raise HTTPException(
status_code=403,
detail=f"The evaluation run with ID {request.evaluation_run_id} is not associated with the validator's current evaluation."
)

# Ensure evaluation is not terminal
if evaluation_run.status in (EvaluationRunStatus.finished, EvaluationRunStatus.error, EvaluationRunStatus.skipped):
logger.info(f"Validator '{validator.name}' skip-evaluation-run called on terminal run (status={evaluation_run.status})")
return ValidatorSkipEvaluationRunResponse()

evaluation_run.status = EvaluationRunStatus.skipped
evaluation_run.finished_or_errored_at = datetime.now(timezone.utc)
await update_evaluation_run_by_id(evaluation_run)

logger.info(f"Validator '{validator.name}' skipped an evaluation run")
logger.info(f" Evaluation run ID: {request.evaluation_run_id}")

return ValidatorSkipEvaluationRunResponse()



# /validator/finish-evaluation
@router.post("/finish-evaluation")
@handle_validator_http_exceptions
@@ -630,15 +684,16 @@ async def validator_finish_evaluation(
detail="This validator is not currently running an evaluation, and therefore cannot request to finish an evaluation."
)

# Make sure that all evaluation runs have finished, errored, or been skipped
# Record a heartbeat for the validator
record_validator_heartbeat(validator)

# Make sure that all evaluation runs have either finished or errored
evaluation_runs = await get_all_evaluation_runs_in_evaluation_id(validator.current_evaluation_id)
if any(evaluation_run.status not in [EvaluationRunStatus.finished, EvaluationRunStatus.error] for evaluation_run in evaluation_runs):
if any(evaluation_run.status not in [EvaluationRunStatus.finished, EvaluationRunStatus.error, EvaluationRunStatus.skipped] for evaluation_run in evaluation_runs):
raise HTTPException(
status_code=409,
detail="Not all evaluation runs associated with the evaluation that this validator is currently running have either finished or errored. Did you forget to send an update-evaluation-run?"
detail="Not all evaluation runs associated with the evaluation that this validator is currently running have finished, errored, or been skipped. Did you forget to send an update-evaluation-run?"
)


@@ -731,4 +786,19 @@ async def handle_evaluation_if_finished(evaluation_id: UUID) -> None:
# raise ValueError(f"Invalid agent status: {agent.status}, this should never happen")
return

await update_agent_status(hydrated_evaluation.agent_id, new_agent_status)
await update_agent_status(hydrated_evaluation.agent_id, new_agent_status)

elif hydrated_evaluation.status == EvaluationStatus.failure:
if hydrated_evaluation.evaluation_set_group in (EvaluationSetGroup.screener_1, EvaluationSetGroup.screener_2):
evaluation_runs = await get_all_evaluation_runs_in_evaluation_id(evaluation_id)
has_syntax_penalty = any(
run.error_code is not None and run.error_code == EvaluationRunErrorCode.AGENT_INVALID_PATCH
for run in evaluation_runs
)

if has_syntax_penalty:
agent = await get_agent_by_id(hydrated_evaluation.agent_id)
if agent.status == AgentStatus.screening_1:
await update_agent_status(hydrated_evaluation.agent_id, AgentStatus.failed_screening_1)
elif agent.status == AgentStatus.screening_2:
await update_agent_status(hydrated_evaluation.agent_id, AgentStatus.failed_screening_2)
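Editor's note: the new /skip-evaluation-run endpoint above returns early (with a log line) when a run is already terminal, so a screener can safely fire it for every outstanding run once it decides to cancel. A minimal client-side sketch, not part of this PR, assuming `client` is an authenticated httpx.Client configured with the API's base_url and that the router is mounted under /validator:

```python
import httpx

def skip_remaining_runs(client: httpx.Client, remaining_run_ids: list[str]) -> None:
    """Hypothetical screener-side helper: mark every outstanding run as skipped."""
    for run_id in remaining_run_ids:
        # Safe to retry: the endpoint is a no-op for finished/errored/skipped runs.
        response = client.post(
            "/validator/skip-evaluation-run",
            json={"evaluation_run_id": run_id},
        )
        response.raise_for_status()
```

After skipping, the screener can call /finish-evaluation, which now also accepts the skipped status.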
8 changes: 8 additions & 0 deletions api/endpoints/validator_models.py
@@ -47,6 +47,7 @@ class ValidatorRequestEvaluationResponseEvaluationRun(BaseModel): # :(
class ValidatorRequestEvaluationResponse(BaseModel):
agent_code: str
evaluation_runs: List[ValidatorRequestEvaluationResponseEvaluationRun]
pass_threshold: Optional[float] = None # None for validators



@@ -74,6 +75,13 @@ class ValidatorUpdateEvaluationRunRequest(BaseModel):
class ValidatorUpdateEvaluationRunResponse(BaseModel):
pass

# Models for the new endpoint that marks a run as skipped when a screener (validator) cancels
class ValidatorSkipEvaluationRunRequest(BaseModel):
evaluation_run_id: UUID

class ValidatorSkipEvaluationRunResponse(BaseModel):
pass



class ValidatorDisconnectRequest(BaseModel):
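Editor's note: with pass_threshold now surfaced to screeners (and left as None for validators), a screener can stop early once the agent can no longer reach the threshold. A rough sketch of that check, assuming the threshold is compared against the final fraction of solved runs (mirroring the score column of evaluations_hydrated); the function name and the >= comparison are assumptions, not part of the PR:

```python
from typing import Optional

def can_still_reach_threshold(
    solved: int,
    remaining: int,
    total: int,
    pass_threshold: Optional[float],
) -> bool:
    """Return False once even a perfect finish cannot reach pass_threshold."""
    if pass_threshold is None:
        # Validators receive no threshold and always run everything.
        return True
    best_possible_score = (solved + remaining) / total
    return best_possible_score >= pass_threshold
```

When this returns False, the screener would skip the outstanding runs via /skip-evaluation-run and then call /finish-evaluation.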
26 changes: 23 additions & 3 deletions api/src/backend/postgres_schema.sql
@@ -27,7 +27,8 @@ BEGIN
'initializing_eval',
'running_eval',
'finished',
'error'
'error',
'skipped'
);
END IF;

@@ -224,12 +225,20 @@ FROM evaluation_runs;

-- Second view: Evaluations hydrated view
-- Evaluations with aggregated status and average score
--
-- Status logic:
-- 1. 'failure' — Syntax penalty: screener evaluation where any run hit AGENT_INVALID_PATCH (1040). Agent is penalized.
-- 2. 'success' — Clean completion: every run finished, was skipped, or errored with an agent-level error (1000-1999).
-- The evaluation infra worked; score the agent normally.
-- 3. 'failure' — Infra failure: all runs are done but at least one had a non-agent error (2000+). Re-queue.
-- 4. 'running' — Some runs are still in progress.
CREATE OR REPLACE VIEW evaluations_hydrated AS
SELECT
evaluations.*,
(CASE
WHEN EVERY(erh.status = 'finished' OR (erh.status = 'error' AND erh.error_code BETWEEN 1000 AND 1999)) THEN 'success'
WHEN EVERY(erh.status IN ('finished', 'error')) THEN 'failure'
WHEN evaluations.evaluation_set_group IN ('screener_1', 'screener_2') AND bool_or(erh.error_code = 1040) THEN 'failure'
WHEN EVERY(erh.status IN ('finished', 'skipped') OR (erh.status = 'error' AND erh.error_code BETWEEN 1000 AND 1999)) THEN 'success'
WHEN EVERY(erh.status IN ('finished', 'error', 'skipped')) THEN 'failure'
ELSE 'running'
END)::EvaluationStatus AS status,
COUNT(*) FILTER (WHERE erh.solved)::float / COUNT(*) AS score
@@ -531,3 +540,14 @@ CREATE TRIGGER tr_refresh_agent_scores_unapproved_agent_ids
AFTER INSERT OR UPDATE OR DELETE
ON unapproved_agent_ids FOR EACH ROW
EXECUTE PROCEDURE refresh_agent_scores();

DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_enum
WHERE enumlabel = 'skipped'
AND enumtypid = (SELECT oid FROM pg_type WHERE typname = 'evaluationrunstatus')
) THEN
ALTER TYPE EvaluationRunStatus ADD VALUE 'skipped';
END IF;
END $$;
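Editor's note: for readers who find the CASE expression dense, the status rules from the view comment above can be restated as a standalone Python function. This is illustrative only, not part of the PR; the run fields simply mirror the erh columns used by the view.

```python
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Run:
    status: str                      # 'finished' | 'error' | 'skipped' | ...
    error_code: Optional[int] = None

def derive_evaluation_status(runs: List[Run], evaluation_set_group: str) -> str:
    is_screener = evaluation_set_group in ("screener_1", "screener_2")
    # 1. Syntax penalty: any AGENT_INVALID_PATCH (1040) on a screener evaluation.
    if is_screener and any(r.error_code == 1040 for r in runs):
        return "failure"
    # 2. Clean completion: finished/skipped, or agent-level errors (1000-1999).
    if all(
        r.status in ("finished", "skipped")
        or (r.status == "error" and r.error_code is not None and 1000 <= r.error_code <= 1999)
        for r in runs
    ):
        return "success"
    # 3. Infra failure: every run is terminal but some error was non-agent (2000+).
    if all(r.status in ("finished", "error", "skipped") for r in runs):
        return "failure"
    # 4. Otherwise at least one run is still in progress.
    return "running"
```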
10 changes: 8 additions & 2 deletions evaluator/problem_suites/polyglot/polyglot_suite.py
@@ -18,7 +18,7 @@
from utils.git import init_local_repo_with_initial_commit
from evaluator.sandbox.sandbox_manager import SandboxManager
from evaluator.problem_suites.problem_suite import ProblemSuite, ProblemSuiteName
from utils.diff import get_file_diff, apply_diff_to_local_repo, validate_diff_for_local_repo
from utils.diff import get_file_diff, apply_diff_to_local_repo, validate_diff_for_local_repo, validate_patched_files_syntax



@@ -147,7 +147,13 @@ def _on_mount(temp_dir: str):
# Apply the patch
apply_diff_to_local_repo(patch, sandbox_repo_dir)


# Syntax-check the patched files
is_valid, error_message = validate_patched_files_syntax(sandbox_repo_dir)
if not is_valid:
raise EvaluationRunException(
EvaluationRunErrorCode.AGENT_INVALID_PATCH,
f"{EvaluationRunErrorCode.AGENT_INVALID_PATCH.get_error_message()}: {error_message}"
)

return sandbox_manager.initialize_sandbox(
name=f"eval-sandbox-{problem.name}-{evaluation_run_id}",
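Editor's note: validate_patched_files_syntax itself lives in utils/diff and is not shown in this diff; the SWE-bench suite hunk below applies the same check after applying the patch to a temporary checkout. As a rough mental model only (the real helper presumably covers more than Python, given the polyglot suite), a sketch restricted to .py files might look like this; the name and scope here are assumptions:

```python
import ast
import os
from typing import Optional, Tuple

def validate_patched_files_syntax_sketch(repo_dir: str) -> Tuple[bool, Optional[str]]:
    """Return (True, None) if every .py file under repo_dir parses, else (False, message)."""
    for root, _dirs, files in os.walk(repo_dir):
        if ".git" in root.split(os.sep):
            continue  # skip git internals
        for name in files:
            if not name.endswith(".py"):
                continue
            path = os.path.join(root, name)
            try:
                with open(path, "r", encoding="utf-8", errors="replace") as f:
                    ast.parse(f.read(), filename=path)
            except SyntaxError as e:
                return False, f"{path}: {e.msg} (line {e.lineno})"
    return True, None
```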
@@ -11,7 +11,7 @@
from pydantic import BaseModel
from utils.docker import get_docker_client
from typing import Any, Dict, List, Tuple, Optional
from utils.diff import validate_diff_for_local_repo
from utils.diff import validate_diff_for_local_repo, apply_diff_to_local_repo, validate_patched_files_syntax
from evaluator.models import EvaluationRunException
from swebench.harness.constants import SWEbenchInstance
from utils.temp import create_temp_dir, delete_temp_dir
@@ -186,7 +186,14 @@ def initialize_eval_sandbox(
f"{EvaluationRunErrorCode.AGENT_INVALID_PATCH.get_error_message()}: {error_message}"
)


# Syntax-check the patched files
apply_diff_to_local_repo(patch, temp_dir)
is_valid, error_message = validate_patched_files_syntax(temp_dir)
if not is_valid:
raise EvaluationRunException(
EvaluationRunErrorCode.AGENT_INVALID_PATCH,
f"{EvaluationRunErrorCode.AGENT_INVALID_PATCH.get_error_message()}: {error_message}"
)

swebench_instance = problem.userdata

14 changes: 14 additions & 0 deletions evaluator/sandbox/sandbox_manager.py
@@ -160,6 +160,20 @@ def initialize_sandbox(



def cleanup_sandbox(self, sandbox: Sandbox):
"""Clean up a sandbox's container and temp directory."""
try:
sandbox.container.stop()
sandbox.container.remove()
except Exception:
# Best-effort: the container may already be stopped or removed.
pass
try:
delete_temp_dir(sandbox.temp_dir)
except Exception:
# Best-effort: ignore temp-dir cleanup failures.
pass



def run_sandbox(
self,
sandbox: Sandbox
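Editor's note: because cleanup_sandbox deliberately swallows errors, it is safe to call unconditionally from a finally block in whatever orchestration code drives a run. That call site is not part of this diff, so the sketch below is only an assumed usage pattern:

```python
def run_and_always_cleanup(sandbox_manager, sandbox):
    """Assumed call-site pattern: always release the container and temp dir."""
    try:
        return sandbox_manager.run_sandbox(sandbox)
    finally:
        # Best-effort teardown, even when the run errors out or is skipped.
        sandbox_manager.cleanup_sandbox(sandbox)
```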
1 change: 1 addition & 0 deletions models/evaluation_run.py
@@ -60,6 +60,7 @@ class EvaluationRunStatus(str, Enum):
running_eval = 'running_eval'
finished = 'finished'
error = 'error'
skipped = 'skipped'


