From 57e719e5b63c626a65aa501b67ad985931d07e6a Mon Sep 17 00:00:00 2001
From: Claude
Date: Wed, 25 Feb 2026 12:03:06 +0000
Subject: [PATCH] =?UTF-8?q?fix(orchestra):=20detect=20phantom=20PRs=20?=
 =?UTF-8?q?=E2=80=94=203-layer=20defense=20against=20hallucinated=20PR=20c?=
 =?UTF-8?q?laims?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Models (especially Grok) can claim "PR #3 created successfully" when
github_create_pr actually failed with guardrail violations. This adds
three layers of protection:

Fix 2: Tag github_create_pr errors with unmistakable ❌ PR NOT CREATED
banner + "Do NOT claim a PR was created" instruction in tool result.

Fix 3: validateOrchestraResult() cross-references parsed ORCHESTRA_RESULT
against all tool outputs — if failure patterns found (Destructive update
blocked, INCOMPLETE REFACTOR, DATA FABRICATION, etc.) with no matching
success evidence, flags as phantom PR and clears the URL.

Fix 1: Post-execution PR verification via GitHub API — after all parsing,
if a PR URL survives, verify it actually exists (GET /repos/.../pulls/N).
Non-fatal on network errors, but catches any edge case the other layers
miss.

https://claude.ai/code/session_01K2mQTABDGY7DnnposPdDjw --- src/durable-objects/task-processor.ts | 56 +++++++++++++++++++-- src/openrouter/tools.ts | 7 ++- src/orchestra/orchestra.test.ts | 70 +++++++++++++++++++++++++++ src/orchestra/orchestra.ts | 54 +++++++++++++++++++++ 4 files changed, 181 insertions(+), 6 deletions(-) diff --git a/src/durable-objects/task-processor.ts b/src/durable-objects/task-processor.ts index 3483f6e08..fa597eec3 100644 --- a/src/durable-objects/task-processor.ts +++ b/src/durable-objects/task-processor.ts @@ -14,7 +14,7 @@ import { markdownToTelegramHtml } from '../utils/telegram-format'; import { extractLearning, storeLearning, storeLastTaskSummary, storeSessionSummary, type SessionSummary } from '../openrouter/learnings'; import { extractFilePaths, extractGitHubContext } from '../utils/file-path-extractor'; import { UserStorage } from '../openrouter/storage'; -import { parseOrchestraResult, storeOrchestraTask, type OrchestraTask } from '../orchestra/orchestra'; +import { parseOrchestraResult, validateOrchestraResult, storeOrchestraTask, type OrchestraTask } from '../orchestra/orchestra'; import { createAcontextClient, toOpenAIMessages } from '../acontext/client'; import { estimateTokens, compressContextBudgeted, sanitizeToolPairs } from './context-budget'; import { checkPhaseBudget, PhaseBudgetExceededError } from './phase-budget'; @@ -2178,8 +2178,16 @@ export class TaskProcessor extends DurableObject { // Orchestra result tracking: if the response contains ORCHESTRA_RESULT, update history if (this.r2 && task.result) { try { - const orchestraResult = parseOrchestraResult(task.result); - if (orchestraResult) { + const rawOrchestraResult = parseOrchestraResult(task.result); + if (rawOrchestraResult) { + // Fix 3: Cross-reference tool results — detect phantom PRs where model + // claims success but github_create_pr actually failed + const fullTaskOutput = conversationMessages + .filter(m => m.role === 'tool') + .map(m => typeof 
m.content === 'string' ? m.content : '') + .join('\n'); + const orchestraResult = validateOrchestraResult(rawOrchestraResult, fullTaskOutput); + // Find the orchestra task entry to update (or create a new completed entry) const systemMsg = request.messages.find(m => m.role === 'system'); const systemContent = typeof systemMsg?.content === 'string' ? systemMsg.content : ''; @@ -2207,7 +2215,10 @@ export class TaskProcessor extends DurableObject { let taskSummary = orchestraResult.summary || ''; let failureReason = ''; - if (!hasValidPr) { + if (orchestraResult.phantomPr) { + taskStatus = 'failed'; + failureReason = 'Phantom PR — model claimed PR but github_create_pr failed'; + } else if (!hasValidPr) { taskStatus = 'failed'; failureReason = 'No PR created'; } else if (hasIncompleteRefactor) { @@ -2232,6 +2243,41 @@ export class TaskProcessor extends DurableObject { taskSummary = `FAILED: ${failureReason}. ${orchestraResult.summary || ''}`.trim(); } + // Fix 1: Post-execution PR verification — if we still have a claimed PR URL, + // verify it actually exists via GitHub API (catches edge cases Fix 3 might miss) + let verifiedPrUrl = orchestraResult.prUrl; + if (taskStatus === 'completed' && orchestraResult.prUrl && request.githubToken) { + try { + // Extract PR number from URL: https://github.com/owner/repo/pull/123 + const prMatch = orchestraResult.prUrl.match(/github\.com\/([^/]+\/[^/]+)\/pull\/(\d+)/); + if (prMatch) { + const [, prRepo, prNumber] = prMatch; + const prCheckResponse = await fetch( + `https://api.github.com/repos/${prRepo}/pulls/${prNumber}`, + { + headers: { + 'User-Agent': 'MoltworkerBot/1.0', + 'Authorization': `Bearer ${request.githubToken}`, + 'Accept': 'application/vnd.github.v3+json', + }, + }, + ); + if (!prCheckResponse.ok) { + console.log(`[TaskProcessor] PR verification FAILED: ${orchestraResult.prUrl} → ${prCheckResponse.status}`); + taskStatus = 'failed'; + failureReason = `Phantom PR — claimed ${orchestraResult.prUrl} but GitHub 
returned ${prCheckResponse.status}`; + taskSummary = `FAILED: ${failureReason}. ${orchestraResult.summary || ''}`.trim(); + verifiedPrUrl = ''; + } else { + console.log(`[TaskProcessor] PR verification OK: ${orchestraResult.prUrl}`); + } + } + } catch (verifyErr) { + // Non-fatal — if we can't verify, keep the claimed URL + console.log(`[TaskProcessor] PR verification error (non-fatal): ${verifyErr}`); + } + } + const completedTask: OrchestraTask = { taskId: task.taskId, timestamp: Date.now(), @@ -2240,7 +2286,7 @@ export class TaskProcessor extends DurableObject { mode: orchestraMode, prompt: prompt.substring(0, 200), branchName: orchestraResult.branch, - prUrl: orchestraResult.prUrl, + prUrl: verifiedPrUrl, status: taskStatus, filesChanged: orchestraResult.files, summary: taskSummary, diff --git a/src/openrouter/tools.ts b/src/openrouter/tools.ts index 27a3dbbe7..96149dcb4 100644 --- a/src/openrouter/tools.ts +++ b/src/openrouter/tools.ts @@ -554,10 +554,15 @@ export async function executeTool(toolCall: ToolCall, context?: ToolContext): Pr content: result, }; } catch (error) { + const errMsg = error instanceof Error ? error.message : String(error); + // Make github_create_pr failures unmistakable so models can't hallucinate success + const prefix = name === 'github_create_pr' + ? `❌ PR NOT CREATED — github_create_pr FAILED.\n\nDo NOT claim a PR was created. The PR does not exist.\n\nError: ` + : `Error executing ${name}: `; return { tool_call_id: toolCall.id, role: 'tool', - content: `Error executing ${name}: ${error instanceof Error ? 
error.message : String(error)}`, + content: prefix + errMsg, }; } } diff --git a/src/orchestra/orchestra.test.ts b/src/orchestra/orchestra.test.ts index 0ea0be845..6d60e2405 100644 --- a/src/orchestra/orchestra.test.ts +++ b/src/orchestra/orchestra.test.ts @@ -10,6 +10,7 @@ import { buildOrchestraPrompt, parseOrchestraCommand, parseOrchestraResult, + validateOrchestraResult, generateTaskSlug, loadOrchestraHistory, storeOrchestraTask, @@ -1224,3 +1225,72 @@ describe('partial failure handling in prompts', () => { expect(prompt).toContain('partial'); }); }); + +// --- validateOrchestraResult --- + +describe('validateOrchestraResult', () => { + const baseResult = { + branch: 'bot/add-feature-grok', + prUrl: 'https://github.com/owner/repo/pull/42', + files: ['src/feature.ts'], + summary: 'Added feature', + }; + + it('passes through valid result when no failure evidence', () => { + const validated = validateOrchestraResult(baseResult, 'github_read_file returned content...'); + expect(validated.prUrl).toBe('https://github.com/owner/repo/pull/42'); + expect(validated.phantomPr).toBe(false); + }); + + it('detects phantom PR when tool output shows PR NOT CREATED', () => { + const toolOutput = '❌ PR NOT CREATED — github_create_pr FAILED.\n\nError: Destructive update blocked'; + const validated = validateOrchestraResult(baseResult, toolOutput); + expect(validated.prUrl).toBe(''); + expect(validated.phantomPr).toBe(true); + expect(validated.summary).toContain('PHANTOM PR'); + }); + + it('detects phantom PR when tool output shows Destructive update blocked', () => { + const toolOutput = 'Error executing github_create_pr: Destructive update blocked for "src/App.jsx"'; + const validated = validateOrchestraResult(baseResult, toolOutput); + expect(validated.prUrl).toBe(''); + expect(validated.phantomPr).toBe(true); + }); + + it('detects phantom PR when INCOMPLETE REFACTOR in tool output', () => { + const toolOutput = 'INCOMPLETE REFACTOR blocked: 3 new code files created but no 
existing code files updated.'; + const validated = validateOrchestraResult(baseResult, toolOutput); + expect(validated.prUrl).toBe(''); + expect(validated.phantomPr).toBe(true); + }); + + it('detects phantom PR when DATA FABRICATION in tool output', () => { + const toolOutput = 'DATA FABRICATION blocked for "src/App.jsx": only 3/20 original data values survive'; + const validated = validateOrchestraResult(baseResult, toolOutput); + expect(validated.prUrl).toBe(''); + expect(validated.phantomPr).toBe(true); + }); + + it('does NOT flag phantom PR when failure exists but success also confirmed', () => { + const toolOutput = [ + '❌ PR NOT CREATED — github_create_pr FAILED.\n\nError: 422 branch already exists', + '✅ Pull Request created successfully!\n\nPR: https://github.com/owner/repo/pull/42', + ].join('\n'); + const validated = validateOrchestraResult(baseResult, toolOutput); + expect(validated.prUrl).toBe('https://github.com/owner/repo/pull/42'); + expect(validated.phantomPr).toBe(false); + }); + + it('passes through when no PR URL claimed', () => { + const noPrResult = { ...baseResult, prUrl: '' }; + const validated = validateOrchestraResult(noPrResult, 'some tool output'); + expect(validated.phantomPr).toBe(false); + }); + + it('preserves branch and files when detecting phantom PR', () => { + const toolOutput = 'Full-rewrite blocked for "src/App.jsx"'; + const validated = validateOrchestraResult(baseResult, toolOutput); + expect(validated.branch).toBe('bot/add-feature-grok'); + expect(validated.files).toEqual(['src/feature.ts']); + }); +}); diff --git a/src/orchestra/orchestra.ts b/src/orchestra/orchestra.ts index 5828576cb..24f698b14 100644 --- a/src/orchestra/orchestra.ts +++ b/src/orchestra/orchestra.ts @@ -442,6 +442,60 @@ export function parseOrchestraResult(response: string): { return { branch, prUrl: validPrUrl, files, summary }; } +/** + * Cross-reference a parsed orchestra result against tool output evidence. 
+ * Detects phantom PRs: model claims a PR URL but tool results show failures. + * + * @param result - Parsed orchestra result (from parseOrchestraResult) + * @param fullOutput - The full task output including tool results + * @returns Validated result with prUrl cleared if evidence contradicts the claim + */ +export function validateOrchestraResult( + result: { branch: string; prUrl: string; files: string[]; summary: string }, + fullOutput: string, +): { branch: string; prUrl: string; files: string[]; summary: string; phantomPr: boolean } { + if (!result.prUrl) { + return { ...result, phantomPr: false }; + } + + // Evidence of github_create_pr failure in tool results + const prFailurePatterns = [ + 'PR NOT CREATED', + 'github_create_pr FAILED', + 'Destructive update blocked', + 'Full-rewrite blocked', + 'INCOMPLETE REFACTOR blocked', + 'DATA FABRICATION blocked', + 'NET DELETION blocked', + 'AUDIT TRAIL VIOLATION', + 'ROADMAP TAMPERING blocked', + 'FALSE COMPLETION blocked', + 'Error executing github_create_pr', + ]; + + const hasFailureEvidence = prFailurePatterns.some(pattern => fullOutput.includes(pattern)); + + // Evidence of actual PR creation success + // The tool returns "Pull Request created successfully!" + "PR: https://github.com/..." + const hasSuccessEvidence = + fullOutput.includes('Pull Request created successfully') || + fullOutput.includes(`PR: ${result.prUrl}`) || + fullOutput.includes(`"html_url":"${result.prUrl}"`); + + // If there's failure evidence AND no success evidence, this is a phantom PR + if (hasFailureEvidence && !hasSuccessEvidence) { + console.log(`[orchestra] Phantom PR detected: model claimed ${result.prUrl} but tool results show failure`); + return { + ...result, + prUrl: '', + summary: `⚠️ PHANTOM PR: Model claimed PR but github_create_pr failed. 
${result.summary}`, + phantomPr: true, + }; + } + + return { ...result, phantomPr: false }; +} + // ============================================================ // Helpers // ============================================================