Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 51 additions & 5 deletions src/durable-objects/task-processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import { markdownToTelegramHtml } from '../utils/telegram-format';
import { extractLearning, storeLearning, storeLastTaskSummary, storeSessionSummary, type SessionSummary } from '../openrouter/learnings';
import { extractFilePaths, extractGitHubContext } from '../utils/file-path-extractor';
import { UserStorage } from '../openrouter/storage';
import { parseOrchestraResult, storeOrchestraTask, type OrchestraTask } from '../orchestra/orchestra';
import { parseOrchestraResult, validateOrchestraResult, storeOrchestraTask, type OrchestraTask } from '../orchestra/orchestra';
import { createAcontextClient, toOpenAIMessages } from '../acontext/client';
import { estimateTokens, compressContextBudgeted, sanitizeToolPairs } from './context-budget';
import { checkPhaseBudget, PhaseBudgetExceededError } from './phase-budget';
Expand Down Expand Up @@ -2178,8 +2178,16 @@ export class TaskProcessor extends DurableObject<TaskProcessorEnv> {
// Orchestra result tracking: if the response contains ORCHESTRA_RESULT, update history
if (this.r2 && task.result) {
try {
const orchestraResult = parseOrchestraResult(task.result);
if (orchestraResult) {
const rawOrchestraResult = parseOrchestraResult(task.result);
if (rawOrchestraResult) {
// Fix 3: Cross-reference tool results — detect phantom PRs where model
// claims success but github_create_pr actually failed
const fullTaskOutput = conversationMessages
.filter(m => m.role === 'tool')
.map(m => typeof m.content === 'string' ? m.content : '')
.join('\n');
const orchestraResult = validateOrchestraResult(rawOrchestraResult, fullTaskOutput);

// Find the orchestra task entry to update (or create a new completed entry)
const systemMsg = request.messages.find(m => m.role === 'system');
const systemContent = typeof systemMsg?.content === 'string' ? systemMsg.content : '';
Expand Down Expand Up @@ -2207,7 +2215,10 @@ export class TaskProcessor extends DurableObject<TaskProcessorEnv> {
let taskSummary = orchestraResult.summary || '';
let failureReason = '';

if (!hasValidPr) {
if (orchestraResult.phantomPr) {
taskStatus = 'failed';
failureReason = 'Phantom PR — model claimed PR but github_create_pr failed';
} else if (!hasValidPr) {
taskStatus = 'failed';
failureReason = 'No PR created';
} else if (hasIncompleteRefactor) {
Expand All @@ -2232,6 +2243,41 @@ export class TaskProcessor extends DurableObject<TaskProcessorEnv> {
taskSummary = `FAILED: ${failureReason}. ${orchestraResult.summary || ''}`.trim();
}

// Fix 1: Post-execution PR verification — if we still have a claimed PR URL,
// verify it actually exists via GitHub API (catches edge cases Fix 3 might miss)
let verifiedPrUrl = orchestraResult.prUrl;
if (taskStatus === 'completed' && orchestraResult.prUrl && request.githubToken) {
try {
// Extract PR number from URL: https://github.com/owner/repo/pull/123
const prMatch = orchestraResult.prUrl.match(/github\.com\/([^/]+\/[^/]+)\/pull\/(\d+)/);
if (prMatch) {
const [, prRepo, prNumber] = prMatch;
const prCheckResponse = await fetch(
`https://api.github.com/repos/${prRepo}/pulls/${prNumber}`,
{
headers: {
'User-Agent': 'MoltworkerBot/1.0',
'Authorization': `Bearer ${request.githubToken}`,
'Accept': 'application/vnd.github.v3+json',
},
},
);
if (!prCheckResponse.ok) {
console.log(`[TaskProcessor] PR verification FAILED: ${orchestraResult.prUrl} → ${prCheckResponse.status}`);
taskStatus = 'failed';
failureReason = `Phantom PR — claimed ${orchestraResult.prUrl} but GitHub returned ${prCheckResponse.status}`;
taskSummary = `FAILED: ${failureReason}. ${orchestraResult.summary || ''}`.trim();
verifiedPrUrl = '';
} else {
console.log(`[TaskProcessor] PR verification OK: ${orchestraResult.prUrl}`);
}
}
} catch (verifyErr) {
// Non-fatal — if we can't verify, keep the claimed URL
console.log(`[TaskProcessor] PR verification error (non-fatal): ${verifyErr}`);
}
}

const completedTask: OrchestraTask = {
taskId: task.taskId,
timestamp: Date.now(),
Expand All @@ -2240,7 +2286,7 @@ export class TaskProcessor extends DurableObject<TaskProcessorEnv> {
mode: orchestraMode,
prompt: prompt.substring(0, 200),
branchName: orchestraResult.branch,
prUrl: orchestraResult.prUrl,
prUrl: verifiedPrUrl,
status: taskStatus,
filesChanged: orchestraResult.files,
summary: taskSummary,
Expand Down
7 changes: 6 additions & 1 deletion src/openrouter/tools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -554,10 +554,15 @@ export async function executeTool(toolCall: ToolCall, context?: ToolContext): Pr
content: result,
};
} catch (error) {
const errMsg = error instanceof Error ? error.message : String(error);
// Make github_create_pr failures unmistakable so models can't hallucinate success
const prefix = name === 'github_create_pr'
? `❌ PR NOT CREATED — github_create_pr FAILED.\n\nDo NOT claim a PR was created. The PR does not exist.\n\nError: `
: `Error executing ${name}: `;
return {
tool_call_id: toolCall.id,
role: 'tool',
content: `Error executing ${name}: ${error instanceof Error ? error.message : String(error)}`,
content: prefix + errMsg,
};
}
}
Expand Down
70 changes: 70 additions & 0 deletions src/orchestra/orchestra.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import {
buildOrchestraPrompt,
parseOrchestraCommand,
parseOrchestraResult,
validateOrchestraResult,
generateTaskSlug,
loadOrchestraHistory,
storeOrchestraTask,
Expand Down Expand Up @@ -1224,3 +1225,72 @@ describe('partial failure handling in prompts', () => {
expect(prompt).toContain('partial');
});
});

// --- validateOrchestraResult ---

describe('validateOrchestraResult', () => {
  // A parsed result in which the model claims a PR URL. Each case pairs it with
  // a different tool-output string to steer the validator.
  const baseResult = {
    branch: 'bot/add-feature-grok',
    prUrl: 'https://github.com/owner/repo/pull/42',
    files: ['src/feature.ts'],
    summary: 'Added feature',
  };

  // Tool output without any failure marker: the claim is left untouched.
  it('passes through valid result when no failure evidence', () => {
    const validated = validateOrchestraResult(baseResult, 'github_read_file returned content...');
    expect(validated.prUrl).toBe('https://github.com/owner/repo/pull/42');
    expect(validated.phantomPr).toBe(false);
  });

  // Failure text in the "PR NOT CREATED" format.
  it('detects phantom PR when tool output shows PR NOT CREATED', () => {
    const toolOutput = '❌ PR NOT CREATED — github_create_pr FAILED.\n\nError: Destructive update blocked';
    const validated = validateOrchestraResult(baseResult, toolOutput);
    expect(validated.prUrl).toBe('');
    expect(validated.phantomPr).toBe(true);
    expect(validated.summary).toContain('PHANTOM PR');
  });

  // Failure text in the "Error executing <tool>" format.
  it('detects phantom PR when tool output shows Destructive update blocked', () => {
    const toolOutput = 'Error executing github_create_pr: Destructive update blocked for "src/App.jsx"';
    const validated = validateOrchestraResult(baseResult, toolOutput);
    expect(validated.prUrl).toBe('');
    expect(validated.phantomPr).toBe(true);
  });

  it('detects phantom PR when INCOMPLETE REFACTOR in tool output', () => {
    const toolOutput = 'INCOMPLETE REFACTOR blocked: 3 new code files created but no existing code files updated.';
    const validated = validateOrchestraResult(baseResult, toolOutput);
    expect(validated.prUrl).toBe('');
    expect(validated.phantomPr).toBe(true);
  });

  it('detects phantom PR when DATA FABRICATION in tool output', () => {
    const toolOutput = 'DATA FABRICATION blocked for "src/App.jsx": only 3/20 original data values survive';
    const validated = validateOrchestraResult(baseResult, toolOutput);
    expect(validated.prUrl).toBe('');
    expect(validated.phantomPr).toBe(true);
  });

  // A failed attempt followed by a confirmed success must not be flagged.
  it('does NOT flag phantom PR when failure exists but success also confirmed', () => {
    const toolOutput = [
      '❌ PR NOT CREATED — github_create_pr FAILED.\n\nError: 422 branch already exists',
      '✅ Pull Request created successfully!\n\nPR: https://github.com/owner/repo/pull/42',
    ].join('\n');
    const validated = validateOrchestraResult(baseResult, toolOutput);
    expect(validated.prUrl).toBe('https://github.com/owner/repo/pull/42');
    expect(validated.phantomPr).toBe(false);
  });

  // With no claimed URL there is nothing to contradict.
  it('passes through when no PR URL claimed', () => {
    const noPrResult = { ...baseResult, prUrl: '' };
    const validated = validateOrchestraResult(noPrResult, 'some tool output');
    expect(validated.phantomPr).toBe(false);
  });

  it('preserves branch and files when detecting phantom PR', () => {
    const toolOutput = 'Full-rewrite blocked for "src/App.jsx"';
    const validated = validateOrchestraResult(baseResult, toolOutput);
    // Confirm detection really happened first; otherwise the preservation
    // checks below would also pass on the plain pass-through path.
    expect(validated.phantomPr).toBe(true);
    expect(validated.prUrl).toBe('');
    expect(validated.branch).toBe('bot/add-feature-grok');
    expect(validated.files).toEqual(['src/feature.ts']);
  });
});
54 changes: 54 additions & 0 deletions src/orchestra/orchestra.ts
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,60 @@ export function parseOrchestraResult(response: string): {
return { branch, prUrl: validPrUrl, files, summary };
}

/**
 * Check a parsed orchestra result against the tool output and flag "phantom PRs".
 *
 * A phantom PR is a claimed PR URL where the scanned output contains at least one
 * failure marker and no marker confirming that a pull request was created.
 * In that case the URL is blanked and the summary is prefixed with a warning.
 * A success marker anywhere in the output outweighs any failure marker.
 *
 * @param result - Parsed orchestra result (as returned by parseOrchestraResult)
 * @param fullOutput - Text scanned for failure and success markers (tool results)
 * @returns A copy of `result` with a `phantomPr` flag; `prUrl` is `''` when flagged
 */
export function validateOrchestraResult(
  result: { branch: string; prUrl: string; files: string[]; summary: string },
  fullOutput: string,
): { branch: string; prUrl: string; files: string[]; summary: string; phantomPr: boolean } {
  const claimedUrl = result.prUrl;

  // No URL was claimed, so there is no claim to contradict.
  if (!claimedUrl) {
    return { ...result, phantomPr: false };
  }

  // Markers of a successful PR creation: the generic success message, or the
  // claimed URL echoed back in "PR: <url>" or compact-JSON "html_url" form.
  const successMarkers = [
    'Pull Request created successfully',
    `PR: ${claimedUrl}`,
    `"html_url":"${claimedUrl}"`,
  ];
  for (const marker of successMarkers) {
    if (fullOutput.includes(marker)) {
      return { ...result, phantomPr: false };
    }
  }

  // Markers treated as evidence that github_create_pr failed or was blocked.
  const failureMarkers = [
    'PR NOT CREATED',
    'github_create_pr FAILED',
    'Destructive update blocked',
    'Full-rewrite blocked',
    'INCOMPLETE REFACTOR blocked',
    'DATA FABRICATION blocked',
    'NET DELETION blocked',
    'AUDIT TRAIL VIOLATION',
    'ROADMAP TAMPERING blocked',
    'FALSE COMPLETION blocked',
    'Error executing github_create_pr',
  ];
  let sawFailure = false;
  for (const marker of failureMarkers) {
    if (fullOutput.includes(marker)) {
      sawFailure = true;
      break;
    }
  }

  // Neither success nor failure evidence: give the claim the benefit of the doubt.
  if (!sawFailure) {
    return { ...result, phantomPr: false };
  }

  // Failure evidence without any success evidence: the claimed PR is a phantom.
  console.log(`[orchestra] Phantom PR detected: model claimed ${result.prUrl} but tool results show failure`);
  return {
    ...result,
    prUrl: '',
    summary: `⚠️ PHANTOM PR: Model claimed PR but github_create_pr failed. ${result.summary}`,
    phantomPr: true,
  };
}

// ============================================================
// Helpers
// ============================================================
Expand Down
Loading