Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/cli/src/commands/eval/junit-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ export class JunitWriter {
const errors = results.filter((r) => r.error !== undefined).length;

const testCases = results.map((r) => {
const time = r.trace?.durationMs ? (r.trace.durationMs / 1000).toFixed(3) : '0.000';
const time = r.durationMs ? (r.durationMs / 1000).toFixed(3) : '0.000';

let inner = '';
if (r.error) {
Expand Down
15 changes: 14 additions & 1 deletion apps/cli/src/commands/trace/score.ts
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,13 @@ async function runScore(
now: new Date(),
output: Array.isArray(output) ? output : undefined,
trace,
tokenUsage: raw.token_usage
? (toCamelCaseDeep(raw.token_usage) as EvaluationContext['tokenUsage'])
: undefined,
costUsd: raw.cost_usd,
durationMs: raw.duration_ms,
startTime: raw.start_time,
endTime: raw.end_time,
};

const score = await evaluator.evaluate(evalContext);
Expand Down Expand Up @@ -350,7 +357,13 @@ export const traceScoreCommand = command({
evaluatorConfig.type,
);
if (traceRequired) {
const hasTrace = results.some((r) => r.trace);
const hasTrace = results.some(
(r) =>
r.trace ||
r.cost_usd !== undefined ||
r.duration_ms !== undefined ||
r.token_usage !== undefined,
);
if (!hasTrace) {
console.error(
`${c.red}Error:${c.reset} Result file lacks trace data. Re-run eval with ${c.bold}--trace${c.reset} to capture trace summaries.`,
Expand Down
36 changes: 18 additions & 18 deletions apps/cli/src/commands/trace/show.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import { command, flag, oneOf, option, optional, positional, string } from 'cmd-ts';
import {
type RawResult,
type RawTraceSummary,
c,
formatCost,
formatDuration,
Expand All @@ -13,31 +12,32 @@ import {
/**
* Render a flat trace summary line (fallback when full output messages are not available).
*/
function renderFlatTrace(trace: RawTraceSummary): string {
function renderFlatTrace(result: RawResult): string {
const trace = result.trace;
const parts: string[] = [];

if (trace.tool_names && trace.tool_names.length > 0) {
if (trace?.tool_names && trace.tool_names.length > 0) {
const toolParts = trace.tool_names.map((name) => {
const count = trace.tool_calls_by_name?.[name] ?? 0;
return count > 1 ? `${name} ×${count}` : name;
});
parts.push(`Tools: ${toolParts.join(', ')}`);
}

if (trace.duration_ms !== undefined) {
parts.push(`Duration: ${formatDuration(trace.duration_ms)}`);
if (result.duration_ms !== undefined) {
parts.push(`Duration: ${formatDuration(result.duration_ms)}`);
}

if (trace.token_usage) {
const total = trace.token_usage.input + trace.token_usage.output;
if (result.token_usage) {
const total = result.token_usage.input + result.token_usage.output;
parts.push(`Tokens: ${formatNumber(total)}`);
}

if (trace.cost_usd !== undefined) {
parts.push(`Cost: ${formatCost(trace.cost_usd)}`);
if (result.cost_usd !== undefined) {
parts.push(`Cost: ${formatCost(result.cost_usd)}`);
}

if (trace.llm_call_count !== undefined) {
if (trace?.llm_call_count !== undefined) {
parts.push(`LLM calls: ${trace.llm_call_count}`);
}

Expand Down Expand Up @@ -85,8 +85,8 @@ function renderTree(result: RawResult): string {

if (!messages || messages.length === 0) {
// Fallback to flat summary
if (result.trace) {
return renderFlatTrace(result.trace);
if (result.trace || result.duration_ms !== undefined || result.cost_usd !== undefined) {
return renderFlatTrace(result);
}
return `${c.dim}No trace data available${c.reset}`;
}
Expand All @@ -95,14 +95,14 @@ function renderTree(result: RawResult): string {
const testId = result.test_id ?? result.eval_id ?? 'unknown';

// Root node: test execution
const totalDuration = result.trace?.duration_ms;
const totalTokens = result.trace?.token_usage
? result.trace.token_usage.input + result.trace.token_usage.output
const totalDuration = result.duration_ms;
const totalTokens = result.token_usage
? result.token_usage.input + result.token_usage.output
: undefined;
const rootParts: string[] = [testId];
if (totalDuration !== undefined) rootParts.push(formatDuration(totalDuration));
if (totalTokens !== undefined) rootParts.push(`${formatNumber(totalTokens)} tok`);
if (result.trace?.cost_usd !== undefined) rootParts.push(formatCost(result.trace.cost_usd));
if (result.cost_usd !== undefined) rootParts.push(formatCost(result.cost_usd));
lines.push(`${c.bold}${rootParts.join(', ')}${c.reset}`);

// Filter to meaningful messages (assistant with tool calls, or assistant responses)
Expand Down Expand Up @@ -204,8 +204,8 @@ function formatResultDetail(result: RawResult, index: number, tree: boolean): st
lines.push(` ${c.dim}Scores:${c.reset} ${renderScores(result.scores)}`);
}

if (result.trace) {
lines.push(` ${c.dim}Trace:${c.reset} ${renderFlatTrace(result.trace)}`);
if (result.trace || result.duration_ms !== undefined || result.cost_usd !== undefined) {
lines.push(` ${c.dim}Trace:${c.reset} ${renderFlatTrace(result)}`);
}

if (result.reasoning) {
Expand Down
10 changes: 4 additions & 6 deletions apps/cli/src/commands/trace/stats.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,7 @@ function collectMetrics(results: RawResult[]): MetricRow[] {
}

// Latency
const latencies = results
.map((r) => r.trace?.duration_ms)
.filter((v): v is number => v !== undefined);
const latencies = results.map((r) => r.duration_ms).filter((v): v is number => v !== undefined);
if (latencies.length > 0) {
rows.push({
name: 'latency_s',
Expand All @@ -55,16 +53,16 @@ function collectMetrics(results: RawResult[]): MetricRow[] {
}

// Cost
const costs = results.map((r) => r.trace?.cost_usd).filter((v): v is number => v !== undefined);
const costs = results.map((r) => r.cost_usd).filter((v): v is number => v !== undefined);
if (costs.length > 0) {
rows.push({ name: 'cost_usd', values: costs, formatter: (n) => formatCost(n) });
}

// Total tokens
const tokens = results
.map((r) => {
if (!r.trace?.token_usage) return undefined;
return r.trace.token_usage.input + r.trace.token_usage.output;
if (!r.token_usage) return undefined;
return r.token_usage.input + r.token_usage.output;
})
.filter((v): v is number => v !== undefined);
if (tokens.length > 0) {
Expand Down
11 changes: 6 additions & 5 deletions apps/cli/src/commands/trace/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,12 @@ export interface RawResult {
error?: string;
scores?: RawEvaluatorScore[];
trace?: RawTraceSummary;
// Promoted execution metrics (snake_case from JSONL)
token_usage?: { input: number; output: number; cached?: number };
cost_usd?: number;
duration_ms?: number;
start_time?: string;
end_time?: string;
input?: unknown;
output?: unknown;
trials?: unknown[];
Expand All @@ -75,12 +81,7 @@ export interface RawTraceSummary {
tool_names?: string[];
tool_calls_by_name?: Record<string, number>;
error_count?: number;
token_usage?: { input: number; output: number; cached?: number };
cost_usd?: number;
duration_ms?: number;
tool_durations?: Record<string, number[]>;
start_time?: string;
end_time?: string;
llm_call_count?: number;
}

Expand Down
8 changes: 4 additions & 4 deletions apps/cli/test/commands/trace/trace.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ const RESULT_WITH_TRACE = JSON.stringify({
tool_names: ['read', 'write'],
tool_calls_by_name: { read: 3, write: 2 },
error_count: 0,
token_usage: { input: 1000, output: 500 },
cost_usd: 0.05,
duration_ms: 3200,
llm_call_count: 2,
},
token_usage: { input: 1000, output: 500 },
cost_usd: 0.05,
duration_ms: 3200,
});

const RESULT_WITHOUT_TRACE = JSON.stringify({
Expand Down Expand Up @@ -79,7 +79,7 @@ describe('trace utils', () => {
expect(results[0].score).toBe(1);
expect(results[0].trace).toBeDefined();
expect(results[0].trace?.event_count).toBe(5);
expect(results[0].trace?.cost_usd).toBe(0.05);
expect(results[0].cost_usd).toBe(0.05);

expect(results[1].test_id).toBe('test-2');
expect(results[1].score).toBe(0.75);
Expand Down
19 changes: 7 additions & 12 deletions apps/web/src/content/docs/evaluators/code-judges.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,12 @@ Beyond the basic `question`, `criteria`, `answer`, and `reference_answer` fields
| `input` | `Message[]` | Full resolved input message array |
| `expected_output` | `Message[]` | Expected agent behavior including tool calls |
| `output` | `Message[]` | Actual agent execution trace with tool calls |
| `trace` | `TraceSummary` | Lightweight execution metrics |
| `trace` | `TraceSummary` | Lightweight execution metrics (tool calls, errors) |
| `token_usage` | `{input, output}` | Token consumption |
| `cost_usd` | `number` | Estimated cost in USD |
| `duration_ms` | `number` | Total execution duration in milliseconds |
| `start_time` | `string` | ISO timestamp of first event |
| `end_time` | `string` | ISO timestamp of last event |
| `file_changes` | `string \| null` | Unified diff of workspace file changes (when `workspace_template` is configured) |
| `workspace_path` | `string \| null` | Absolute path to the workspace directory (when `workspace_template` is configured) |

Expand All @@ -201,12 +206,7 @@ Beyond the basic `question`, `criteria`, `answer`, and `reference_answer` fields
"tool_names": ["fetch", "search"],
"tool_calls_by_name": { "search": 2, "fetch": 1 },
"error_count": 0,
"llm_call_count": 2,
"token_usage": { "input": 1000, "output": 500 },
"cost_usd": 0.0015,
"duration_ms": 3500,
"start_time": "2026-02-13T10:00:00.000Z",
"end_time": "2026-02-13T10:00:03.500Z"
"llm_call_count": 2
}
```

Expand All @@ -217,11 +217,6 @@ Beyond the basic `question`, `criteria`, `answer`, and `reference_answer` fields
| `tool_calls_by_name` | `Record<string, number>` | Count per tool |
| `error_count` | `number` | Failed tool calls |
| `llm_call_count` | `number` | Number of LLM calls (assistant messages) |
| `token_usage` | `{input, output}` | Token consumption |
| `cost_usd` | `number` | Estimated cost |
| `duration_ms` | `number` | Total execution duration |
| `start_time` | `string` | ISO timestamp of first event |
| `end_time` | `string` | ISO timestamp of last event |

Use `expected_output` for retrieval context in RAG evals (tool calls with outputs) and `output` for the actual agent execution trace from live runs.

Expand Down
2 changes: 1 addition & 1 deletion apps/web/src/content/docs/tools/trace.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ All commands support `--format json` for piping to `jq`:
```bash
# Find tests costing more than $0.10
agentv trace show results.jsonl --format json \
| jq '[.[] | select(.trace.cost_usd > 0.10) | {test_id, score, cost: .trace.cost_usd}]'
| jq '[.[] | select(.cost_usd > 0.10) | {test_id, score, cost: .cost_usd}]'

# Compare providers
agentv trace stats results.jsonl --group-by target --format json \
Expand Down
22 changes: 11 additions & 11 deletions examples/features/execution-metrics/scripts/check-efficiency.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ const THRESHOLDS = {
maxDurationMs: 10000,
};

export default defineCodeJudge(({ trace }) => {
export default defineCodeJudge(({ trace, tokenUsage, costUsd, durationMs }) => {
const hits: string[] = [];
const misses: string[] = [];
const checks: boolean[] = [];
Expand All @@ -39,8 +39,8 @@ export default defineCodeJudge(({ trace }) => {
}

// Check token usage if available
if (trace.tokenUsage) {
const totalTokens = trace.tokenUsage.input + trace.tokenUsage.output;
if (tokenUsage) {
const totalTokens = tokenUsage.input + tokenUsage.output;
if (totalTokens <= THRESHOLDS.maxTokens) {
hits.push(`Token usage (${totalTokens}) within limit`);
checks.push(true);
Expand All @@ -51,23 +51,23 @@ export default defineCodeJudge(({ trace }) => {
}

// Check cost if available
if (trace.costUsd !== undefined) {
if (trace.costUsd <= THRESHOLDS.maxCostUsd) {
hits.push(`Cost ($${trace.costUsd.toFixed(4)}) within budget`);
if (costUsd !== undefined) {
if (costUsd <= THRESHOLDS.maxCostUsd) {
hits.push(`Cost ($${costUsd.toFixed(4)}) within budget`);
checks.push(true);
} else {
misses.push(`High cost: $${trace.costUsd.toFixed(4)} (max: $${THRESHOLDS.maxCostUsd})`);
misses.push(`High cost: $${costUsd.toFixed(4)} (max: $${THRESHOLDS.maxCostUsd})`);
checks.push(false);
}
}

// Check duration if available
if (trace.durationMs !== undefined) {
if (trace.durationMs <= THRESHOLDS.maxDurationMs) {
hits.push(`Duration (${trace.durationMs}ms) within limit`);
if (durationMs !== undefined) {
if (durationMs <= THRESHOLDS.maxDurationMs) {
hits.push(`Duration (${durationMs}ms) within limit`);
checks.push(true);
} else {
misses.push(`Slow execution: ${trace.durationMs}ms (max: ${THRESHOLDS.maxDurationMs}ms)`);
misses.push(`Slow execution: ${durationMs}ms (max: ${THRESHOLDS.maxDurationMs}ms)`);
checks.push(false);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
*/
import { defineCodeJudge } from '@agentv/eval';

export default defineCodeJudge(({ trace }) => {
export default defineCodeJudge(({ trace, tokenUsage, costUsd, durationMs }) => {
const hits: string[] = [];
const misses: string[] = [];

Expand All @@ -27,22 +27,22 @@ export default defineCodeJudge(({ trace }) => {
}

// Check for tokenUsage
if (trace.tokenUsage) {
hits.push(`tokenUsage present: ${trace.tokenUsage.input}/${trace.tokenUsage.output}`);
if (tokenUsage) {
hits.push(`tokenUsage present: ${tokenUsage.input}/${tokenUsage.output}`);
} else {
misses.push('tokenUsage not present');
}

// Check for costUsd
if (trace.costUsd !== undefined) {
hits.push(`costUsd present: $${trace.costUsd.toFixed(4)}`);
if (costUsd !== undefined) {
hits.push(`costUsd present: $${costUsd.toFixed(4)}`);
} else {
misses.push('costUsd not present');
}

// Check for durationMs
if (trace.durationMs !== undefined) {
hits.push(`durationMs present: ${trace.durationMs}ms`);
if (durationMs !== undefined) {
hits.push(`durationMs present: ${durationMs}ms`);
} else {
misses.push('durationMs not present');
}
Expand Down
2 changes: 1 addition & 1 deletion examples/features/trace-analysis/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ Pipe JSON output to `jq` for complex queries:
```bash
# Find tests that cost more than $0.10
bun agentv trace show evals/multi-agent.eval.results.jsonl --format json \
| jq '[.[] | select(.trace.cost_usd > 0.10) | {test_id, score, cost: .trace.cost_usd}]'
| jq '[.[] | select(.cost_usd > 0.10) | {test_id, score, cost: .cost_usd}]'

# Compare scores by target provider
bun agentv trace stats evals/multi-agent.eval.results.jsonl --group-by target --format json \
Expand Down
10 changes: 5 additions & 5 deletions examples/features/trace-evaluation/judges/span-duration.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import { defineCodeJudge } from '@agentv/eval';

const DEFAULT_MAX_SPAN_MS = 5000;

export default defineCodeJudge(({ trace, config }) => {
export default defineCodeJudge(({ trace, config, durationMs }) => {
if (!trace) {
return {
score: 0,
Expand All @@ -23,12 +23,12 @@ export default defineCodeJudge(({ trace, config }) => {
const misses: string[] = [];

// Check overall duration
if (trace.durationMs !== undefined) {
if (durationMs !== undefined) {
const maxTotalMs = (config?.maxTotalMs as number) ?? maxSpanMs * 5;
if (trace.durationMs <= maxTotalMs) {
hits.push(`Total duration (${trace.durationMs}ms) within limit (${maxTotalMs}ms)`);
if (durationMs <= maxTotalMs) {
hits.push(`Total duration (${durationMs}ms) within limit (${maxTotalMs}ms)`);
} else {
misses.push(`Total duration too long: ${trace.durationMs}ms (max: ${maxTotalMs}ms)`);
misses.push(`Total duration too long: ${durationMs}ms (max: ${maxTotalMs}ms)`);
}
}

Expand Down
Loading