EntityProcess · christso · Feb 27, 2026 · Feb 27, 2026 · Feb 27, 2026 · Feb 27, 2026
diff --git a/apps/cli/src/commands/eval/junit-writer.ts b/apps/cli/src/commands/eval/junit-writer.ts
@@ -56,7 +56,7 @@ export class JunitWriter {
       const errors = results.filter((r) => r.error !== undefined).length;
 
       const testCases = results.map((r) => {
-        const time = r.trace?.durationMs ? (r.trace.durationMs / 1000).toFixed(3) : '0.000';
+        const time = r.durationMs ? (r.durationMs / 1000).toFixed(3) : '0.000';
 
         let inner = '';
         if (r.error) {

diff --git a/apps/cli/src/commands/trace/score.ts b/apps/cli/src/commands/trace/score.ts
@@ -224,6 +224,13 @@ async function runScore(
       now: new Date(),
       output: Array.isArray(output) ? output : undefined,
       trace,
+      tokenUsage: raw.token_usage
+        ? (toCamelCaseDeep(raw.token_usage) as EvaluationContext['tokenUsage'])
+        : undefined,
+      costUsd: raw.cost_usd,
+      durationMs: raw.duration_ms,
+      startTime: raw.start_time,
+      endTime: raw.end_time,
     };
 
     const score = await evaluator.evaluate(evalContext);
@@ -350,7 +357,13 @@ export const traceScoreCommand = command({
       evaluatorConfig.type,
     );
     if (traceRequired) {
-      const hasTrace = results.some((r) => r.trace);
+      const hasTrace = results.some(
+        (r) =>
+          r.trace ||
+          r.cost_usd !== undefined ||
+          r.duration_ms !== undefined ||
+          r.token_usage !== undefined,
+      );
       if (!hasTrace) {
         console.error(
           `${c.red}Error:${c.reset} Result file lacks trace data. Re-run eval with ${c.bold}--trace${c.reset} to capture trace summaries.`,

diff --git a/apps/cli/src/commands/trace/show.ts b/apps/cli/src/commands/trace/show.ts
@@ -1,7 +1,6 @@
 import { command, flag, oneOf, option, optional, positional, string } from 'cmd-ts';
 import {
   type RawResult,
-  type RawTraceSummary,
   c,
   formatCost,
   formatDuration,
@@ -13,31 +12,32 @@ import {
 /**
  * Render flat trace summary line (fallback when full output messages not available).
  */
-function renderFlatTrace(trace: RawTraceSummary): string {
+function renderFlatTrace(result: RawResult): string {
+  const trace = result.trace;
   const parts: string[] = [];
 
-  if (trace.tool_names && trace.tool_names.length > 0) {
+  if (trace?.tool_names && trace.tool_names.length > 0) {
     const toolParts = trace.tool_names.map((name) => {
       const count = trace.tool_calls_by_name?.[name] ?? 0;
       return count > 1 ? `${name} ×${count}` : name;
     });
     parts.push(`Tools: ${toolParts.join(', ')}`);
   }
 
-  if (trace.duration_ms !== undefined) {
-    parts.push(`Duration: ${formatDuration(trace.duration_ms)}`);
+  if (result.duration_ms !== undefined) {
+    parts.push(`Duration: ${formatDuration(result.duration_ms)}`);
   }
 
-  if (trace.token_usage) {
-    const total = trace.token_usage.input + trace.token_usage.output;
+  if (result.token_usage) {
+    const total = result.token_usage.input + result.token_usage.output;
     parts.push(`Tokens: ${formatNumber(total)}`);
   }
 
-  if (trace.cost_usd !== undefined) {
-    parts.push(`Cost: ${formatCost(trace.cost_usd)}`);
+  if (result.cost_usd !== undefined) {
+    parts.push(`Cost: ${formatCost(result.cost_usd)}`);
   }
 
-  if (trace.llm_call_count !== undefined) {
+  if (trace?.llm_call_count !== undefined) {
     parts.push(`LLM calls: ${trace.llm_call_count}`);
   }
 
@@ -85,8 +85,8 @@ function renderTree(result: RawResult): string {
 
   if (!messages || messages.length === 0) {
     // Fallback to flat summary
-    if (result.trace) {
-      return renderFlatTrace(result.trace);
+    if (result.trace || result.duration_ms !== undefined || result.cost_usd !== undefined) {
+      return renderFlatTrace(result);
     }
     return `${c.dim}No trace data available${c.reset}`;
   }
@@ -95,14 +95,14 @@ function renderTree(result: RawResult): string {
   const testId = result.test_id ?? result.eval_id ?? 'unknown';
 
   // Root node: test execution
-  const totalDuration = result.trace?.duration_ms;
-  const totalTokens = result.trace?.token_usage
-    ? result.trace.token_usage.input + result.trace.token_usage.output
+  const totalDuration = result.duration_ms;
+  const totalTokens = result.token_usage
+    ? result.token_usage.input + result.token_usage.output
     : undefined;
   const rootParts: string[] = [testId];
   if (totalDuration !== undefined) rootParts.push(formatDuration(totalDuration));
   if (totalTokens !== undefined) rootParts.push(`${formatNumber(totalTokens)} tok`);
-  if (result.trace?.cost_usd !== undefined) rootParts.push(formatCost(result.trace.cost_usd));
+  if (result.cost_usd !== undefined) rootParts.push(formatCost(result.cost_usd));
   lines.push(`${c.bold}${rootParts.join(', ')}${c.reset}`);
 
   // Filter to meaningful messages (assistant with tool calls, or assistant responses)
@@ -204,8 +204,8 @@ function formatResultDetail(result: RawResult, index: number, tree: boolean): st
     lines.push(`  ${c.dim}Scores:${c.reset} ${renderScores(result.scores)}`);
   }
 
-  if (result.trace) {
-    lines.push(`  ${c.dim}Trace:${c.reset} ${renderFlatTrace(result.trace)}`);
+  if (result.trace || result.duration_ms !== undefined || result.cost_usd !== undefined) {
+    lines.push(`  ${c.dim}Trace:${c.reset} ${renderFlatTrace(result)}`);
   }
 
   if (result.reasoning) {

diff --git a/apps/cli/src/commands/trace/stats.ts b/apps/cli/src/commands/trace/stats.ts
@@ -43,9 +43,7 @@ function collectMetrics(results: RawResult[]): MetricRow[] {
   }
 
   // Latency
-  const latencies = results
-    .map((r) => r.trace?.duration_ms)
-    .filter((v): v is number => v !== undefined);
+  const latencies = results.map((r) => r.duration_ms).filter((v): v is number => v !== undefined);
   if (latencies.length > 0) {
     rows.push({
       name: 'latency_s',
@@ -55,16 +53,16 @@ function collectMetrics(results: RawResult[]): MetricRow[] {
   }
 
   // Cost
-  const costs = results.map((r) => r.trace?.cost_usd).filter((v): v is number => v !== undefined);
+  const costs = results.map((r) => r.cost_usd).filter((v): v is number => v !== undefined);
   if (costs.length > 0) {
     rows.push({ name: 'cost_usd', values: costs, formatter: (n) => formatCost(n) });
   }
 
   // Total tokens
   const tokens = results
     .map((r) => {
-      if (!r.trace?.token_usage) return undefined;
-      return r.trace.token_usage.input + r.trace.token_usage.output;
+      if (!r.token_usage) return undefined;
+      return r.token_usage.input + r.token_usage.output;
     })
     .filter((v): v is number => v !== undefined);
   if (tokens.length > 0) {

diff --git a/apps/cli/src/commands/trace/utils.ts b/apps/cli/src/commands/trace/utils.ts
@@ -53,6 +53,12 @@ export interface RawResult {
   error?: string;
   scores?: RawEvaluatorScore[];
   trace?: RawTraceSummary;
+  // Execution metrics promoted from `trace` to the top level of each result (snake_case, as read from JSONL)
+  token_usage?: { input: number; output: number; cached?: number };
+  cost_usd?: number;
+  duration_ms?: number;
+  start_time?: string;
+  end_time?: string;
   input?: unknown;
   output?: unknown;
   trials?: unknown[];
@@ -75,12 +81,7 @@ export interface RawTraceSummary {
   tool_names?: string[];
   tool_calls_by_name?: Record<string, number>;
   error_count?: number;
-  token_usage?: { input: number; output: number; cached?: number };
-  cost_usd?: number;
-  duration_ms?: number;
   tool_durations?: Record<string, number[]>;
-  start_time?: string;
-  end_time?: string;
   llm_call_count?: number;
 }
 

diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts
@@ -27,11 +27,11 @@ const RESULT_WITH_TRACE = JSON.stringify({
     tool_names: ['read', 'write'],
     tool_calls_by_name: { read: 3, write: 2 },
     error_count: 0,
-    token_usage: { input: 1000, output: 500 },
-    cost_usd: 0.05,
-    duration_ms: 3200,
     llm_call_count: 2,
   },
+  token_usage: { input: 1000, output: 500 },
+  cost_usd: 0.05,
+  duration_ms: 3200,
 });
 
 const RESULT_WITHOUT_TRACE = JSON.stringify({
@@ -79,7 +79,7 @@ describe('trace utils', () => {
       expect(results[0].score).toBe(1);
       expect(results[0].trace).toBeDefined();
       expect(results[0].trace?.event_count).toBe(5);
-      expect(results[0].trace?.cost_usd).toBe(0.05);
+      expect(results[0].cost_usd).toBe(0.05);
 
       expect(results[1].test_id).toBe('test-2');
       expect(results[1].score).toBe(0.75);

diff --git a/apps/web/src/content/docs/evaluators/code-judges.mdx b/apps/web/src/content/docs/evaluators/code-judges.mdx
@@ -189,7 +189,12 @@ Beyond the basic `question`, `criteria`, `answer`, and `reference_answer` fields
 | `input` | `Message[]` | Full resolved input message array |
 | `expected_output` | `Message[]` | Expected agent behavior including tool calls |
 | `output` | `Message[]` | Actual agent execution trace with tool calls |
-| `trace` | `TraceSummary` | Lightweight execution metrics |
+| `trace` | `TraceSummary` | Lightweight execution metrics (tool calls, errors) |
+| `token_usage` | `{input, output}` | Token consumption |
+| `cost_usd` | `number` | Estimated cost in USD |
+| `duration_ms` | `number` | Total execution duration in milliseconds |
+| `start_time` | `string` | ISO timestamp of first event |
+| `end_time` | `string` | ISO timestamp of last event |
 | `file_changes` | `string \| null` | Unified diff of workspace file changes (when `workspace_template` is configured) |
 | `workspace_path` | `string \| null` | Absolute path to the workspace directory (when `workspace_template` is configured) |
 
@@ -201,12 +206,7 @@ Beyond the basic `question`, `criteria`, `answer`, and `reference_answer` fields
   "tool_names": ["fetch", "search"],
   "tool_calls_by_name": { "search": 2, "fetch": 1 },
   "error_count": 0,
-  "llm_call_count": 2,
-  "token_usage": { "input": 1000, "output": 500 },
-  "cost_usd": 0.0015,
-  "duration_ms": 3500,
-  "start_time": "2026-02-13T10:00:00.000Z",
-  "end_time": "2026-02-13T10:00:03.500Z"
+  "llm_call_count": 2
 }
 ```
 
@@ -217,11 +217,6 @@ Beyond the basic `question`, `criteria`, `answer`, and `reference_answer` fields
 | `tool_calls_by_name` | `Record<string, number>` | Count per tool |
 | `error_count` | `number` | Failed tool calls |
 | `llm_call_count` | `number` | Number of LLM calls (assistant messages) |
-| `token_usage` | `{input, output}` | Token consumption |
-| `cost_usd` | `number` | Estimated cost |
-| `duration_ms` | `number` | Total execution duration |
-| `start_time` | `string` | ISO timestamp of first event |
-| `end_time` | `string` | ISO timestamp of last event |
 
 Use `expected_output` for retrieval context in RAG evals (tool calls with outputs) and `output` for the actual agent execution trace from live runs.
 

diff --git a/apps/web/src/content/docs/tools/trace.mdx b/apps/web/src/content/docs/tools/trace.mdx
@@ -83,7 +83,7 @@ All commands support `--format json` for piping to `jq`:
 ```bash
 # Find tests costing more than $0.10
 agentv trace show results.jsonl --format json \
-  | jq '[.[] | select(.trace.cost_usd > 0.10) | {test_id, score, cost: .trace.cost_usd}]'
+  | jq '[.[] | select(.cost_usd > 0.10) | {test_id, score, cost: .cost_usd}]'
 
 # Compare providers
 agentv trace stats results.jsonl --group-by target --format json \

diff --git a/examples/features/execution-metrics/scripts/check-efficiency.ts b/examples/features/execution-metrics/scripts/check-efficiency.ts
@@ -15,7 +15,7 @@ const THRESHOLDS = {
   maxDurationMs: 10000,
 };
 
-export default defineCodeJudge(({ trace }) => {
+export default defineCodeJudge(({ trace, tokenUsage, costUsd, durationMs }) => {
   const hits: string[] = [];
   const misses: string[] = [];
   const checks: boolean[] = [];
@@ -39,8 +39,8 @@ export default defineCodeJudge(({ trace }) => {
   }
 
   // Check token usage if available
-  if (trace.tokenUsage) {
-    const totalTokens = trace.tokenUsage.input + trace.tokenUsage.output;
+  if (tokenUsage) {
+    const totalTokens = tokenUsage.input + tokenUsage.output;
     if (totalTokens <= THRESHOLDS.maxTokens) {
       hits.push(`Token usage (${totalTokens}) within limit`);
       checks.push(true);
@@ -51,23 +51,23 @@ export default defineCodeJudge(({ trace }) => {
   }
 
   // Check cost if available
-  if (trace.costUsd !== undefined) {
-    if (trace.costUsd <= THRESHOLDS.maxCostUsd) {
-      hits.push(`Cost ($${trace.costUsd.toFixed(4)}) within budget`);
+  if (costUsd !== undefined) {
+    if (costUsd <= THRESHOLDS.maxCostUsd) {
+      hits.push(`Cost ($${costUsd.toFixed(4)}) within budget`);
       checks.push(true);
     } else {
-      misses.push(`High cost: $${trace.costUsd.toFixed(4)} (max: $${THRESHOLDS.maxCostUsd})`);
+      misses.push(`High cost: $${costUsd.toFixed(4)} (max: $${THRESHOLDS.maxCostUsd})`);
       checks.push(false);
     }
   }
 
   // Check duration if available
-  if (trace.durationMs !== undefined) {
-    if (trace.durationMs <= THRESHOLDS.maxDurationMs) {
-      hits.push(`Duration (${trace.durationMs}ms) within limit`);
+  if (durationMs !== undefined) {
+    if (durationMs <= THRESHOLDS.maxDurationMs) {
+      hits.push(`Duration (${durationMs}ms) within limit`);
       checks.push(true);
     } else {
-      misses.push(`Slow execution: ${trace.durationMs}ms (max: ${THRESHOLDS.maxDurationMs}ms)`);
+      misses.push(`Slow execution: ${durationMs}ms (max: ${THRESHOLDS.maxDurationMs}ms)`);
       checks.push(false);
     }
   }

diff --git a/examples/features/execution-metrics/scripts/check-metrics-present.ts b/examples/features/execution-metrics/scripts/check-metrics-present.ts
@@ -13,7 +13,7 @@
  */
 import { defineCodeJudge } from '@agentv/eval';
 
-export default defineCodeJudge(({ trace }) => {
+export default defineCodeJudge(({ trace, tokenUsage, costUsd, durationMs }) => {
   const hits: string[] = [];
   const misses: string[] = [];
 
@@ -27,22 +27,22 @@ export default defineCodeJudge(({ trace }) => {
   }
 
   // Check for tokenUsage
-  if (trace.tokenUsage) {
-    hits.push(`tokenUsage present: ${trace.tokenUsage.input}/${trace.tokenUsage.output}`);
+  if (tokenUsage) {
+    hits.push(`tokenUsage present: ${tokenUsage.input}/${tokenUsage.output}`);
   } else {
     misses.push('tokenUsage not present');
   }
 
   // Check for costUsd
-  if (trace.costUsd !== undefined) {
-    hits.push(`costUsd present: $${trace.costUsd.toFixed(4)}`);
+  if (costUsd !== undefined) {
+    hits.push(`costUsd present: $${costUsd.toFixed(4)}`);
   } else {
     misses.push('costUsd not present');
   }
 
   // Check for durationMs
-  if (trace.durationMs !== undefined) {
-    hits.push(`durationMs present: ${trace.durationMs}ms`);
+  if (durationMs !== undefined) {
+    hits.push(`durationMs present: ${durationMs}ms`);
   } else {
     misses.push('durationMs not present');
   }

diff --git a/examples/features/trace-analysis/README.md b/examples/features/trace-analysis/README.md
@@ -48,7 +48,7 @@ Pipe JSON output to `jq` for complex queries:
 ```bash
 # Find tests that cost more than $0.10
 bun agentv trace show evals/multi-agent.eval.results.jsonl --format json \
-  | jq '[.[] | select(.trace.cost_usd > 0.10) | {test_id, score, cost: .trace.cost_usd}]'
+  | jq '[.[] | select(.cost_usd > 0.10) | {test_id, score, cost: .cost_usd}]'
 
 # Compare scores by target provider
 bun agentv trace stats evals/multi-agent.eval.results.jsonl --group-by target --format json \

diff --git a/examples/features/trace-evaluation/judges/span-duration.ts b/examples/features/trace-evaluation/judges/span-duration.ts
@@ -9,7 +9,7 @@ import { defineCodeJudge } from '@agentv/eval';
 
 const DEFAULT_MAX_SPAN_MS = 5000;
 
-export default defineCodeJudge(({ trace, config }) => {
+export default defineCodeJudge(({ trace, config, durationMs }) => {
   if (!trace) {
     return {
       score: 0,
@@ -23,12 +23,12 @@ export default defineCodeJudge(({ trace, config }) => {
   const misses: string[] = [];
 
   // Check overall duration
-  if (trace.durationMs !== undefined) {
+  if (durationMs !== undefined) {
     const maxTotalMs = (config?.maxTotalMs as number) ?? maxSpanMs * 5;
-    if (trace.durationMs <= maxTotalMs) {
-      hits.push(`Total duration (${trace.durationMs}ms) within limit (${maxTotalMs}ms)`);
+    if (durationMs <= maxTotalMs) {
+      hits.push(`Total duration (${durationMs}ms) within limit (${maxTotalMs}ms)`);
     } else {
-      misses.push(`Total duration too long: ${trace.durationMs}ms (max: ${maxTotalMs}ms)`);
+      misses.push(`Total duration too long: ${durationMs}ms (max: ${maxTotalMs}ms)`);
     }
   }