From 5d52940194c1fcb82b09e057f248d665d959db31 Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Wed, 11 Feb 2026 17:04:07 -0800 Subject: [PATCH 01/21] save initial draft --- agents/src/ipc/job_proc_lazy_main.ts | 4 +- agents/src/llm/chat_context.ts | 35 + agents/src/llm/realtime.ts | 1 + agents/src/utils.test.ts | 87 +++ agents/src/utils.ts | 38 +- agents/src/voice/README.md | 224 ++++++ agents/src/voice/agent.test.ts | 142 +++- agents/src/voice/agent.ts | 196 +++++- agents/src/voice/agent_activity.test.ts | 144 ++++ agents/src/voice/agent_activity.ts | 666 +++++++++++------- agents/src/voice/agent_session.test.ts | 171 +++++ agents/src/voice/agent_session.ts | 206 ++++-- agents/src/voice/generation.ts | 51 +- agents/src/voice/index.ts | 2 +- agents/src/voice/speech_handle.ts | 7 +- agents/src/voice/testing/run_result.test.ts | 76 ++ agents/src/voice/testing/run_result.ts | 91 ++- examples/src/agent_task_survey.ts | 92 +++ .../google/src/beta/realtime/realtime_api.ts | 1 + .../livekit/src/turn_detector/multilingual.ts | 11 +- plugins/openai/src/realtime/realtime_model.ts | 1 + .../src/realtime/realtime_model_beta.ts | 1 + 22 files changed, 1880 insertions(+), 367 deletions(-) create mode 100644 agents/src/voice/README.md create mode 100644 agents/src/voice/agent_activity.test.ts create mode 100644 agents/src/voice/agent_session.test.ts create mode 100644 agents/src/voice/testing/run_result.test.ts create mode 100644 examples/src/agent_task_survey.ts diff --git a/agents/src/ipc/job_proc_lazy_main.ts b/agents/src/ipc/job_proc_lazy_main.ts index 8dd5cf6f8..f81eedc29 100644 --- a/agents/src/ipc/job_proc_lazy_main.ts +++ b/agents/src/ipc/job_proc_lazy_main.ts @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: 2024 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 -import { Room, RoomEvent } from '@livekit/rtc-node'; +import { Room, RoomEvent, dispose } from '@livekit/rtc-node'; import { EventEmitter, once } from 'node:events'; import { pathToFileURL } from 'node:url'; import type { Logger } from 'pino'; @@ -245,6 +245,8 @@ const startJob = ( await join.await; + await dispose(); + logger.debug('Job process shutdown'); process.exit(0); } diff --git a/agents/src/llm/chat_context.ts b/agents/src/llm/chat_context.ts index 0f4a644ef..ce36d92c6 100644 --- a/agents/src/llm/chat_context.ts +++ b/agents/src/llm/chat_context.ts @@ -510,6 +510,41 @@ export class ChatContext { return new ChatContext(items); } + merge( + other: ChatContext, + options: { + excludeFunctionCall?: boolean; + excludeInstructions?: boolean; + } = {}, + ): ChatContext { + const { excludeFunctionCall = false, excludeInstructions = false } = options; + const existingIds = new Set(this._items.map((item) => item.id)); + + for (const item of other.items) { + if (excludeFunctionCall && ['function_call', 'function_call_output'].includes(item.type)) { + continue; + } + + if ( + excludeInstructions && + item.type === 'message' && + (item.role === 'system' || item.role === 'developer') + ) { + continue; + } + + if (existingIds.has(item.id)) { + continue; + } + + const idx = this.findInsertionIndex(item.createdAt); + this._items.splice(idx, 0, item); + existingIds.add(item.id); + } + + return this; + } + truncate(maxItems: number): ChatContext { if (maxItems <= 0) return this; diff --git a/agents/src/llm/realtime.ts b/agents/src/llm/realtime.ts index bebeffcf4..5c132afd0 100644 --- a/agents/src/llm/realtime.ts +++ b/agents/src/llm/realtime.ts @@ -48,6 +48,7 @@ export interface RealtimeCapabilities { userTranscription: boolean; autoToolReplyGeneration: boolean; audioOutput: boolean; + manualFunctionCalls: boolean; } export interface InputTranscriptionCompleted { diff --git a/agents/src/utils.test.ts b/agents/src/utils.test.ts index 6bab4d642..a44678d08 100644 --- a/agents/src/utils.test.ts +++ b/agents/src/utils.test.ts @@ -469,6 +469,93 @@ describe('utils', () => { expect((error as Error).name).toBe('TypeError'); } }); + + it('should return undefined for Task.current outside task context', () => { + expect(Task.current()).toBeUndefined(); + }); + + it('should preserve Task.current inside a task across awaits', async () => { + const task = Task.from( + async () => { + const currentAtStart = Task.current(); + await delay(5); + const currentAfterAwait = Task.current(); + + expect(currentAtStart).toBeDefined(); + expect(currentAfterAwait).toBe(currentAtStart); + + return currentAtStart; + }, + undefined, + 'current-context-test', + ); + + const currentFromResult = await task.result; + expect(currentFromResult).toBe(task); + }); + + it('should isolate nested Task.current context and restore parent context', async () => { + const parentTask = Task.from( + async (controller) => { + const parentCurrent = Task.current(); + expect(parentCurrent).toBeDefined(); + + const childTask = Task.from( + async () => { + const childCurrentStart = Task.current(); + await delay(5); + const childCurrentAfterAwait = Task.current(); + + expect(childCurrentStart).toBeDefined(); + expect(childCurrentAfterAwait).toBe(childCurrentStart); + expect(childCurrentStart).not.toBe(parentCurrent); + + return childCurrentStart; + }, + controller, + 'child-current-context-test', + ); + + const childCurrent = await childTask.result; + const parentCurrentAfterChild = Task.current(); + + expect(parentCurrentAfterChild).toBe(parentCurrent); + + return { parentCurrent, childCurrent }; + }, + undefined, + 'parent-current-context-test', + ); + + const { parentCurrent, childCurrent } = await parentTask.result; + expect(parentCurrent).toBe(parentTask); + expect(childCurrent).not.toBe(parentCurrent); + expect(Task.current()).toBeUndefined(); + }); + + it('should always expose Task.current for concurrent task callbacks', async () => { + const tasks = Array.from({ length: 25 }, (_, idx) => + Task.from( + async () => { + const currentAtStart = Task.current(); + await delay(1); + const currentAfterAwait = Task.current(); + + expect(currentAtStart).toBeDefined(); + expect(currentAfterAwait).toBe(currentAtStart); + + return currentAtStart; + }, + undefined, + `current-context-stress-${idx}`, + ), + ); + + const currentTasks = await Promise.all(tasks.map((task) => task.result)); + currentTasks.forEach((currentTask, idx) => { + expect(currentTask).toBe(tasks[idx]); + }); + }); }); describe('Event', () => { diff --git a/agents/src/utils.ts b/agents/src/utils.ts index 75033ff9a..142e07790 100644 --- a/agents/src/utils.ts +++ b/agents/src/utils.ts @@ -9,6 +9,7 @@ import type { TrackKind, } from '@livekit/rtc-node'; import { AudioFrame, AudioResampler, RoomEvent } from '@livekit/rtc-node'; +import { AsyncLocalStorage } from 'node:async_hooks'; import { EventEmitter, once } from 'node:events'; import type { ReadableStream } from 'node:stream/web'; import { TransformStream, type TransformStreamDefaultController } from 'node:stream/web'; @@ -418,7 +419,9 @@ export enum TaskResult { * @param T - The type of the task result */ export class Task { + private static readonly currentTaskStorage = new AsyncLocalStorage>(); private resultFuture: Future; + private doneCallbacks: Set<() => void> = new Set(); #logger = log(); @@ -428,6 +431,21 @@ export class Task { readonly name?: string, ) { this.resultFuture = new Future(); + void this.resultFuture.await + .then( + () => undefined, + () => undefined, + ) + .finally(() => { + for (const callback of this.doneCallbacks) { + try { + callback(); + } catch (error) { + this.#logger.error({ error }, 'Task done callback failed'); + } + } + this.doneCallbacks.clear(); + }); this.runTask(); } @@ -447,6 +465,13 @@ export class Task { return new Task(fn, abortController, name); } + /** + * Returns the currently running task in this async context, if available. + */ + static current(): Task | undefined { + return Task.currentTaskStorage.getStore(); + } + private async runTask() { const run = async () => { if (this.name) { @@ -455,7 +480,8 @@ export class Task { return await this.fn(this.controller); }; - return run() + return Task.currentTaskStorage + .run(this as Task, run) .then((value) => { this.resultFuture.resolve(value); return value; @@ -527,7 +553,15 @@ export class Task { } addDoneCallback(callback: () => void) { - this.resultFuture.await.finally(callback); + if (this.done) { + queueMicrotask(callback); + return; + } + this.doneCallbacks.add(callback); + } + + removeDoneCallback(callback: () => void) { + this.doneCallbacks.delete(callback); } } diff --git a/agents/src/voice/README.md b/agents/src/voice/README.md new file mode 100644 index 000000000..15e1b2e6b --- /dev/null +++ b/agents/src/voice/README.md @@ -0,0 +1,224 @@ +# AgentTask Runtime Flow (Python Reference) + +This document explains how Python `AgentTask` works at runtime so you can read related code and trace behavior confidently. + +Primary reference files: + +- `livekit-agents/livekit/agents/voice/agent.py` +- `livekit-agents/livekit/agents/voice/agent_session.py` +- `livekit-agents/livekit/agents/voice/agent_activity.py` +- `livekit-agents/livekit/agents/voice/generation.py` +- `livekit-agents/livekit/agents/llm/chat_context.py` +- `livekit-agents/livekit/agents/beta/workflows/task_group.py` + +--- + +## 1) Mental model + +`AgentTask[T]` is a temporary, awaitable sub-agent that: + +1. pauses the currently active agent activity, +2. runs its own activity (`on_enter`, LLM/tools, speech), +3. resolves a typed result via `complete(...)`, +4. resumes the caller activity, +5. merges useful chat history back to the caller. + +Treat it as an inline conversational coroutine that borrows the session and returns. + +--- + +## 2) Two activity transition modes (must distinguish) + +### Mode A: close/start (normal handoff) + +Used by `session.update_agent(new_agent)` and handoff returns. + +- old activity: `drain()` + `aclose()` +- new activity: `start()` +- no implicit return to the old agent + +### Mode B: pause/resume (inline AgentTask) + +Used by `await some_task`. + +- old activity: `pause()` (kept alive) +- task activity: `start()` +- task finishes (`complete(...)`) +- task activity: `drain()` + `aclose()` +- old activity: `resume()` + +`AgentTask` relies on Mode B. + +--- + +## 3) End-to-end sequence + +```mermaid +sequenceDiagram + participant callerAgent as CallerAgent + participant agentTask as AgentTask + participant session as AgentSession + participant callerActivity as CallerActivity + participant taskActivity as TaskActivity + + callerAgent->>agentTask: await task + agentTask->>session: _update_activity(task, previous_activity="pause") + session->>callerActivity: pause() + session->>taskActivity: start() + taskActivity->>agentTask: on_enter() + Note over taskActivity: LLM/tools/speech execution + agentTask->>agentTask: complete(result_or_exception) + agentTask-->>callerAgent: await returns or raises + agentTask->>session: _update_activity(old_agent, new_activity="resume") + session->>taskActivity: drain()+aclose() + session->>callerActivity: resume() +``` + +--- + +## 4) What `AgentTask` adds over `Agent` + +In `agent.py`, `AgentTask` extends `Agent` and introduces: + +- internal future (`__fut`) to represent task completion, +- non-reentrancy guard (`__started`), +- `done()` state, +- `complete(value_or_exception)`, +- `__await__`/`__await_impl`. + +Without `complete(...)`, `await task` never resolves. + +--- + +## 5) `__await_impl` control flow (core) + +When caller does `result = await task`: + +1. validate usage context and reentrancy, +2. capture old activity/agent, +3. switch to task activity with `previous_activity="pause"`, +4. await task future, +5. in `finally`, if session is still on this task: + - merge task chat context into old agent chat context + - resume old activity with `new_activity="resume"` + +This `finally` resume logic is the stack-like return behavior. + +--- + +## 6) `complete(...)` semantics + +- `complete(value)` -> awaiter receives `value` +- `complete(exception)` -> awaiter raises + +It also updates current speech-handle final-output plumbing when present. + +--- + +## 7) `AgentSession._update_activity` behavior matrix + +Key params: + +- `previous_activity`: `"close"` | `"pause"` +- `new_activity`: `"start"` | `"resume"` + +Meaning: + +- `close + start`: full handoff to new activity +- `pause + start`: enter inline task +- `close + resume`: return to previously paused activity + +For resume path, an existing `agent._activity` is required. + +--- + +## 8) `AgentActivity.pause()` vs `resume()` + +`pause()`: + +- pauses scheduling/draining logic, +- closes runtime session resources/listeners, +- preserves activity object for later resume. + +`resume()`: + +- re-establishes runtime session resources/listeners, +- restarts scheduling, +- does **not** run `on_enter()` again. + +This is why caller state can continue seamlessly. + +--- + +## 9) Hook execution model + +`on_enter` and `on_exit` run as speech tasks in activity runtime. +They are inline-task-compatible, so nested `await AgentTask(...)` is valid. + +--- + +## 10) Tools, instructions, models during task + +While task is active: + +- instructions are applied like any `Agent`, +- tools are resolved from session + task (+ mcp tools), +- model resolution is task-first, session-fallback. + +So tasks can temporarily override LLM/STT/TTS/VAD/tool behavior. + +--- + +## 11) Chat context merge on return + +On task completion, caller chat context merges task context with rules: + +- dedupe by `id`, +- insert by chronological `created_at`, +- exclude function-call internals, +- exclude instructions (`system`/`developer`) for resume path. + +This preserves useful conversation outcomes without tool noise. + +--- + +## 12) Difference from tool handoff returns + +Tool return handoff (`Agent` return) and `await AgentTask` both switch agents, but: + +- handoff return -> close/start semantics (role transfer), +- `await AgentTask` -> pause/resume semantics (inline subroutine + return). + +Do not conflate these paths while debugging. + +--- + +## 13) Canonical Python usage patterns + +- Survey workflow (`examples/survey/survey_agent.py`): staged typed tasks with `TaskGroup`. +- IVR workflow (`examples/bank-ivr/ivr_system_agent.py`): direct inline `await task` in menu loops. +- `TaskGroup` itself (`beta/workflows/task_group.py`) is implemented on top of `AgentTask`. + +--- + +## 14) Common pitfalls + +- calling `complete(...)` twice -> error, +- awaiting same task instance twice -> error, +- missing `complete(...)` path -> hang, +- concurrent external `update_agent(...)` during task may bypass normal resume path. + +--- + +## 15) Practical tracing checklist + +When reading AgentTask code, confirm: + +1. where task is created, +2. where `await task` happens, +3. where `complete(...)` is guaranteed, +4. whether transition mode is pause/resume vs close/start, +5. how merge filters are configured, +6. whether concurrent handoff can race with task return. + +If all six are clear, your runtime mental simulation is correct. diff --git a/agents/src/voice/agent.test.ts b/agents/src/voice/agent.test.ts index cc620e26a..fd5f39183 100644 --- a/agents/src/voice/agent.test.ts +++ b/agents/src/voice/agent.test.ts @@ -1,10 +1,15 @@ // SPDX-FileCopyrightText: 2025 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 -import { describe, expect, it } from 'vitest'; +import { describe, expect, it, vi } from 'vitest'; import { z } from 'zod'; import { tool } from '../llm/index.js'; -import { Agent } from './agent.js'; +import { initializeLogger } from '../log.js'; +import { Task } from '../utils.js'; +import { Agent, AgentTask, _setActivityTaskInfo } from './agent.js'; +import { agentActivityStorage } from './agent_activity.js'; + +initializeLogger({ pretty: false, level: 'error' }); describe('Agent', () => { it('should create agent with basic instructions', () => { @@ -77,4 +82,137 @@ describe('Agent', () => { expect(tools1).toEqual(tools2); expect(tools1).toEqual(tools); }); + + it('should require AgentTask to run inside task context', async () => { + class TestTask extends AgentTask { + constructor() { + super({ instructions: 'test task' }); + } + } + + const task = new TestTask(); + await expect(task.run()).rejects.toThrow('must be executed inside a Task context'); + }); + + it('should require AgentTask to run inside inline task context', async () => { + class TestTask extends AgentTask { + constructor() { + super({ instructions: 'test task' }); + } + } + + const task = new TestTask(); + const wrapper = Task.from(async () => { + return await task.run(); + }); + + await expect(wrapper.result).rejects.toThrow( + 'should only be awaited inside function tools or the onEnter/onExit methods of an Agent', + ); + }); + + it('should allow AgentTask run from inline task context', async () => { + class TestTask extends AgentTask { + constructor() { + super({ instructions: 'test task' }); + } + } + + const task = new TestTask(); + const oldAgent = new Agent({ instructions: 'old agent' }); + const mockSession = { + currentAgent: oldAgent, + _globalRunState: undefined, + _updateActivity: async (agent: Agent) => { + if (agent === task) { + task.complete('ok'); + } + }, + }; + + const mockActivity = { + agent: oldAgent, + agentSession: mockSession, + _onEnterTask: undefined, + llm: undefined, + close: async () => {}, + }; + + const wrapper = Task.from(async () => { + const currentTask = Task.current(); + if (!currentTask) { + throw new Error('expected task context'); + } + _setActivityTaskInfo(currentTask, { inlineTask: true }); + return await agentActivityStorage.run(mockActivity as any, () => task.run()); + }); + + await expect(wrapper.result).resolves.toBe('ok'); + }); + + it('should require AgentTask to run inside AgentActivity context', async () => { + class TestTask extends AgentTask { + constructor() { + super({ instructions: 'test task' }); + } + } + + const task = new TestTask(); + const wrapper = Task.from(async () => { + const currentTask = Task.current(); + if (!currentTask) { + throw new Error('expected task context'); + } + _setActivityTaskInfo(currentTask, { inlineTask: true }); + return await task.run(); + }); + + await expect(wrapper.result).rejects.toThrow( + 'must be executed inside an AgentActivity context', + ); + }); + + it('should close old activity when current agent changes while AgentTask is pending', async () => { + class TestTask extends AgentTask { + constructor() { + super({ instructions: 'test task' }); + } + } + + const task = new TestTask(); + const oldAgent = new Agent({ instructions: 'old agent' }); + const switchedAgent = new Agent({ instructions: 'switched agent' }); + const closeOldActivity = vi.fn(async () => {}); + + const mockSession = { + currentAgent: oldAgent as Agent, + _globalRunState: undefined, + _updateActivity: async (agent: Agent) => { + if (agent === task) { + mockSession.currentAgent = switchedAgent; + task.complete('ok'); + } + }, + }; + + const mockActivity = { + agent: oldAgent, + agentSession: mockSession, + _onEnterTask: undefined, + llm: undefined, + close: closeOldActivity, + }; + + const wrapper = Task.from(async () => { + const currentTask = Task.current(); + if (!currentTask) { + throw new Error('expected task context'); + } + _setActivityTaskInfo(currentTask, { inlineTask: true }); + return await agentActivityStorage.run(mockActivity as any, () => task.run()); + }); + + await expect(wrapper.result).resolves.toBe('ok'); + expect(closeOldActivity).toHaveBeenCalledTimes(1); + }); }); diff --git a/agents/src/voice/agent.ts b/agents/src/voice/agent.ts index 1fb6664c2..630c4e440 100644 --- a/agents/src/voice/agent.ts +++ b/agents/src/voice/agent.ts @@ -13,26 +13,71 @@ import { type TTSModelString, } from '../inference/index.js'; import { ReadonlyChatContext } from '../llm/chat_context.js'; -import type { ChatMessage, FunctionCall, RealtimeModel } from '../llm/index.js'; +import type { ChatMessage, FunctionCall } from '../llm/index.js'; import { type ChatChunk, ChatContext, LLM, + RealtimeModel, type ToolChoice, type ToolContext, } from '../llm/index.js'; +import { log } from '../log.js'; import type { STT, SpeechEvent } from '../stt/index.js'; import { StreamAdapter as STTStreamAdapter } from '../stt/index.js'; import { SentenceTokenizer as BasicSentenceTokenizer } from '../tokenize/basic/index.js'; import type { TTS } from '../tts/index.js'; import { SynthesizeStream, StreamAdapter as TTSStreamAdapter } from '../tts/index.js'; import { USERDATA_TIMED_TRANSCRIPT } from '../types.js'; +import { Future, Task } from '../utils.js'; import type { VAD } from '../vad.js'; -import type { AgentActivity } from './agent_activity.js'; +import { type AgentActivity, agentActivityStorage } from './agent_activity.js'; import type { AgentSession, TurnDetectionMode } from './agent_session.js'; import type { TimedString } from './io.js'; +import type { SpeechHandle } from './speech_handle.js'; + +export const functionCallStorage = new AsyncLocalStorage<{ functionCall?: FunctionCall }>(); +export const speechHandleStorage = new AsyncLocalStorage(); +const activityTaskInfoStorage = new WeakMap, _ActivityTaskInfo>(); + +type _ActivityTaskInfo = { + functionCall: FunctionCall | null; + speechHandle: SpeechHandle | null; + inlineTask: boolean; +}; + +/** @internal */ +export function _setActivityTaskInfo( + task: Task, + options: { + functionCall?: FunctionCall | null; + speechHandle?: SpeechHandle | null; + inlineTask?: boolean; + }, +): void { + const info = activityTaskInfoStorage.get(task) ?? { + functionCall: null, + speechHandle: null, + inlineTask: false, + }; + + if (Object.hasOwn(options, 'functionCall')) { + info.functionCall = options.functionCall ?? null; + } + if (Object.hasOwn(options, 'speechHandle')) { + info.speechHandle = options.speechHandle ?? null; + } + if (Object.hasOwn(options, 'inlineTask')) { + info.inlineTask = options.inlineTask ?? false; + } + + activityTaskInfoStorage.set(task, info); +} -export const asyncLocalStorage = new AsyncLocalStorage<{ functionCall?: FunctionCall }>(); +/** @internal */ +export function _getActivityTaskInfo(task: Task): _ActivityTaskInfo | undefined { + return activityTaskInfoStorage.get(task); +} export const STOP_RESPONSE_SYMBOL = Symbol('StopResponse'); export class StopResponse extends Error { @@ -268,20 +313,20 @@ export class Agent { throw new Error('sttNode called but no STT node is available'); } - let wrapped_stt = activity.stt; + let wrappedStt = activity.stt; - if (!wrapped_stt.capabilities.streaming) { + if (!wrappedStt.capabilities.streaming) { const vad = agent.vad || activity.vad; if (!vad) { throw new Error( 'STT does not support streaming, add a VAD to the AgentTask/VoiceAgent to enable streaming', ); } - wrapped_stt = new STTStreamAdapter(wrapped_stt, vad); + wrappedStt = new STTStreamAdapter(wrappedStt, vad); } const connOptions = activity.agentSession.connOptions.sttConnOptions; - const stream = wrapped_stt.stream({ connOptions }); + const stream = wrappedStt.stream({ connOptions }); // Set startTimeOffset to provide linear timestamps across reconnections const audioInputStartedAt = @@ -382,14 +427,14 @@ export class Agent { throw new Error('ttsNode called but no TTS node is available'); } - let wrapped_tts = activity.tts; + let wrappedTts = activity.tts; if (!activity.tts.capabilities.streaming) { - wrapped_tts = new TTSStreamAdapter(wrapped_tts, new BasicSentenceTokenizer()); + wrappedTts = new TTSStreamAdapter(wrappedTts, new BasicSentenceTokenizer()); } const connOptions = activity.agentSession.connOptions.ttsConnOptions; - const stream = wrapped_tts.stream({ connOptions }); + const stream = wrappedTts.stream({ connOptions }); stream.updateInputStream(text); let cleaned = false; @@ -440,3 +485,134 @@ export class Agent { }, }; } + +export class AgentTask extends Agent { + private started = false; + private future = new Future(); + + #logger = log(); + + get done(): boolean { + return this.future.done; + } + + complete(result: ResultT | Error): void { + if (this.future.done) { + throw new Error(`${this.constructor.name} is already done`); + } + + if (result instanceof Error) { + this.future.reject(result); + } else { + this.future.resolve(result); + } + + const speechHandle = speechHandleStorage.getStore(); + if (speechHandle) { + speechHandle._maybeRunFinalOutput = result; + } + } + + async run(_session?: AgentSession): Promise { + if (this.started) { + throw new Error( + `Task ${this.constructor.name} has already started and cannot be awaited multiple times`, + ); + } + this.started = true; + + const currentTask = Task.current(); + if (!currentTask) { + throw new Error(`${this.constructor.name} must be executed inside a Task context`); + } + + const taskInfo = _getActivityTaskInfo(currentTask); + if (!taskInfo || !taskInfo.inlineTask) { + throw new Error( + `${this.constructor.name} should only be awaited inside function tools or the onEnter/onExit methods of an Agent`, + ); + } + + const speechHandle = speechHandleStorage.getStore(); + const oldActivity = agentActivityStorage.getStore(); + if (!oldActivity) { + throw new Error(`${this.constructor.name} must be executed inside an AgentActivity context`); + } + + currentTask.addDoneCallback(() => { + if (this.future.done) return; + + // If the Task finished before the AgentTask was completed, complete the AgentTask with an error. + this.#logger.error(`The Task finished before ${this.constructor.name} was completed.`); + this.complete(new Error(`The Task finished before ${this.constructor.name} was completed.`)); + }); + + const oldAgent = oldActivity.agent; + const session = oldActivity.agentSession; + + const blockedTasks: Task[] = [currentTask]; + const onEnterTask = oldActivity._onEnterTask; + + if (onEnterTask && !onEnterTask.done && onEnterTask !== currentTask) { + blockedTasks.push(onEnterTask); + } + + if ( + taskInfo.functionCall && + oldActivity.llm instanceof RealtimeModel && + !oldActivity.llm.capabilities.manualFunctionCalls + ) { + this.#logger.error( + `Realtime model does not support resuming function calls from chat context, ` + + `using AgentTask inside a function tool may have unexpected behavior.`, + ); + } + + await session._updateActivity(this, { + previousActivity: 'pause', + newActivity: 'start', + blockedTasks, + }); + + // NOTE: _updateActivity is calling the onEnter method, so the RunResult can capture all speeches + let runState = session._globalRunState; + if (speechHandle && runState && !runState.done()) { + // make sure to not deadlock on the current speech handle + runState._unwatchHandle(speechHandle); + // it is OK to call _markDoneIfNeeded here, the above _updateActivity will call onEnter + // so handles added inside the onEnter will make sure we're not completing the runState too early. + runState._markDoneIfNeeded(); + } + + try { + return await this.future.await; + } finally { + // runState could have changed after future resolved + runState = session._globalRunState; + + if (session.currentAgent !== this) { + this.#logger.warn( + `${this.constructor.name} completed, but the agent has changed in the meantime. ` + + `Ignoring handoff to the previous agent, likely due to AgentSession.updateAgent being invoked.`, + ); + await oldActivity.close(); + } else { + if (speechHandle && runState && !runState.done()) { + runState._watchHandle(speechHandle); + } + + const mergedChatCtx = oldAgent._chatCtx.merge(this._chatCtx, { + excludeFunctionCall: true, + excludeInstructions: true, + }); + oldAgent._chatCtx.items = mergedChatCtx.items; + + await session._updateActivity(oldAgent, { + previousActivity: 'close', + newActivity: 'resume', + waitOnEnter: false, + }); + } + } + } +} diff --git a/agents/src/voice/agent_activity.test.ts b/agents/src/voice/agent_activity.test.ts new file mode 100644 index 000000000..26357eea1 --- /dev/null +++ b/agents/src/voice/agent_activity.test.ts @@ -0,0 +1,144 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it, vi } from 'vitest'; +import { FunctionCall, FunctionCallOutput } from '../llm/chat_context.js'; +import { initializeLogger } from '../log.js'; +import { Future, Task } from '../utils.js'; +import { Agent, _setActivityTaskInfo } from './agent.js'; +import { AgentActivity } from './agent_activity.js'; +import { ToolExecutionOutput } from './generation.js'; +import { SpeechHandle } from './speech_handle.js'; + +initializeLogger({ pretty: false, level: 'error' }); + +function createActivityForTests(): AgentActivity { + const agent = new Agent({ instructions: 'test agent' }); + const sessionMock = { + options: { + allowInterruptions: true, + discardAudioIfUninterruptible: true, + minInterruptionDuration: 500, + minInterruptionWords: 0, + minEndpointingDelay: 500, + maxEndpointingDelay: 6000, + maxToolSteps: 3, + preemptiveGeneration: false, + userAwayTimeout: 15, + useTtsAlignedTranscript: true, + }, + turnDetection: undefined, + vad: undefined, + stt: undefined, + llm: undefined, + tts: undefined, + output: { + audio: null, + audioEnabled: false, + }, + rootSpanContext: undefined, + useTtsAlignedTranscript: true, + agentState: 'listening', + emit: vi.fn(), + _updateAgentState: vi.fn(), + _conversationItemAdded: vi.fn(), + _toolItemsAdded: vi.fn(), + updateAgent: vi.fn(), + }; + + return new AgentActivity(agent, sessionMock as any); +} + +describe('AgentActivity parity behaviors', () => { + it('summarizes tool outputs with symmetric function call metadata', () => { + const activity = createActivityForTests(); + const speechHandle = SpeechHandle.create(); + + const toolCall = FunctionCall.create({ + callId: 'call_1', + name: 'lookup', + args: JSON.stringify({ city: 'SF' }), + }); + const toolCallOutput = FunctionCallOutput.create({ + callId: 'call_1', + name: 'lookup', + output: 'sunny', + isError: false, + }); + + const toolOutput = { + output: [ + ToolExecutionOutput.create({ + toolCall, + toolCallOutput, + rawOutput: 'sunny', + replyRequired: true, + }), + ], + firstToolStartedFuture: new Future(), + }; + + const summary = (activity as any).summarizeToolExecutionOutput(toolOutput, speechHandle); + expect(summary.functionToolsExecutedEvent.functionCalls).toHaveLength(1); + expect(summary.functionToolsExecutedEvent.functionCallOutputs).toHaveLength(1); + expect(summary.shouldGenerateToolReply).toBe(true); + expect(summary.newAgentTask).toBeNull(); + expect(summary.ignoreTaskSwitch).toBe(false); + }); + + it('blocks scheduleSpeech while scheduling is paused unless force=true', () => { + const activity = createActivityForTests(); + const handle = SpeechHandle.create(); + + (activity as any)._schedulingPaused = true; + + expect(() => + (activity as any).scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL), + ).toThrow('cannot schedule new speech, the speech scheduling is draining/pausing'); + + expect(() => + (activity as any).scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true), + ).not.toThrow(); + }); + + it('filters drain pending tasks by blocked speech handles', async () => { + const activity = createActivityForTests(); + const gate = new Future(); + + const blockedSpeechHandle = SpeechHandle.create(); + const siblingSpeechHandle = blockedSpeechHandle; + + const blockedTask = Task.from(async () => { + await gate.await; + }); + const siblingTask = Task.from(async () => { + await gate.await; + }); + + _setActivityTaskInfo(blockedTask, { speechHandle: blockedSpeechHandle }); + _setActivityTaskInfo(siblingTask, { speechHandle: siblingSpeechHandle }); + + (activity as any).speechTasks = new Set([blockedTask, siblingTask]); + (activity as any)._drainBlockedTasks = [blockedTask]; + (activity as any)._schedulingPaused = true; + + const toWait = (activity as any).getDrainPendingSpeechTasks() as Task[]; + expect(toWait).toEqual([]); + + gate.resolve(); + await Promise.allSettled([blockedTask.result, siblingTask.result]); + }); + + it('interrupt cancels preemptive generation first', () => { + const activity = createActivityForTests(); + const preemptiveSpeech = SpeechHandle.create(); + + (activity as any)._preemptiveGeneration = { speechHandle: preemptiveSpeech } as any; + + const fut = activity.interrupt(); + + expect(preemptiveSpeech.interrupted).toBe(true); + expect((activity as any)._preemptiveGeneration).toBeUndefined(); + expect(fut.done).toBe(true); + }); +}); diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index db16674e5..656bb0a57 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -43,7 +43,13 @@ import { TTS, type TTSError } from '../tts/tts.js'; import { Future, Task, cancelAndWait, waitFor } from '../utils.js'; import { VAD, type VADEvent } from '../vad.js'; import type { Agent, ModelSettings } from './agent.js'; -import { StopResponse, asyncLocalStorage } from './agent.js'; +import { + StopResponse, + _getActivityTaskInfo, + _setActivityTaskInfo, + functionCallStorage, + speechHandleStorage, +} from './agent.js'; import { type AgentSession, type TurnDetectionMode } from './agent_session.js'; import { AudioRecognition, @@ -60,7 +66,7 @@ import { createSpeechCreatedEvent, createUserInputTranscribedEvent, } from './events.js'; -import type { ToolExecutionOutput, _TTSGenerationData } from './generation.js'; +import type { ToolExecutionOutput, ToolOutput, _TTSGenerationData } from './generation.js'; import { type _AudioOut, type _TextOut, @@ -75,7 +81,7 @@ import { import type { TimedString } from './io.js'; import { SpeechHandle } from './speech_handle.js'; -const speechHandleStorage = new AsyncLocalStorage(); +export const agentActivityStorage = new AsyncLocalStorage(); interface PreemptiveGeneration { speechHandle: SpeechHandle; @@ -88,14 +94,19 @@ interface PreemptiveGeneration { } export class AgentActivity implements RecognitionHooks { + agent: Agent; + agentSession: AgentSession; + private static readonly REPLY_TASK_CANCEL_TIMEOUT = 5000; + private started = false; private audioRecognition?: AudioRecognition; private realtimeSession?: RealtimeSession; private realtimeSpans?: Map; // Maps response_id to OTEL span for metrics recording private turnDetectionMode?: Exclude; private logger = log(); - private _draining = false; + private _schedulingPaused = true; + private _drainBlockedTasks: Task[] = []; private _currentSpeech?: SpeechHandle; private speechQueue: Heap<[number, number, SpeechHandle]>; // [priority, timestamp, speechHandle] private q_updated: Future; @@ -106,13 +117,23 @@ export class AgentActivity implements RecognitionHooks { private toolChoice: ToolChoice | null = null; private _preemptiveGeneration?: PreemptiveGeneration; - agent: Agent; - agentSession: AgentSession; - /** @internal */ _mainTask?: Task; - _userTurnCompletedTask?: Promise; - + _onEnterTask?: Task; + _onExitTask?: Task; + _userTurnCompletedTask?: Task; + + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 703-739 lines. + private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent) => + this.onGenerationCreated(ev); + private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent) => + this.onInputSpeechStarted(ev); + private readonly onRealtimeInputSpeechStopped = (ev: InputSpeechStoppedEvent) => + this.onInputSpeechStopped(ev); + private readonly onRealtimeInputAudioTranscriptionCompleted = (ev: InputTranscriptionCompleted) => + this.onInputAudioTranscriptionCompleted(ev); + private readonly onModelError = (ev: RealtimeModelError | STTError | TTSError | LLMError) => + this.onError(ev); constructor(agent: Agent, agentSession: AgentSession) { this.agent = agent; this.agentSession = agentSession; @@ -132,7 +153,7 @@ export class AgentActivity implements RecognitionHooks { if (this.turnDetectionMode === 'vad' && this.vad === undefined) { this.logger.warn( - 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDdetection setting', + 'turnDetection is set to "vad", but no VAD model is provided, ignoring the turnDetection setting', ); this.turnDetectionMode = undefined; } @@ -210,117 +231,135 @@ export class AgentActivity implements RecognitionHooks { async start(): Promise { const unlock = await this.lock.lock(); try { - // Create start_agent_activity as a ROOT span (new trace) to match Python behavior - const startSpan = tracer.startSpan({ - name: 'start_agent_activity', - attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }, - context: ROOT_CONTEXT, - }); + await this._startSession({ spanName: 'start_agent_activity', runOnEnter: true }); + } finally { + unlock(); + } + } - this.agent._agentActivity = this; + async resume(): Promise { + const unlock = await this.lock.lock(); + try { + await this._startSession({ spanName: 'resume_agent_activity', runOnEnter: false }); + } finally { + unlock(); + } + } - if (this.llm instanceof RealtimeModel) { - this.realtimeSession = this.llm.session(); - this.realtimeSpans = new Map(); - this.realtimeSession.on('generation_created', (ev) => this.onGenerationCreated(ev)); - this.realtimeSession.on('input_speech_started', (ev) => this.onInputSpeechStarted(ev)); - this.realtimeSession.on('input_speech_stopped', (ev) => this.onInputSpeechStopped(ev)); - this.realtimeSession.on('input_audio_transcription_completed', (ev) => - this.onInputAudioTranscriptionCompleted(ev), - ); - this.realtimeSession.on('metrics_collected', (ev) => this.onMetricsCollected(ev)); - this.realtimeSession.on('error', (ev) => this.onError(ev)); - - removeInstructions(this.agent._chatCtx); - try { - await this.realtimeSession.updateInstructions(this.agent.instructions); - } catch (error) { - this.logger.error(error, 'failed to update the instructions'); - } + private async _startSession(options: { + spanName: 'start_agent_activity' | 'resume_agent_activity'; + runOnEnter: boolean; + }): Promise { + const { spanName, runOnEnter } = options; + const startSpan = tracer.startSpan({ + name: spanName, + attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }, + context: ROOT_CONTEXT, + }); - try { - await this.realtimeSession.updateChatCtx(this.agent.chatCtx); - } catch (error) { - this.logger.error(error, 'failed to update the chat context'); - } + this.agent._agentActivity = this; - try { - await this.realtimeSession.updateTools(this.tools); - } catch (error) { - this.logger.error(error, 'failed to update the tools'); - } + if (this.llm instanceof RealtimeModel) { + this.realtimeSession = this.llm.session(); + this.realtimeSpans = new Map(); + this.realtimeSession.on('generation_created', this.onRealtimeGenerationCreated); + this.realtimeSession.on('input_speech_started', this.onRealtimeInputSpeechStarted); + this.realtimeSession.on('input_speech_stopped', this.onRealtimeInputSpeechStopped); + this.realtimeSession.on( + 'input_audio_transcription_completed', + this.onRealtimeInputAudioTranscriptionCompleted, + ); + this.realtimeSession.on('metrics_collected', this.onMetricsCollected); + this.realtimeSession.on('error', this.onModelError); - if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) { - this.logger.error( - 'audio output is enabled but RealtimeModel has no audio modality ' + - 'and no TTS is set. Either enable audio modality in the RealtimeModel ' + - 'or set a TTS model.', - ); - } - } else if (this.llm instanceof LLM) { - try { - updateInstructions({ - chatCtx: this.agent._chatCtx, - instructions: this.agent.instructions, - addIfMissing: true, - }); - } catch (error) { - this.logger.error('failed to update the instructions', error); - } + removeInstructions(this.agent._chatCtx); + try { + await this.realtimeSession.updateInstructions(this.agent.instructions); + } catch (error) { + this.logger.error(error, 'failed to update the instructions'); } - // metrics and error handling - if (this.llm instanceof LLM) { - this.llm.on('metrics_collected', (ev) => this.onMetricsCollected(ev)); - this.llm.on('error', (ev) => this.onError(ev)); + try { + await this.realtimeSession.updateChatCtx(this.agent.chatCtx); + } catch (error) { + this.logger.error(error, 'failed to update the chat context'); } - if (this.stt instanceof STT) { - this.stt.on('metrics_collected', (ev) => this.onMetricsCollected(ev)); - this.stt.on('error', (ev) => this.onError(ev)); + try { + await this.realtimeSession.updateTools(this.tools); + } catch (error) { + this.logger.error(error, 'failed to update the tools'); } - if (this.tts instanceof TTS) { - this.tts.on('metrics_collected', (ev) => this.onMetricsCollected(ev)); - this.tts.on('error', (ev) => this.onError(ev)); + if (!this.llm.capabilities.audioOutput && !this.tts && this.agentSession.output.audio) { + this.logger.error( + 'audio output is enabled but RealtimeModel has no audio modality ' + + 'and no TTS is set. Either enable audio modality in the RealtimeModel ' + + 'or set a TTS model.', + ); } - - if (this.vad instanceof VAD) { - this.vad.on('metrics_collected', (ev) => this.onMetricsCollected(ev)); + } else if (this.llm instanceof LLM) { + try { + updateInstructions({ + chatCtx: this.agent._chatCtx, + instructions: this.agent.instructions, + addIfMissing: true, + }); + } catch (error) { + this.logger.error('failed to update the instructions', error); } + } - this.audioRecognition = new AudioRecognition({ - recognitionHooks: this, - // Disable stt node if stt is not provided - stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined, - vad: this.vad, - turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection, - turnDetectionMode: this.turnDetectionMode, - minEndpointingDelay: this.agentSession.options.minEndpointingDelay, - maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay, - rootSpanContext: this.agentSession.rootSpanContext, - }); - this.audioRecognition.start(); - this.started = true; + // metrics and error handling + if (this.llm instanceof LLM) { + this.llm.on('metrics_collected', this.onMetricsCollected); + this.llm.on('error', this.onModelError); + } - this._mainTask = Task.from(({ signal }) => this.mainTask(signal)); + if (this.stt instanceof STT) { + this.stt.on('metrics_collected', this.onMetricsCollected); + this.stt.on('error', this.onModelError); + } - // Create on_enter as a child of start_agent_activity in the new trace - const onEnterTask = tracer.startActiveSpan(async () => this.agent.onEnter(), { - name: 'on_enter', - context: trace.setSpan(ROOT_CONTEXT, startSpan), - attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }, - }); + if (this.tts instanceof TTS) { + this.tts.on('metrics_collected', this.onMetricsCollected); + this.tts.on('error', this.onModelError); + } - this.createSpeechTask({ - task: Task.from(() => onEnterTask), + if (this.vad instanceof VAD) { + this.vad.on('metrics_collected', this.onMetricsCollected); + } + + this.audioRecognition = new AudioRecognition({ + recognitionHooks: this, + // Disable stt node if stt is not provided + stt: this.stt ? (...args) => this.agent.sttNode(...args) : undefined, + vad: this.vad, + turnDetector: typeof this.turnDetection === 'string' ? undefined : this.turnDetection, + turnDetectionMode: this.turnDetectionMode, + minEndpointingDelay: this.agentSession.options.minEndpointingDelay, + maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay, + rootSpanContext: this.agentSession.rootSpanContext, + }); + this.audioRecognition.start(); + this.started = true; + + this._resumeSchedulingTask(); + + if (runOnEnter) { + this._onEnterTask = this.createSpeechTask({ + taskFn: () => + tracer.startActiveSpan(async () => this.agent.onEnter(), { + name: 'on_enter', + context: trace.setSpan(ROOT_CONTEXT, startSpan), + attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }, + }), + inlineTask: true, name: 'AgentActivity_onEnter', }); - - startSpan.end(); - } finally { - unlock(); } + + startSpan.end(); } get currentSpeech(): SpeechHandle | undefined { @@ -347,8 +386,8 @@ export class AgentActivity implements RecognitionHooks { return this.agent.toolCtx; } - get draining(): boolean { - return this._draining; + get schedulingPaused(): boolean { + return this._schedulingPaused; } get realtimeLLMSession(): RealtimeSession | undefined { @@ -429,13 +468,20 @@ export class AgentActivity implements RecognitionHooks { this.audioStream.detachSource(); } - commitUserTurn() { + commitUserTurn( + options: { + audioDetached?: boolean; + throwIfNotReady?: boolean; + } = {}, + ) { + const { audioDetached = false, throwIfNotReady = true } = options; if (!this.audioRecognition) { - throw new Error('AudioRecognition is not initialized'); + if (throwIfNotReady) { + throw new Error('AudioRecognition is not initialized'); + } + return; } - // TODO(brian): add audio_detached flag - const audioDetached = false; this.audioRecognition.commitUserTurn(audioDetached); } @@ -493,14 +539,13 @@ export class AgentActivity implements RecognitionHooks { }), ); const task = this.createSpeechTask({ - task: Task.from((abortController: AbortController) => + taskFn: (abortController: AbortController) => this.ttsTask(handle, text, addToChatCtx, {}, abortController, audio), - ), ownedSpeechHandle: handle, name: 'AgentActivity.say_tts', }); - task.finally(() => this.onPipelineReplyDone()); + task.result.finally(() => this.onPipelineReplyDone()); this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL); return handle; } @@ -613,9 +658,9 @@ export class AgentActivity implements RecognitionHooks { return; } - if (this.draining) { + if (this.schedulingPaused) { // TODO(shubhra): should we "forward" this new turn to the next agent? - this.logger.warn('skipping new realtime generation, the agent is draining'); + this.logger.warn('skipping new realtime generation, the speech scheduling is not running'); return; } @@ -633,9 +678,8 @@ export class AgentActivity implements RecognitionHooks { this.logger.info({ speech_id: handle.id }, 'Creating speech handle'); this.createSpeechTask({ - task: Task.from((abortController: AbortController) => + taskFn: (abortController: AbortController) => this.realtimeGenerationTask(handle, ev, {}, abortController), - ), ownedSpeechHandle: handle, name: 'AgentActivity.realtimeGeneration', }); @@ -767,7 +811,7 @@ export class AgentActivity implements RecognitionHooks { onPreemptiveGeneration(info: PreemptiveGenerationInfo): void { if ( !this.agentSession.options.preemptiveGeneration || - this.draining || + this.schedulingPaused || (this._currentSpeech !== undefined && !this._currentSpeech.interrupted) || !(this.llm instanceof LLM) ) { @@ -814,11 +858,25 @@ export class AgentActivity implements RecognitionHooks { } private createSpeechTask(options: { - task: Task; + taskFn: (controller: AbortController) => Promise; + controller?: AbortController; ownedSpeechHandle?: SpeechHandle; + inlineTask?: boolean; name?: string; - }): Promise { - const { task, ownedSpeechHandle } = options; + }): Task { + const { taskFn, controller, ownedSpeechHandle, inlineTask, name } = options; + + const wrappedFn = (ctrl: AbortController) => { + return agentActivityStorage.run(this, () => { + if (ownedSpeechHandle) { + return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl)); + } + return taskFn(ctrl); + }); + }; + + const task = Task.from(wrappedFn, controller, name); + _setActivityTaskInfo(task, { speechHandle: ownedSpeechHandle, inlineTask }); this.speechTasks.add(task); task.addDoneCallback(() => { @@ -838,13 +896,16 @@ export class AgentActivity implements RecognitionHooks { this.wakeupMainTask(); }); - return task.result; + return task; } async onEndOfTurn(info: EndOfTurnInfo): Promise { - if (this.draining) { + if (this.schedulingPaused) { this.cancelPreemptiveGeneration(); - this.logger.warn({ user_input: info.newTranscript }, 'skipping user input, task is draining'); + this.logger.warn( + { user_input: info.newTranscript }, + 'skipping user input, speech scheduling is paused', + ); // TODO(shubhra): should we "forward" this new turn to the next agent/activity? return true; } @@ -877,7 +938,7 @@ export class AgentActivity implements RecognitionHooks { const oldTask = this._userTurnCompletedTask; this._userTurnCompletedTask = this.createSpeechTask({ - task: Task.from(() => this.userTurnCompleted(info, oldTask)), + taskFn: () => this.userTurnCompleted(info, oldTask), name: 'AgentActivity.userTurnCompleted', }); return true; @@ -913,10 +974,12 @@ export class AgentActivity implements RecognitionHooks { this._currentSpeech = undefined; } - // If we're draining and there are no more speech tasks, we can exit. - // Only speech tasks can bypass draining to create a tool response - if (this.draining && this.speechTasks.size === 0) { - this.logger.info('mainTask: draining and no more speech tasks'); + // if we're draining/pausing and there are no more speech tasks, we can exit. + // only speech tasks can bypass draining to create a tool response (see scheduleSpeech) + const toWait = this.getDrainPendingSpeechTasks(); + + if (this._schedulingPaused && toWait.length === 0) { + this.logger.info('mainTask: scheduling paused and no more speech tasks to wait'); break; } @@ -926,6 +989,39 @@ export class AgentActivity implements RecognitionHooks { this.logger.info('AgentActivity mainTask: exiting'); } + private getDrainPendingSpeechTasks(): Task[] { + const blockedHandles: SpeechHandle[] = []; + + for (const task of this._drainBlockedTasks) { + const info = _getActivityTaskInfo(task); + if (!info) { + this.logger.error('blocked task without activity info; skipping.'); + continue; + } + + if (!info.speechHandle) { + continue; // onEnter/onExit + } + + blockedHandles.push(info.speechHandle); + } + + const toWait: Task[] = []; + for (const task of this.speechTasks) { + if (this._drainBlockedTasks.includes(task)) { + continue; + } + + const info = _getActivityTaskInfo(task); + if (info && info.speechHandle && blockedHandles.includes(info.speechHandle)) { + continue; + } + + toWait.push(task); + } + return toWait; + } + private wakeupMainTask(): void { this.q_updated.resolve(); } @@ -967,7 +1063,7 @@ export class AgentActivity implements RecognitionHooks { throw new Error('trying to generate reply without an LLM model'); } - const functionCall = asyncLocalStorage.getStore()?.functionCall; + const functionCall = functionCallStorage.getStore()?.functionCall; if (toolChoice === undefined && functionCall !== undefined) { // when generateReply is called inside a tool, set toolChoice to 'none' by default toolChoice = 'none'; @@ -989,7 +1085,7 @@ export class AgentActivity implements RecognitionHooks { if (this.llm instanceof RealtimeModel) { this.createSpeechTask({ - task: Task.from((abortController: AbortController) => + taskFn: (abortController: AbortController) => this.realtimeReplyTask({ speechHandle: handle, // TODO(brian): support llm.ChatMessage for the realtime model @@ -1001,7 +1097,6 @@ export class AgentActivity implements RecognitionHooks { }, abortController, }), - ), ownedSpeechHandle: handle, name: 'AgentActivity.realtimeReply', }); @@ -1014,7 +1109,7 @@ export class AgentActivity implements RecognitionHooks { } const task = this.createSpeechTask({ - task: Task.from((abortController: AbortController) => + taskFn: (abortController: AbortController) => this.pipelineReplyTask( handle, chatCtx ?? this.agent.chatCtx, @@ -1026,12 +1121,11 @@ export class AgentActivity implements RecognitionHooks { instructions, userMessage, ), - ), ownedSpeechHandle: handle, name: 'AgentActivity.pipelineReply', }); - task.finally(() => this.onPipelineReplyDone()); + task.result.finally(() => this.onPipelineReplyDone()); } if (scheduleSpeech) { @@ -1040,16 +1134,19 @@ export class AgentActivity implements RecognitionHooks { return handle; } - interrupt(): Future { + interrupt(options: { force?: boolean } = {}): Future { + const { force = false } = options; + this.cancelPreemptiveGeneration(); + const future = new Future(); const currentSpeech = this._currentSpeech; //TODO(AJS-273): add interrupt for background speeches - currentSpeech?.interrupt(); + currentSpeech?.interrupt(force); for (const [_, __, speech] of this.speechQueue) { - speech.interrupt(); + speech.interrupt(force); } this.realtimeSession?.interrupt(); @@ -1072,13 +1169,13 @@ export class AgentActivity implements RecognitionHooks { } } - private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Promise): Promise { + private async userTurnCompleted(info: EndOfTurnInfo, oldTask?: Task): Promise { if (oldTask) { // We never cancel user code as this is very confusing. // So we wait for the old execution of onUserTurnCompleted to finish. // In practice this is OK because most speeches will be interrupted if a new turn // is detected. So the previous execution should complete quickly. - await oldTask; + await oldTask.result; } // When the audio recognition detects the end of a user turn: @@ -1645,52 +1742,18 @@ export class AgentActivity implements RecognitionHooks { return; } - const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({ - functionCalls: [], - functionCallOutputs: [], - }); - let shouldGenerateToolReply: boolean = false; - let newAgentTask: Agent | null = null; - let ignoreTaskSwitch: boolean = false; - - for (const sanitizedOut of toolOutput.output) { - if (sanitizedOut.toolCallOutput !== undefined) { - functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall); - functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput); - if (sanitizedOut.replyRequired) { - shouldGenerateToolReply = true; - } - } - - if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) { - this.logger.error('expected to receive only one agent task from the tool executions'); - ignoreTaskSwitch = true; - // TODO(brian): should we mark the function call as failed to notify the LLM? - } - - newAgentTask = sanitizedOut.agentTask ?? null; - - this.logger.debug( - { - speechId: speechHandle.id, - name: sanitizedOut.toolCall?.name, - args: sanitizedOut.toolCall.args, - output: sanitizedOut.toolCallOutput?.output, - isError: sanitizedOut.toolCallOutput?.isError, - }, - 'Tool call execution finished', - ); - } + const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = + this.summarizeToolExecutionOutput(toolOutput, speechHandle); this.agentSession.emit( AgentSessionEventTypes.FunctionToolsExecuted, functionToolsExecutedEvent, ); - let draining = this.draining; + let schedulingPaused = this.schedulingPaused; if (!ignoreTaskSwitch && newAgentTask !== null) { this.agentSession.updateAgent(newAgentTask); - draining = true; + schedulingPaused = true; } const toolMessages = [ @@ -1705,11 +1768,12 @@ export class AgentActivity implements RecognitionHooks { // Avoid setting tool_choice to "required" or a specific function when // passing tool response back to the LLM - const respondToolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto'; + const respondToolChoice = + schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto'; // Reuse same speechHandle for tool response (parity with Python agent_activity.py L2122-2140) const toolResponseTask = this.createSpeechTask({ - task: Task.from(() => + taskFn: () => this.pipelineReplyTask( speechHandle, chatCtx, @@ -1720,12 +1784,11 @@ export class AgentActivity implements RecognitionHooks { undefined, toolMessages, ), - ), ownedSpeechHandle: speechHandle, name: 'AgentActivity.pipelineReply', }); - toolResponseTask.finally(() => this.onPipelineReplyDone()); + toolResponseTask.result.finally(() => this.onPipelineReplyDone()); this.scheduleSpeech(speechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true); } else if (functionToolsExecutedEvent.functionCallOutputs.length > 0) { @@ -2139,50 +2202,18 @@ export class AgentActivity implements RecognitionHooks { return; } - const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({ - functionCalls: [], - functionCallOutputs: [], - }); - let shouldGenerateToolReply: boolean = false; - let newAgentTask: Agent | null = null; - let ignoreTaskSwitch: boolean = false; - - for (const sanitizedOut of toolOutput.output) { - if (sanitizedOut.toolCallOutput !== undefined) { - functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput); - if (sanitizedOut.replyRequired) { - shouldGenerateToolReply = true; - } - } - - if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) { - this.logger.error('expected to receive only one agent task from the tool executions'); - ignoreTaskSwitch = true; - } - - newAgentTask = sanitizedOut.agentTask ?? null; - - this.logger.debug( - { - speechId: speechHandle.id, - name: sanitizedOut.toolCall?.name, - args: sanitizedOut.toolCall.args, - output: sanitizedOut.toolCallOutput?.output, - isError: sanitizedOut.toolCallOutput?.isError, - }, - 'Tool call execution finished', - ); - } + const { functionToolsExecutedEvent, shouldGenerateToolReply, newAgentTask, ignoreTaskSwitch } = + this.summarizeToolExecutionOutput(toolOutput, speechHandle); this.agentSession.emit( AgentSessionEventTypes.FunctionToolsExecuted, functionToolsExecutedEvent, ); - let draining = this.draining; + let schedulingPaused = this.schedulingPaused; if (!ignoreTaskSwitch && newAgentTask !== null) { this.agentSession.updateAgent(newAgentTask); - draining = true; + schedulingPaused = true; } if (functionToolsExecutedEvent.functionCallOutputs.length > 0) { @@ -2238,15 +2269,14 @@ export class AgentActivity implements RecognitionHooks { }), ); - const toolChoice = draining || modelSettings.toolChoice === 'none' ? 'none' : 'auto'; + const toolChoice = schedulingPaused || modelSettings.toolChoice === 'none' ? 'none' : 'auto'; this.createSpeechTask({ - task: Task.from((abortController: AbortController) => + taskFn: (abortController: AbortController) => this.realtimeReplyTask({ speechHandle: replySpeechHandle, modelSettings: { toolChoice }, abortController, }), - ), ownedSpeechHandle: replySpeechHandle, name: 'AgentActivity.realtime_reply', }); @@ -2254,6 +2284,53 @@ export class AgentActivity implements RecognitionHooks { this.scheduleSpeech(replySpeechHandle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true); } + private summarizeToolExecutionOutput(toolOutput: ToolOutput, speechHandle: SpeechHandle) { + const functionToolsExecutedEvent = createFunctionToolsExecutedEvent({ + functionCalls: [], + functionCallOutputs: [], + }); + + let shouldGenerateToolReply = false; + let newAgentTask: Agent | null = null; + let ignoreTaskSwitch = false; + + for (const sanitizedOut of toolOutput.output) { + if (sanitizedOut.toolCallOutput !== undefined) { + // Keep event payload symmetric for pipeline + realtime paths. + functionToolsExecutedEvent.functionCalls.push(sanitizedOut.toolCall); + functionToolsExecutedEvent.functionCallOutputs.push(sanitizedOut.toolCallOutput); + if (sanitizedOut.replyRequired) { + shouldGenerateToolReply = true; + } + } + + if (newAgentTask !== null && sanitizedOut.agentTask !== undefined) { + this.logger.error('expected to receive only one agent task from the tool executions'); + ignoreTaskSwitch = true; + } + + newAgentTask = sanitizedOut.agentTask ?? null; + + this.logger.debug( + { + speechId: speechHandle.id, + name: sanitizedOut.toolCall?.name, + args: sanitizedOut.toolCall.args, + output: sanitizedOut.toolCallOutput?.output, + isError: sanitizedOut.toolCallOutput?.isError, + }, + 'Tool call execution finished', + ); + } + + return { + functionToolsExecutedEvent, + shouldGenerateToolReply, + newAgentTask, + ignoreTaskSwitch, + }; + } + private async realtimeReplyTask({ speechHandle, modelSettings: { toolChoice }, @@ -2312,10 +2389,10 @@ export class AgentActivity implements RecognitionHooks { priority: number, force: boolean = false, ): void { - // when force=true, we allow tool responses to bypass draining + // when force=true, we allow tool responses to bypass scheduling pause // This allows for tool responses to be generated before the AgentActivity is finalized - if (this.draining && !force) { - throw new Error('cannot schedule new speech, the agent is draining'); + if (this.schedulingPaused && !force) { + throw new Error('cannot schedule new speech, the speech scheduling is draining/pausing'); } // Monotonic time to avoid near 0 collisions @@ -2324,6 +2401,48 @@ export class AgentActivity implements RecognitionHooks { this.wakeupMainTask(); } + private async _pauseSchedulingTask(blockedTasks: Task[]): Promise { + if (this._schedulingPaused) return; + + this._schedulingPaused = true; + this._drainBlockedTasks = blockedTasks; + this.wakeupMainTask(); + + if (this._mainTask) { + // When pausing/draining, we ensure that all speech_tasks complete fully. + // This means that even if the SpeechHandle themselves have finished, + // we still wait for the entire execution (e.g function_tools) + await this._mainTask.result; + } + } + + private _resumeSchedulingTask(): void { + if (!this._schedulingPaused) return; + + this._schedulingPaused = false; + this._mainTask = Task.from(({ signal }) => this.mainTask(signal)); + } + + async pause(options: { blockedTasks?: Task[] } = {}): Promise { + const { blockedTasks = [] } = options; + const unlock = await this.lock.lock(); + + try { + const span = tracer.startSpan({ + name: 'pause_agent_activity', + attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }, + }); + try { + await this._pauseSchedulingTask(blockedTasks); + await this._closeSessionResources(); + } finally { + span.end(); + } + } finally { + unlock(); + } + } + async drain(): Promise { // Create drain_agent_activity as a ROOT span (new trace) to match Python behavior return tracer.startActiveSpan(async (span) => this._drainImpl(span), { @@ -2337,23 +2456,23 @@ export class AgentActivity implements RecognitionHooks { const unlock = await this.lock.lock(); try { - if (this._draining) return; - - this.cancelPreemptiveGeneration(); - - const onExitTask = tracer.startActiveSpan(async () => this.agent.onExit(), { - name: 'on_exit', - attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }, - }); - - this.createSpeechTask({ - task: Task.from(() => onExitTask), + if (this._schedulingPaused) return; + + // Ref: python agent_activity.py 629-632 + this._onExitTask = this.createSpeechTask({ + taskFn: () => + tracer.startActiveSpan(async () => this.agent.onExit(), { + name: 'on_exit', + attributes: { [traceTypes.ATTR_AGENT_LABEL]: this.agent.id }, + }), + inlineTask: true, name: 'AgentActivity_onExit', }); - this.wakeupMainTask(); - this._draining = true; - await this._mainTask?.result; + this.cancelPreemptiveGeneration(); + + await this._onExitTask.result; + await this._pauseSchedulingTask([]); } finally { unlock(); } @@ -2362,44 +2481,59 @@ export class AgentActivity implements RecognitionHooks { async close(): Promise { const unlock = await this.lock.lock(); try { - if (!this._draining) { - this.logger.warn('task closing without draining'); - } - this.cancelPreemptiveGeneration(); - // Unregister event handlers to prevent duplicate metrics - if (this.llm instanceof LLM) { - this.llm.off('metrics_collected', this.onMetricsCollected); - } - if (this.realtimeSession) { - this.realtimeSession.off('generation_created', this.onGenerationCreated); - this.realtimeSession.off('input_speech_started', this.onInputSpeechStarted); - this.realtimeSession.off('input_speech_stopped', this.onInputSpeechStopped); - this.realtimeSession.off( - 'input_audio_transcription_completed', - this.onInputAudioTranscriptionCompleted, - ); - this.realtimeSession.off('metrics_collected', this.onMetricsCollected); - } - if (this.stt instanceof STT) { - this.stt.off('metrics_collected', this.onMetricsCollected); - } - if (this.tts instanceof TTS) { - this.tts.off('metrics_collected', this.onMetricsCollected); - } - if (this.vad instanceof VAD) { - this.vad.off('metrics_collected', this.onMetricsCollected); + await this._closeSessionResources(); + + if (this._mainTask) { + await this._mainTask.cancelAndWait(); } - this.detachAudioInput(); - this.realtimeSpans?.clear(); - await this.realtimeSession?.close(); - await this.audioRecognition?.close(); - await this._mainTask?.cancelAndWait(); + this.agent._agentActivity = undefined; } finally { unlock(); } } + + private async _closeSessionResources(): Promise { + // Unregister event handlers to prevent duplicate metrics + if (this.llm instanceof LLM) { + this.llm.off('metrics_collected', this.onMetricsCollected); + this.llm.off('error', this.onModelError); + } + + if (this.realtimeSession) { + this.realtimeSession.off('generation_created', this.onRealtimeGenerationCreated); + this.realtimeSession.off('input_speech_started', this.onRealtimeInputSpeechStarted); + this.realtimeSession.off('input_speech_stopped', this.onRealtimeInputSpeechStopped); + this.realtimeSession.off( + 'input_audio_transcription_completed', + this.onRealtimeInputAudioTranscriptionCompleted, + ); + this.realtimeSession.off('metrics_collected', this.onMetricsCollected); + this.realtimeSession.off('error', this.onModelError); + } + + if (this.stt instanceof STT) { + this.stt.off('metrics_collected', this.onMetricsCollected); + this.stt.off('error', this.onModelError); + } + + if (this.tts instanceof TTS) { + this.tts.off('metrics_collected', this.onMetricsCollected); + this.tts.off('error', this.onModelError); + } + + if (this.vad instanceof VAD) { + this.vad.off('metrics_collected', this.onMetricsCollected); + } + + this.detachAudioInput(); + this.realtimeSpans?.clear(); + await this.realtimeSession?.close(); + await this.audioRecognition?.close(); + this.realtimeSession = undefined; + this.audioRecognition = undefined; + } } function toOaiToolChoice(toolChoice: ToolChoice | null): ToolChoice | undefined { diff --git a/agents/src/voice/agent_session.test.ts b/agents/src/voice/agent_session.test.ts new file mode 100644 index 000000000..aa3c664a8 --- /dev/null +++ b/agents/src/voice/agent_session.test.ts @@ -0,0 +1,171 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it, vi } from 'vitest'; +import { initializeLogger } from '../log.js'; +import { Future, Task } from '../utils.js'; +import { Agent } from './agent.js'; +import { AgentActivity } from './agent_activity.js'; +import { AgentSession } from './agent_session.js'; +import { CloseReason } from './events.js'; +import { SpeechHandle } from './speech_handle.js'; + +initializeLogger({ pretty: false, level: 'error' }); + +describe('AgentSession', () => { + it('serializes updateAgent transitions and watches run-state tasks', async () => { + const session = new AgentSession({}); + const agent1 = new Agent({ instructions: 'agent one' }); + const agent2 = new Agent({ instructions: 'agent two' }); + + (session as any).started = true; + const order: string[] = []; + + let firstCall = true; + (session as any)._updateActivity = vi.fn(async (agent: Agent) => { + order.push(`start:${agent.id}`); + if (firstCall) { + firstCall = false; + await new Promise((resolve) => setTimeout(resolve, 20)); + } + order.push(`end:${agent.id}`); + }); + + const watchHandle = vi.fn(); + (session as any)._globalRunState = { _watchHandle: watchHandle }; + + session.updateAgent(agent1); + session.updateAgent(agent2); + + await ((session as any).updateActivityTask as { result: Promise }).result; + + expect(order).toEqual([ + `start:${agent1.id}`, + `end:${agent1.id}`, + `start:${agent2.id}`, + `end:${agent2.id}`, + ]); + expect(watchHandle).toHaveBeenCalledTimes(2); + }); + + it('routes say() to nextActivity when current activity is paused', () => { + const session = new AgentSession({}); + const handle = SpeechHandle.create(); + + const pausedActivity = { + schedulingPaused: true, + say: vi.fn(() => { + throw new Error('should not call paused activity say()'); + }), + }; + const nextActivity = { + say: vi.fn(() => handle), + }; + + const watchHandle = vi.fn(); + + (session as any).activity = pausedActivity; + (session as any).nextActivity = nextActivity; + (session as any)._globalRunState = { _watchHandle: watchHandle }; + + const result = session.say('hello'); + + expect(result).toBe(handle); + expect(nextActivity.say).toHaveBeenCalledTimes(1); + expect(pausedActivity.say).not.toHaveBeenCalled(); + expect(watchHandle).toHaveBeenCalledWith(handle); + }); + + it('forces interrupt and commits user turn during non-error close', async () => { + const session = new AgentSession({}); + (session as any).started = true; + + const interruptFuture = new Future(); + interruptFuture.resolve(); + + const activity = { + interrupt: vi.fn(() => interruptFuture), + drain: vi.fn(async () => {}), + currentSpeech: { waitForPlayout: vi.fn(async () => {}) }, + commitUserTurn: vi.fn(), + detachAudioInput: vi.fn(), + close: vi.fn(async () => {}), + }; + + (session as any).activity = activity; + await (session as any).closeImplInner(CloseReason.USER_INITIATED, null, false); + + expect(activity.interrupt).toHaveBeenCalledWith({ force: true }); + expect(activity.commitUserTurn).toHaveBeenCalledWith({ + audioDetached: true, + throwIfNotReady: false, + }); + expect(activity.drain).toHaveBeenCalledTimes(1); + expect(activity.close).toHaveBeenCalledTimes(1); + }); + + it('does not commit user turn during error close', async () => { + const session = new AgentSession({}); + (session as any).started = true; + + const interruptFuture = new Future(); + interruptFuture.resolve(); + + const activity = { + interrupt: vi.fn(() => interruptFuture), + drain: vi.fn(async () => {}), + currentSpeech: { waitForPlayout: vi.fn(async () => {}) }, + commitUserTurn: vi.fn(), + detachAudioInput: vi.fn(), + close: vi.fn(async () => {}), + }; + + (session as any).activity = activity; + await (session as any).closeImplInner(CloseReason.ERROR, null, false); + + expect(activity.commitUserTurn).not.toHaveBeenCalled(); + }); + + it('forwards force option through session interrupt()', () => { + const session = new AgentSession({}); + const interruptFuture = new Future(); + const activity = { + interrupt: vi.fn(() => interruptFuture), + }; + + (session as any).activity = activity; + const returned = session.interrupt({ force: true }); + + expect(returned).toBe(interruptFuture); + expect(activity.interrupt).toHaveBeenCalledWith({ force: true }); + }); + + it('honors waitOnEnter by awaiting onEnter task completion', async () => { + const session = new AgentSession({}); + const agent = new Agent({ instructions: 'wait on enter agent' }); + const previousAgent = new Agent({ instructions: 'previous agent' }); + + (session as any).activity = { + agent: previousAgent, + drain: vi.fn(async () => {}), + close: vi.fn(async () => {}), + }; + + const startSpy = vi.spyOn(AgentActivity.prototype, 'start').mockImplementation(async function ( + this: AgentActivity, + ) { + this._onEnterTask = Task.from(async () => { + await new Promise((resolve) => setTimeout(resolve, 20)); + }); + }); + + const startedAt = Date.now(); + await (session as any)._updateActivity(agent, { waitOnEnter: true }); + const elapsed = Date.now() - startedAt; + + expect(startSpy).toHaveBeenCalledTimes(1); + expect(elapsed).toBeGreaterThanOrEqual(15); + + startSpy.mockRestore(); + }); +}); diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index 35e94cd99..23282604e 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -1,12 +1,14 @@ // SPDX-FileCopyrightText: 2024 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 +import { Mutex } from '@livekit/mutex'; import type { AudioFrame, Room } from '@livekit/rtc-node'; import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter'; import type { Context, Span } from '@opentelemetry/api'; import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api'; import { EventEmitter } from 'node:events'; import type { ReadableStream } from 'node:stream/web'; +import { z } from 'zod'; import { LLM as InferenceLLM, STT as InferenceSTT, @@ -31,6 +33,7 @@ import { type ResolvedSessionConnectOptions, type SessionConnectOptions, } from '../types.js'; +import { Task } from '../utils.js'; import type { VAD } from '../vad.js'; import type { Agent } from './agent.js'; import { AgentActivity } from './agent_activity.js'; @@ -114,6 +117,13 @@ export type AgentSessionOptions = { connOptions?: SessionConnectOptions; }; +type ActivityTransitionOptions = { + previousActivity?: 'close' | 'pause'; + newActivity?: 'start' | 'resume'; + blockedTasks?: Task[]; + waitOnEnter?: boolean; +}; + export class AgentSession< UserData = UnknownUserData, > extends (EventEmitter as new () => TypedEmitter) { @@ -128,8 +138,10 @@ export class AgentSession< private agent?: Agent; private activity?: AgentActivity; private nextActivity?: AgentActivity; + private updateActivityTask?: Task; private started = false; private userState: UserState = 'listening'; + private readonly activityLock = new Mutex(); private roomIO?: RoomIO; private logger = log(); @@ -358,7 +370,8 @@ export class AgentSession< } // TODO(AJS-265): add shutdown callback to job context - tasks.push(this.updateActivity(this.agent)); + // Initial start does not wait on onEnter + tasks.push(this._updateActivity(this.agent, { waitOnEnter: false })); await Promise.allSettled(tasks); @@ -430,8 +443,34 @@ export class AgentSession< updateAgent(agent: Agent): void { this.agent = agent; - if (this.started) { - this.updateActivity(agent); + if (!this.started) { + return; + } + + const _updateActivityTask = async (oldTask: Task | undefined, agent: Agent) => { + if (oldTask) { + try { + await oldTask.result; + } catch (error) { + this.logger.error(error, 'previous updateAgent transition failed'); + } + } + + await this._updateActivity(agent); + }; + + const oldTask = this.updateActivityTask; + this.updateActivityTask = Task.from( + async () => _updateActivityTask(oldTask, agent), + undefined, + 'AgentSession_updateActivityTask', + ); + + const runState = this._globalRunState; + if (runState) { + // Don't mark the RunResult as done, if there is currently an agent transition happening. + // (used to make sure we're correctly adding the AgentHandoffResult before completion) + runState._watchHandle(this.updateActivityTask); } } @@ -462,24 +501,43 @@ export class AgentSession< throw new Error('AgentSession is not running'); } - const doSay = (activity: AgentActivity) => { + // Ref: python agent_session.py 907-927 + const doSay = (activity: AgentActivity, nextActivity?: AgentActivity) => { + if (activity.schedulingPaused) { + if (!nextActivity) { + throw new Error('AgentSession is closing, cannot use say()'); + } + return nextActivity.say(text, options); + } return activity.say(text, options); }; + const runState = this._globalRunState; + let handle: SpeechHandle; + // attach to the session span if called outside of the AgentSession const activeSpan = trace.getActiveSpan(); if (!activeSpan && this.rootSpanContext) { - return otelContext.with(this.rootSpanContext, () => doSay(this.activity!)); + handle = otelContext.with(this.rootSpanContext, () => + doSay(this.activity!, this.nextActivity), + ); + } else { + handle = doSay(this.activity, this.nextActivity); } - return doSay(this.activity); + if (runState) { + runState._watchHandle(handle); + } + + return handle; } - interrupt() { + interrupt(options?: { force?: boolean }) { if (!this.activity) { throw new Error('AgentSession is not running'); } - return this.activity.interrupt(); + + return this.activity.interrupt(options); } generateReply(options?: { @@ -500,7 +558,7 @@ export class AgentSession< : undefined; const doGenerateReply = (activity: AgentActivity, nextActivity?: AgentActivity) => { - if (activity.draining) { + if (activity.schedulingPaused) { if (!nextActivity) { throw new Error('AgentSession is closing, cannot use generateReply()'); } @@ -540,53 +598,115 @@ export class AgentSession< * result.expect.noMoreEvents(); * ``` * - * @param options - Run options including user input + * @param options - Run options including user input and optional output type * @returns A RunResult that resolves when the agent finishes responding - * - * TODO: Add outputType parameter for typed outputs (parity with Python) */ - run(options: { userInput: string }): RunResult { + run({ + userInput, + outputType, + }: { + userInput: string; + outputType?: z.ZodType; + }): RunResult { if (this._globalRunState && !this._globalRunState.done()) { throw new Error('nested runs are not supported'); } - const runState = new RunResult({ userInput: options.userInput }); + const runState = new RunResult({ + userInput, + outputType, + }); + this._globalRunState = runState; - this.generateReply({ userInput: options.userInput }); + this.generateReply({ userInput }); return runState; } - private async updateActivity(agent: Agent): Promise { + /** @internal */ + async _updateActivity(agent: Agent, options: ActivityTransitionOptions = {}): Promise { + const { previousActivity = 'close', newActivity = 'start', blockedTasks = [] } = options; + const waitOnEnter = options.waitOnEnter ?? newActivity === 'start'; + const runWithContext = async () => { - // TODO(AJS-129): add lock to agent activity core lifecycle - this.nextActivity = new AgentActivity(agent, this); + const unlock = await this.activityLock.lock(); + let onEnterTask: Task | undefined; - const previousActivity = this.activity; + try { + this.agent = agent; + const prevActivityObj = this.activity; + + if (newActivity === 'start') { + const prevAgent = prevActivityObj?.agent; + if ( + agent._agentActivity && + // allow updating the same agent that is running + (agent !== prevAgent || previousActivity !== 'close') + ) { + throw new Error('Cannot start agent: an activity is already running'); + } + this.nextActivity = new AgentActivity(agent, this); + } else if (newActivity === 'resume') { + if (!agent._agentActivity) { + throw new Error('Cannot resume agent: no existing activity to resume'); + } + this.nextActivity = agent._agentActivity; + } - if (this.activity) { - await this.activity.drain(); - await this.activity.close(); - } + if (prevActivityObj && prevActivityObj !== this.nextActivity) { + if (previousActivity === 'pause') { + await prevActivityObj.pause({ blockedTasks }); + } else { + await prevActivityObj.drain(); + await prevActivityObj.close(); + } + } else { + throw new Error('Invalid activity transition'); + } - this.activity = this.nextActivity; - this.nextActivity = undefined; + this.activity = this.nextActivity; + this.nextActivity = undefined; - this._chatCtx.insert( - new AgentHandoffItem({ - oldAgentId: previousActivity?.agent.id, + const runState = this._globalRunState; + const handoffItem = new AgentHandoffItem({ + oldAgentId: prevActivityObj?.agent.id, newAgentId: agent.id, - }), - ); - this.logger.debug( - { previousAgentId: previousActivity?.agent.id, newAgentId: agent.id }, - 'Agent handoff inserted into chat context', - ); + }); - await this.activity.start(); + if (runState) { + runState._agentHandoff({ + item: handoffItem, + oldAgent: prevActivityObj?.agent, + newAgent: this.activity!.agent, + }); + } - if (this._input.audio) { - this.activity.attachAudioInput(this._input.audio.stream); + this._chatCtx.insert(handoffItem); + this.logger.debug( + { previousAgentId: prevActivityObj?.agent.id, newAgentId: agent.id }, + 'Agent handoff inserted into chat context', + ); + + if (newActivity === 'start') { + await this.activity!.start(); + } else { + await this.activity!.resume(); + } + + onEnterTask = this.activity!._onEnterTask; + + if (this._input.audio) { + this.activity!.attachAudioInput(this._input.audio.stream); + } + } finally { + unlock(); + } + + if (waitOnEnter) { + if (!onEnterTask) { + throw new Error('expected onEnter task to be available while waitOnEnter=true'); + } + await onEnterTask.result; } }; @@ -836,15 +956,21 @@ export class AgentSession< if (this.activity) { if (!drain) { try { - this.activity.interrupt(); + await this.activity.interrupt({ force: true }).await; } catch (error) { - // TODO(shubhra): force interrupt or wait for it to finish? - // it might be an audio played from the error callback + // Uninterruptible speech can throw during forced interruption. + this.logger.warn({ error }, 'Error interrupting activity'); } } + await this.activity.drain(); // wait any uninterruptible speech to finish await this.activity.currentSpeech?.waitForPlayout(); + + if (reason !== CloseReason.ERROR) { + this.activity.commitUserTurn({ audioDetached: true, throwIfNotReady: false }); + } + try { this.activity.detachAudioInput(); } catch (error) { diff --git a/agents/src/voice/generation.ts b/agents/src/voice/generation.ts index fd274a66e..17bdecbc7 100644 --- a/agents/src/voice/generation.ts +++ b/agents/src/voice/generation.ts @@ -26,7 +26,13 @@ import { IdentityTransform } from '../stream/identity_transform.js'; import { traceTypes, tracer } from '../telemetry/index.js'; import { USERDATA_TIMED_TRANSCRIPT } from '../types.js'; import { Future, Task, shortuuid, toError, waitForAbort } from '../utils.js'; -import { type Agent, type ModelSettings, asyncLocalStorage, isStopResponse } from './agent.js'; +import { + type Agent, + type ModelSettings, + _setActivityTaskInfo, + functionCallStorage, + isStopResponse, +} from './agent.js'; import type { AgentSession } from './agent_session.js'; import { AudioOutput, @@ -719,7 +725,7 @@ export interface _AudioOut { async function forwardAudio( ttsStream: ReadableStream, - audioOuput: AudioOutput, + audioOutput: AudioOutput, out: _AudioOut, signal?: AbortSignal, ): Promise { @@ -733,8 +739,8 @@ async function forwardAudio( }; try { - audioOuput.on(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted); - audioOuput.resume(); + audioOutput.on(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted); + audioOutput.resume(); while (true) { if (signal?.aborted) { @@ -748,36 +754,36 @@ async function forwardAudio( if ( !out.firstFrameFut.done && - audioOuput.sampleRate && - audioOuput.sampleRate !== frame.sampleRate && + audioOutput.sampleRate && + audioOutput.sampleRate !== frame.sampleRate && !resampler ) { - resampler = new AudioResampler(frame.sampleRate, audioOuput.sampleRate, 1); + resampler = new AudioResampler(frame.sampleRate, audioOutput.sampleRate, 1); } if (resampler) { for (const f of resampler.push(frame)) { - await audioOuput.captureFrame(f); + await audioOutput.captureFrame(f); } } else { - await audioOuput.captureFrame(frame); + await audioOutput.captureFrame(frame); } } if (resampler) { for (const f of resampler.flush()) { - await audioOuput.captureFrame(f); + await audioOutput.captureFrame(f); } } } finally { - audioOuput.off(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted); + audioOutput.off(AudioOutput.EVENT_PLAYBACK_STARTED, onPlaybackStarted); if (!out.firstFrameFut.done) { out.firstFrameFut.reject(new Error('audio forwarding cancelled before playback started')); } reader?.releaseLock(); - audioOuput.flush(); + audioOutput.flush(); } } @@ -836,7 +842,7 @@ export function performToolExecutions({ const signal = controller.signal; const reader = toolCallStream.getReader(); - const tasks: Promise[] = []; + const tasks: Task[] = []; while (!signal.aborted) { const { done, value: toolCall } = await reader.read(); if (signal.aborted) break; @@ -929,7 +935,7 @@ export function performToolExecutions({ 'Executing LLM tool call', ); - const toolExecution = asyncLocalStorage.run({ functionCall: toolCall }, async () => { + const toolExecution = functionCallStorage.run({ functionCall: toolCall }, async () => { return await tool.execute(parsedArgs, { ctx: new RunContext(session, speechHandle, toolCall), toolCallId: toolCall.callId, @@ -993,11 +999,24 @@ export function performToolExecutions({ name: 'function_tool', }); + const toolTask = Task.from( + async () => { + await tracableToolExecution(toolExecution); + }, + controller, + `performToolExecution:${toolCall.name}`, + ); + + _setActivityTaskInfo(toolTask, { + speechHandle, + functionCall: toolCall, + inlineTask: true, + }); // wait, not cancelling all tool calling tasks - tasks.push(tracableToolExecution(toolExecution)); + tasks.push(toolTask); } - await Promise.allSettled(tasks); + await Promise.allSettled(tasks.map((task) => task.result)); if (toolOutput.output.length > 0) { logger.debug( { diff --git a/agents/src/voice/index.ts b/agents/src/voice/index.ts index 655e846b6..947013336 100644 --- a/agents/src/voice/index.ts +++ b/agents/src/voice/index.ts @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: 2025 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 -export { Agent, StopResponse, type AgentOptions, type ModelSettings } from './agent.js'; +export { Agent, AgentTask, StopResponse, type AgentOptions, type ModelSettings } from './agent.js'; export { AgentSession, type AgentSessionOptions, type VoiceOptions } from './agent_session.js'; export * from './avatar/index.js'; export * from './background_audio.js'; diff --git a/agents/src/voice/speech_handle.ts b/agents/src/voice/speech_handle.ts index e491b3b99..b5e241f40 100644 --- a/agents/src/voice/speech_handle.ts +++ b/agents/src/voice/speech_handle.ts @@ -5,7 +5,7 @@ import type { Context } from '@opentelemetry/api'; import type { ChatItem } from '../llm/index.js'; import type { Task } from '../utils.js'; import { Event, Future, shortuuid } from '../utils.js'; -import { asyncLocalStorage } from './agent.js'; +import { functionCallStorage } from './agent.js'; /** Symbol used to identify SpeechHandle instances */ const SPEECH_HANDLE_SYMBOL = Symbol.for('livekit.agents.SpeechHandle'); @@ -46,6 +46,9 @@ export class SpeechHandle { /** @internal - OpenTelemetry context for the agent turn span */ _agentTurnContext?: Context; + /** @internal - used by AgentTask/RunResult final output plumbing */ + _maybeRunFinalOutput?: unknown; + private itemAddedCallbacks: Set<(item: ChatItem) => void> = new Set(); private doneCallbacks: Set<(sh: SpeechHandle) => void> = new Set(); @@ -148,7 +151,7 @@ export class SpeechHandle { * has entirely played out, including any tool calls and response follow-ups. */ async waitForPlayout(): Promise { - const store = asyncLocalStorage.getStore(); + const store = functionCallStorage.getStore(); if (store && store?.functionCall) { throw new Error( `Cannot call 'SpeechHandle.waitForPlayout()' from inside the function tool '${store.functionCall.name}'. ` + diff --git a/agents/src/voice/testing/run_result.test.ts b/agents/src/voice/testing/run_result.test.ts new file mode 100644 index 000000000..ffb40c315 --- /dev/null +++ b/agents/src/voice/testing/run_result.test.ts @@ -0,0 +1,76 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it } from 'vitest'; +import { z } from 'zod'; +import { ChatMessage } from '../../llm/chat_context.js'; +import { initializeLogger } from '../../log.js'; +import { SpeechHandle } from '../speech_handle.js'; +import { RunResult } from './run_result.js'; + +initializeLogger({ pretty: false, level: 'error' }); + +describe('RunResult', () => { + it('removes speech item callback when unwatching a handle', () => { + const result = new RunResult(); + const handle = SpeechHandle.create(); + + result._watchHandle(handle); + result._unwatchHandle(handle); + + const message = ChatMessage.create({ + role: 'assistant', + content: 'hello', + }); + handle._itemAdded([message]); + + expect(result.events).toHaveLength(0); + + // Done callback is removed too, so run should not complete automatically. + handle._markDone(); + expect(result.done()).toBe(false); + + // Mirrors AgentTask.run() calling _markDoneIfNeeded() after unwatch. + result._markDoneIfNeeded(); + expect(result.done()).toBe(true); + }); + + it('exposes finalOutput when output type matches', async () => { + const result = new RunResult({ outputType: z.string() }); + const handle = SpeechHandle.create(); + + result._watchHandle(handle); + handle._maybeRunFinalOutput = 'ok'; + handle._markDone(); + + await result.wait(); + expect(result.finalOutput).toBe('ok'); + }); + + it('rejects run when final output type mismatches expected outputType', async () => { + const result = new RunResult({ outputType: z.number() }); + const handle = SpeechHandle.create(); + + result._watchHandle(handle); + handle._maybeRunFinalOutput = 'not a number'; + handle._markDone(); + + await expect(result.wait()).rejects.toThrow('Expected output matching provided zod schema'); + }); + + it('rejects run when final output is an error', async () => { + const result = new RunResult(); + const handle = SpeechHandle.create(); + + result._watchHandle(handle); + handle._maybeRunFinalOutput = new Error('boom'); + handle._markDone(); + + await expect(result.wait()).rejects.toThrow('boom'); + }); + + it('throws when accessing finalOutput before completion', () => { + const result = new RunResult(); + expect(() => result.finalOutput).toThrow('cannot retrieve finalOutput, RunResult is not done'); + }); +}); diff --git a/agents/src/voice/testing/run_result.ts b/agents/src/voice/testing/run_result.ts index ea9f1d994..6887c500c 100644 --- a/agents/src/voice/testing/run_result.ts +++ b/agents/src/voice/testing/run_result.ts @@ -30,6 +30,8 @@ import { // Type for agent constructor (used in assertions) // eslint-disable-next-line @typescript-eslint/no-explicit-any type AgentConstructor = new (...args: any[]) => Agent; +// In JS we use a zod schema so runtime validation and TS generic inference stay aligned. +type OutputSchema = z.ZodType; // Environment variable for verbose output const evalsVerbose = parseInt(process.env.LIVEKIT_EVALS_VERBOSE || '0', 10); @@ -48,19 +50,21 @@ export class RunResult { private _events: RunEvent[] = []; private doneFut = new Future(); private userInput?: string; + private outputType?: OutputSchema; + private finalOutputValue?: T; + private hasFinalOutput = false; private handles: Set> = new Set(); private lastSpeechHandle?: SpeechHandle; private runAssert?: RunAssert; + // Store per-handle closures so _unwatchHandle can remove callbacks symmetrically. + private doneCallbacks = new Map, () => void>(); - // TODO(brian): Add typed output support for parity with Python - // - Add outputType?: new (...args: unknown[]) => T - // - Add finalOutput?: T - // - Implement markDone() to extract final_output from SpeechHandle.maybeRunFinalOutput - // - See Python: run_result.py lines 182-201 + private readonly itemAddedCallback = (item: ChatItem) => this._itemAdded(item); - constructor(options?: { userInput?: string }) { + constructor(options?: { userInput?: string; outputType?: OutputSchema }) { this.userInput = options?.userInput; + this.outputType = options?.outputType; } /** @@ -92,12 +96,17 @@ export class RunResult { /** * Returns the final output of the run after completion. - * - * @throws Error - Not implemented yet. */ get finalOutput(): T { - // TODO(brian): Implement typed output support after AgentTask is implemented. - throw new Error('finalOutput is not yet implemented in JS.'); + if (!this.doneFut.done) { + throw new Error('cannot retrieve finalOutput, RunResult is not done'); + } + + if (!this.hasFinalOutput) { + throw new Error('no final output'); + } + + return this.finalOutputValue as T; } /** @@ -167,15 +176,18 @@ export class RunResult { * Watch a speech handle or task for completion. */ _watchHandle(handle: SpeechHandle | Task): void { + if (this.handles.has(handle)) return; + this.handles.add(handle); if (isSpeechHandle(handle)) { - handle._addItemAddedCallback(this._itemAdded.bind(this)); + handle._addItemAddedCallback(this.itemAddedCallback); } - handle.addDoneCallback(() => { - this._markDoneIfNeeded(handle); - }); + const doneCallback = () => this._markDoneIfNeeded(handle); + + this.doneCallbacks.set(handle, doneCallback); + handle.addDoneCallback(doneCallback); } /** @@ -184,13 +196,20 @@ export class RunResult { */ _unwatchHandle(handle: SpeechHandle | Task): void { this.handles.delete(handle); + const doneCallback = this.doneCallbacks.get(handle); + + if (doneCallback) { + handle.removeDoneCallback(doneCallback); + this.doneCallbacks.delete(handle); + } if (isSpeechHandle(handle)) { - handle._removeItemAddedCallback(this._itemAdded.bind(this)); + handle._removeItemAddedCallback(this.itemAddedCallback); } } - private _markDoneIfNeeded(handle: SpeechHandle | Task): void { + /** @internal */ + _markDoneIfNeeded(handle?: SpeechHandle | Task | null): void { if (isSpeechHandle(handle)) { this.lastSpeechHandle = handle; } @@ -201,14 +220,42 @@ export class RunResult { } private _markDone(): void { - // TODO(brian): Implement final output support after AgentTask is implemented. - // See Python run_result.py _mark_done() for reference: - // - Check lastSpeechHandle._maybeRunFinalOutput - // - Validate output type matches expected type - // - Set exception or resolve based on output - if (!this.doneFut.done) { + if (this.doneFut.done) { + return; + } + + if (!this.lastSpeechHandle) { + this.doneFut.resolve(); + return; + } + + const finalOutput = this.lastSpeechHandle._maybeRunFinalOutput; + if (finalOutput instanceof Error) { + this.doneFut.reject(finalOutput); + return; + } + + if (this.outputType) { + // Python does `isinstance(final_output, output_type)`. + // JS uses zod schema validation and takes the parsed value as the typed final output. + const result = this.outputType.safeParse(finalOutput); + if (!result.success) { + this.doneFut.reject( + new Error(`Expected output matching provided zod schema: ${result.error.message}`), + ); + return; + } + this.finalOutputValue = result.data; + this.hasFinalOutput = true; this.doneFut.resolve(); + return; + } + + if (finalOutput !== undefined) { + this.finalOutputValue = finalOutput as T; + this.hasFinalOutput = true; } + this.doneFut.resolve(); } /** diff --git a/examples/src/agent_task_survey.ts b/examples/src/agent_task_survey.ts new file mode 100644 index 000000000..db460052e --- /dev/null +++ b/examples/src/agent_task_survey.ts @@ -0,0 +1,92 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { + type JobContext, + type JobProcess, + WorkerOptions, + cli, + defineAgent, + inference, + llm, + voice, +} from '@livekit/agents'; +import * as silero from '@livekit/agents-plugin-silero'; +import { fileURLToPath } from 'node:url'; +import { z } from 'zod'; + +type SurveyResult = { + name: string; + role: string; +}; + +// Ref: python examples/survey/survey_agent.py - 248-274 lines. +class IntroTask extends voice.AgentTask { + constructor() { + super({ + instructions: + 'Collect the user name and role. Ask concise follow-up questions if information is missing.', + tools: { + completeIntro: llm.tool({ + description: 'Call this after collecting the user name and role.', + parameters: z.object({ + name: z.string().describe('User name'), + role: z.string().describe('User role'), + }), + execute: async ({ name, role }) => { + this.complete({ name, role }); + return 'Thanks, collected successfully.'; + }, + }), + }, + }); + } + + async onEnter() { + this.session.generateReply(); + } +} + +class SurveyAgent extends voice.Agent { + constructor() { + super({ + instructions: + 'You orchestrate a short intro survey. Speak naturally and keep the interaction brief.', + }); + } + + async onEnter() { + // Ref: python examples/survey/survey_agent.py - 284-327 lines. + const result = await new IntroTask().run(this.session); + await this.session.say( + `Great to meet you ${result.name}. I noted your role as ${result.role}. We can continue now.`, + { addToChatCtx: true }, + ); + } +} + +export default defineAgent({ + prewarm: async (proc: JobProcess) => { + proc.userData.vad = await silero.VAD.load(); + }, + entry: async (ctx: JobContext) => { + const session = new voice.AgentSession({ + vad: ctx.proc.userData.vad as silero.VAD, + stt: new inference.STT({ model: 'deepgram/nova-3' }), + llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }), + tts: new inference.TTS({ + model: 'cartesia/sonic-3', + voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc', + }), + }); + + await session.start({ + room: ctx.room, + agent: new SurveyAgent(), + }); + + await ctx.waitForParticipant(); + }, +}); + +cli.runApp(new WorkerOptions({ agent: fileURLToPath(import.meta.url) })); diff --git a/plugins/google/src/beta/realtime/realtime_api.ts b/plugins/google/src/beta/realtime/realtime_api.ts index 7f0b3d33c..90866729c 100644 --- a/plugins/google/src/beta/realtime/realtime_api.ts +++ b/plugins/google/src/beta/realtime/realtime_api.ts @@ -309,6 +309,7 @@ export class RealtimeModel extends llm.RealtimeModel { userTranscription: inputAudioTranscription !== null, autoToolReplyGeneration: true, audioOutput: options.modalities?.includes(Modality.AUDIO) ?? true, + manualFunctionCalls: false, }); // Environment variable fallbacks diff --git a/plugins/livekit/src/turn_detector/multilingual.ts b/plugins/livekit/src/turn_detector/multilingual.ts index 318e72134..592f1c7c4 100644 --- a/plugins/livekit/src/turn_detector/multilingual.ts +++ b/plugins/livekit/src/turn_detector/multilingual.ts @@ -135,11 +135,12 @@ export class MultilingualModel extends EOUModel { } function remoteInferenceUrl() { - const urlBase = process.env.LIVEKIT_REMOTE_EOT_URL; - if (!urlBase) { - return undefined; - } - return `${urlBase}/eot/multi`; + return undefined; + // const urlBase = process.env.LIVEKIT_REMOTE_EOT_URL; + // if (!urlBase) { + // return undefined; + // } + // return `${urlBase}/eot/multi`; } export default EUORunnerMultilingual; diff --git a/plugins/openai/src/realtime/realtime_model.ts b/plugins/openai/src/realtime/realtime_model.ts index 51b28afed..1aaffd014 100644 --- a/plugins/openai/src/realtime/realtime_model.ts +++ b/plugins/openai/src/realtime/realtime_model.ts @@ -175,6 +175,7 @@ export class RealtimeModel extends llm.RealtimeModel { userTranscription: options.inputAudioTranscription !== null, autoToolReplyGeneration: false, audioOutput: modalities.includes('audio'), + manualFunctionCalls: true, }); const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment); diff --git a/plugins/openai/src/realtime/realtime_model_beta.ts b/plugins/openai/src/realtime/realtime_model_beta.ts index 2db65104f..19aee2aee 100644 --- a/plugins/openai/src/realtime/realtime_model_beta.ts +++ b/plugins/openai/src/realtime/realtime_model_beta.ts @@ -176,6 +176,7 @@ export class RealtimeModel extends llm.RealtimeModel { userTranscription: options.inputAudioTranscription !== null, autoToolReplyGeneration: false, audioOutput: modalities.includes('audio'), + manualFunctionCalls: true, }); const isAzure = !!(options.apiVersion || options.entraToken || options.azureDeployment); From 6458a99e74394904b1a8526ae353428a4c24c3f0 Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Wed, 11 Feb 2026 17:04:56 -0800 Subject: [PATCH 02/21] Update job_proc_lazy_main.ts --- agents/src/ipc/job_proc_lazy_main.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/agents/src/ipc/job_proc_lazy_main.ts b/agents/src/ipc/job_proc_lazy_main.ts index f81eedc29..8dd5cf6f8 100644 --- a/agents/src/ipc/job_proc_lazy_main.ts +++ b/agents/src/ipc/job_proc_lazy_main.ts @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: 2024 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 -import { Room, RoomEvent, dispose } from '@livekit/rtc-node'; +import { Room, RoomEvent } from '@livekit/rtc-node'; import { EventEmitter, once } from 'node:events'; import { pathToFileURL } from 'node:url'; import type { Logger } from 'pino'; @@ -245,8 +245,6 @@ const startJob = ( await join.await; - await dispose(); - logger.debug('Job process shutdown'); process.exit(0); } From 66869f25528a548399b1ea2e5ebcb8ff74e7dd5a Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Wed, 11 Feb 2026 18:21:46 -0800 Subject: [PATCH 03/21] bug fixes --- agents/src/voice/agent_activity.ts | 7 + agents/src/voice/agent_session.ts | 2 +- agents/src/voice/audio_recognition.ts | 4 + agents/src/voice/generation.ts | 26 +- examples/src/agent_task_survey.ts | 3 - examples/src/testing/agent_task.test.ts | 431 ++++++++++++++++++++++++ 6 files changed, 461 insertions(+), 12 deletions(-) create mode 100644 examples/src/testing/agent_task.test.ts diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index 656bb0a57..cff6f4181 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -868,6 +868,13 @@ export class AgentActivity implements RecognitionHooks { const wrappedFn = (ctrl: AbortController) => { return agentActivityStorage.run(this, () => { + // Mark inline/speech metadata at task runtime to avoid a race where taskFn executes + // before post-construction metadata is attached to the Task instance. + const currentTask = Task.current(); + if (currentTask) { + _setActivityTaskInfo(currentTask, { speechHandle: ownedSpeechHandle, inlineTask }); + } + if (ownedSpeechHandle) { return speechHandleStorage.run(ownedSpeechHandle, () => taskFn(ctrl)); } diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index 23282604e..64bfabeff 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -661,7 +661,7 @@ export class AgentSession< await prevActivityObj.close(); } } else { - throw new Error('Invalid activity transition'); + throw new Error('Invalid agent activity transition'); } this.activity = this.nextActivity; diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts index 2660804d8..1a898f083 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -692,6 +692,10 @@ export class AudioRecognition { this.logger.debug('User turn committed'); }) .catch((err: unknown) => { + if (err instanceof Error && err.name === 'AbortError') { + this.logger.debug('User turn commit task cancelled'); + return; + } this.logger.error(err, 'Error in user turn commit task:'); }); } diff --git a/agents/src/voice/generation.ts b/agents/src/voice/generation.ts index 17bdecbc7..1f141ab37 100644 --- a/agents/src/voice/generation.ts +++ b/agents/src/voice/generation.ts @@ -935,14 +935,6 @@ export function performToolExecutions({ 'Executing LLM tool call', ); - const toolExecution = functionCallStorage.run({ functionCall: toolCall }, async () => { - return await tool.execute(parsedArgs, { - ctx: new RunContext(session, speechHandle, toolCall), - toolCallId: toolCall.callId, - abortSignal: signal, - }); - }); - const _tracableToolExecutionImpl = async (toolExecTask: Promise, span: Span) => { span.setAttribute(traceTypes.ATTR_FUNCTION_TOOL_NAME, toolCall.name); span.setAttribute(traceTypes.ATTR_FUNCTION_TOOL_ARGS, toolCall.args); @@ -1001,6 +993,24 @@ export function performToolExecutions({ const toolTask = Task.from( async () => { + // Ensure this task is marked inline before user tool code executes. + const currentTask = Task.current(); + if (currentTask) { + _setActivityTaskInfo(currentTask, { + speechHandle, + functionCall: toolCall, + inlineTask: true, + }); + } + + const toolExecution = functionCallStorage.run({ functionCall: toolCall }, async () => { + return await tool.execute(parsedArgs, { + ctx: new RunContext(session, speechHandle, toolCall), + toolCallId: toolCall.callId, + abortSignal: signal, + }); + }); + await tracableToolExecution(toolExecution); }, controller, diff --git a/examples/src/agent_task_survey.ts b/examples/src/agent_task_survey.ts index db460052e..b00caa3e7 100644 --- a/examples/src/agent_task_survey.ts +++ b/examples/src/agent_task_survey.ts @@ -20,7 +20,6 @@ type SurveyResult = { role: string; }; -// Ref: python examples/survey/survey_agent.py - 248-274 lines. class IntroTask extends voice.AgentTask { constructor() { super({ @@ -84,8 +83,6 @@ export default defineAgent({ room: ctx.room, agent: new SurveyAgent(), }); - - await ctx.waitForParticipant(); }, }); diff --git a/examples/src/testing/agent_task.test.ts b/examples/src/testing/agent_task.test.ts new file mode 100644 index 000000000..f8e6820de --- /dev/null +++ b/examples/src/testing/agent_task.test.ts @@ -0,0 +1,431 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { Future, initializeLogger, llm, voice } from '@livekit/agents'; +import * as openai from '@livekit/agents-plugin-openai'; +import { afterEach, describe, expect, it } from 'vitest'; +import { z } from 'zod'; + +initializeLogger({ pretty: true, level: 'warn' }); + +function asError(error: unknown): Error { + return error instanceof Error ? error : new Error(String(error)); +} + +describe('AgentTask examples', { timeout: 120_000 }, () => { + const sessions: voice.AgentSession[] = []; + + afterEach(async () => { + await Promise.allSettled(sessions.map((s) => s.close())); + sessions.length = 0; + }); + + async function startSession(agent: voice.Agent, options?: { llm?: openai.LLM }) { + const session = new voice.AgentSession({ llm: options?.llm }); + sessions.push(session); + await session.start({ agent }); + return session; + } + + it('agent calls a task in onEnter', async () => { + const done = new Future(); + + // Ref: python livekit-agents/livekit/agents/voice/agent.py - 739-841 lines. + class WelcomeTask extends voice.AgentTask { + constructor() { + super({ instructions: 'Collect a welcome token and finish quickly.' }); + } + + async onEnter() { + this.complete('welcome-token'); + } + } + + class ParentAgent extends voice.Agent { + constructor() { + super({ instructions: 'Parent agent used for AgentTask lifecycle tests.' }); + } + + async onEnter() { + try { + const result = await new WelcomeTask().run(); + done.resolve(result); + } catch (error) { + done.reject(asError(error)); + } + } + } + + await startSession(new ParentAgent()); + await expect(done.await).resolves.toBe('welcome-token'); + }); + + it('agent calls two tasks in onEnter', async () => { + const done = new Future<{ first: number; second: number; order: string[] }>(); + + class FirstTask extends voice.AgentTask { + constructor() { + super({ instructions: 'Return first value.' }); + } + + async onEnter() { + this.complete(1); + } + } + + class SecondTask extends voice.AgentTask { + constructor() { + super({ instructions: 'Return second value.' }); + } + + async onEnter() { + this.complete(2); + } + } + + class ParentAgent extends voice.Agent { + constructor() { + super({ instructions: 'Parent agent for sequential task orchestration.' }); + } + + async onEnter() { + try { + const order: string[] = []; + const first = await new FirstTask().run(); + order.push('first'); + const second = await new SecondTask().run(); + order.push('second'); + done.resolve({ first, second, order }); + } catch (error) { + done.reject(asError(error)); + } + } + } + + await startSession(new ParentAgent()); + await expect(done.await).resolves.toEqual({ + first: 1, + second: 2, + order: ['first', 'second'], + }); + }); + + const itIfOpenAI = process.env.OPENAI_API_KEY ? it : it.skip; + + itIfOpenAI( + 'agent calls a task in a tool; resuming previous activity does not execute onEnter again', + async () => { + let parentOnEnterCount = 0; + let taskOnEnterCount = 0; + let toolCallCount = 0; + + class GetEmailAddressTask extends voice.AgentTask { + constructor() { + super({ instructions: 'Capture an email address and complete.' }); + } + + async onEnter() { + taskOnEnterCount += 1; + this.complete('alice@example.com'); + } + } + + class ToolAgent extends voice.Agent { + constructor() { + super({ + instructions: + 'When asked to capture email, ALWAYS call captureEmail exactly once, then respond briefly.', + tools: { + captureEmail: llm.tool({ + description: 'Capture an email by running a nested AgentTask.', + parameters: z.object({}), + execute: async () => { + toolCallCount += 1; + try { + const email = await new GetEmailAddressTask().run(); + return `captured:${email}`; + } catch (error) { + throw error; + } + }, + }), + }, + }); + } + + async onEnter() { + parentOnEnterCount += 1; + } + } + + const llmModel = new openai.LLM({ model: 'gpt-4o-mini', temperature: 0 }); + const session = await startSession(new ToolAgent(), { llm: llmModel }); + const result = session.run({ userInput: 'Please capture my email using your tool.' }); + await result.wait(); + + result.expect.containsFunctionCall({ name: 'captureEmail' }); + + expect(toolCallCount).toBe(1); + expect(taskOnEnterCount).toBe(1); + // Critical parity check: resume path must not run parent onEnter again. + expect(parentOnEnterCount).toBe(1); + }, + ); + + itIfOpenAI('LLM-powered IntroTask (python survey parity) records intro details', async () => { + let introTaskResult: { name: string; intro: string } | undefined; + let runIntroTaskCalls = 0; + let recordIntroToolCalls = 0; + + // Ref: python examples/survey/survey_agent.py - 248-274 lines. + class IntroTask extends voice.AgentTask<{ name: string; intro: string }> { + constructor() { + super({ + instructions: + 'You are Alex, an interviewer. Extract the candidate name and a short intro from the latest user input. ' + + 'Use the tool recordIntro exactly once when both are available.', + tools: { + recordIntro: llm.tool({ + description: 'Record candidate name and intro summary.', + parameters: z.object({ + name: z.string().describe('Candidate name'), + introNotes: z.string().describe('A concise candidate intro summary'), + }), + execute: async ({ name, introNotes }) => { + recordIntroToolCalls += 1; + this.complete({ name, intro: introNotes }); + return 'Intro recorded.'; + }, + }), + }, + }); + } + + async onEnter() { + this.session.generateReply({ + instructions: + 'Ask the user for name and intro if missing, then call recordIntro with concise values.', + }); + } + } + + class ParentAgent extends voice.Agent { + constructor() { + super({ + instructions: + 'When the user asks to run the intro task, ALWAYS call collectIntroWithTask exactly once.', + tools: { + collectIntroWithTask: llm.tool({ + description: 'Launch the IntroTask and return the captured intro details.', + parameters: z.object({}), + execute: async () => { + runIntroTaskCalls += 1; + const result = await new IntroTask().run(); + introTaskResult = result; + return JSON.stringify(result); + }, + }), + }, + }); + } + } + + const llmModel = new openai.LLM({ model: 'gpt-4o-mini', temperature: 0 }); + const session = await startSession(new ParentAgent(), { llm: llmModel }); + const triggerRun = session.run({ userInput: 'Please run the intro task.' }); + await triggerRun.wait(); + triggerRun.expect.containsFunctionCall({ name: 'collectIntroWithTask' }); + + const answerRun = session.run({ + userInput: "I'm Morgan, and I'm a backend engineer focused on APIs.", + }); + await answerRun.wait(); + + expect(runIntroTaskCalls).toBe(1); + expect(recordIntroToolCalls).toBeGreaterThanOrEqual(1); + expect(introTaskResult).toBeDefined(); + expect(introTaskResult!.name.toLowerCase()).toContain('morgan'); + expect(introTaskResult!.intro.toLowerCase()).toMatch(/backend|api/); + }); + + itIfOpenAI( + 'LLM-powered GetEmailTask (python workflow parity) captures email in AgentTask', + async () => { + let capturedEmail = ''; + let runEmailTaskCalls = 0; + let updateEmailToolCalls = 0; + + // Ref: python livekit-agents/livekit/agents/beta/workflows/email_address.py - 27-131 lines. + class GetEmailTask extends voice.AgentTask { + constructor() { + super({ + instructions: + 'You are responsible only for capturing an email address. ' + + 'Extract the email from the latest user message and call updateEmailAddress exactly once.', + tools: { + updateEmailAddress: llm.tool({ + description: 'Store the user email address and complete the task.', + parameters: z.object({ + email: z.string().describe('The user email address'), + }), + execute: async ({ email }) => { + updateEmailToolCalls += 1; + const normalized = email + .trim() + .toLowerCase() + .replace(/[.,!?;:]+$/g, ''); + this.complete(normalized); + return `Email captured: ${normalized}`; + }, + }), + }, + }); + } + + async onEnter() { + this.session.generateReply({ + instructions: + 'Ask for the email briefly if needed, then call updateEmailAddress after receiving it.', + }); + } + } + + class ParentAgent extends voice.Agent { + constructor() { + super({ + instructions: + 'When user asks to capture email via task, ALWAYS call collectEmailWithTask exactly once.', + tools: { + collectEmailWithTask: llm.tool({ + description: 'Run GetEmailTask and return the captured email.', + parameters: z.object({}), + execute: async () => { + runEmailTaskCalls += 1; + const result = await new GetEmailTask().run(); + capturedEmail = result; + return result; + }, + }), + }, + }); + } + } + + const llmModel = new openai.LLM({ model: 'gpt-4o-mini', temperature: 0 }); + const session = await startSession(new ParentAgent(), { llm: llmModel }); + const triggerRun = session.run({ userInput: 'Please capture my email with the task.' }); + await triggerRun.wait(); + triggerRun.expect.containsFunctionCall({ name: 'collectEmailWithTask' }); + + const answerRun = session.run({ userInput: 'My email is jordan.smith@example.com.' }); + await answerRun.wait(); + + expect(runEmailTaskCalls).toBe(1); + expect(updateEmailToolCalls).toBeGreaterThanOrEqual(1); + expect(capturedEmail).toBe('jordan.smith@example.com'); + }, + ); + + it('agent calls a task in onExit', async () => { + const done = new Future(); + let oldAgentOnEnterCount = 0; + + class ExitTask extends voice.AgentTask { + constructor() { + super({ instructions: 'Return on-exit marker.' }); + } + + async onEnter() { + this.complete('exit-task-finished'); + } + } + + class OldAgent extends voice.Agent { + constructor() { + super({ instructions: 'Old agent that runs an AgentTask in onExit.' }); + } + + async onEnter() { + oldAgentOnEnterCount += 1; + } + + async onExit() { + if (done.done) { + return; + } + try { + const result = await new ExitTask().run(); + done.resolve(result); + } catch (error) { + done.reject(asError(error)); + } + } + } + + const oldAgent = new OldAgent(); + const session = await startSession(oldAgent); + const currentActivity = (session as any).activity as { + createSpeechTask: (options: { + taskFn: () => Promise; + inlineTask?: boolean; + name?: string; + }) => { result: Promise }; + }; + // Non-parity note: Python/JS both hold the session activity lock while draining on updateAgent, + // so invoking AgentTask.run() from onExit during that lock path can deadlock. + // This harness triggers onExit via an inline speech task outside updateAgent's lock scope + // to validate AgentTask behavior in onExit itself (the scenario requested by this test). + await currentActivity.createSpeechTask({ + taskFn: async () => oldAgent.onExit(), + inlineTask: true, + name: 'AgentActivity_onExit_testHarness', + }).result; + + await expect(done.await).resolves.toBe('exit-task-finished'); + expect(oldAgentOnEnterCount).toBe(1); + expect((session as any).agent).toBe(oldAgent); + }); + + it('AgentTask instance is non-reentrant (edge case)', async () => { + const done = new Future<{ first: string; secondRunError: string }>(); + + class SingleUseTask extends voice.AgentTask { + constructor() { + super({ instructions: 'Single-use AgentTask edge case.' }); + } + + async onEnter() { + this.complete('ok'); + } + } + + class ParentAgent extends voice.Agent { + constructor() { + super({ instructions: 'Agent validating AgentTask re-entrancy behavior.' }); + } + + async onEnter() { + try { + const task = new SingleUseTask(); + const first = await task.run(); + let secondRunError = ''; + + try { + await task.run(); + } catch (error) { + secondRunError = error instanceof Error ? error.message : String(error); + } + + done.resolve({ first, secondRunError }); + } catch (error) { + done.reject(asError(error)); + } + } + } + + await startSession(new ParentAgent()); + const result = await done.await; + expect(result.first).toBe('ok'); + expect(result.secondRunError).toContain('cannot be awaited multiple times'); + }); +}); From 393a76a9489ea06c483e9fa33aff07d75242bb7c Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Wed, 11 Feb 2026 18:48:23 -0800 Subject: [PATCH 04/21] Update agent_session.ts --- agents/src/voice/agent_session.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index 64bfabeff..942e65187 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -660,8 +660,6 @@ export class AgentSession< await prevActivityObj.drain(); await prevActivityObj.close(); } - } else { - throw new Error('Invalid agent activity transition'); } this.activity = this.nextActivity; From e347bba66cec52472b3601ef8878595a7b61fb8b Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Wed, 11 Feb 2026 19:28:34 -0800 Subject: [PATCH 05/21] fix deferred audio stream --- agents/src/stream/deferred_stream.ts | 23 +++++++++++++++++------ agents/src/voice/agent_activity.ts | 13 ++++++++++--- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/agents/src/stream/deferred_stream.ts b/agents/src/stream/deferred_stream.ts index 71a10c7e8..d1e09b9ce 100644 --- a/agents/src/stream/deferred_stream.ts +++ b/agents/src/stream/deferred_stream.ts @@ -59,16 +59,17 @@ export class DeferredReadableStream { throw new Error('Stream source already set'); } - this.sourceReader = source.getReader(); - this.pump(); + const sourceReader = source.getReader(); + this.sourceReader = sourceReader; + void this.pump(sourceReader); } - private async pump() { + private async pump(sourceReader: ReadableStreamDefaultReader) { let sourceError: unknown; try { while (true) { - const { done, value } = await this.sourceReader!.read(); + const { done, value } = await sourceReader.read(); if (done) break; await this.writer.write(value); } @@ -81,7 +82,7 @@ export class DeferredReadableStream { // any other error from source will be propagated to the consumer if (sourceError) { try { - this.writer.abort(sourceError); + await this.writer.abort(sourceError); } catch (e) { // ignore if writer is already closed } @@ -118,10 +119,20 @@ export class DeferredReadableStream { return; } + const sourceReader = this.sourceReader!; + // Clear source first so future setSource() calls can reattach cleanly. + this.sourceReader = undefined; + // release lock will make any pending read() throw TypeError // which are expected, and we intentionally catch those error // using isStreamReaderReleaseError // this will unblock any pending read() inside the async for loop - this.sourceReader!.releaseLock(); + try { + sourceReader.releaseLock(); + } catch (e) { + if (!isStreamReaderReleaseError(e)) { + throw e; + } + } } } diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index cff6f4181..0343a9d58 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -441,9 +441,12 @@ export class AgentActivity implements RecognitionHooks { } attachAudioInput(audioStream: ReadableStream): void { - if (this.audioStream.isSourceSet) { + const currentDeferredStream = this.audioStream; + if (currentDeferredStream.isSourceSet) { this.logger.debug('detaching existing audio input in agent activity'); - this.audioStream.detachSource(); + void currentDeferredStream.detachSource().catch((error) => { + this.logger.debug({ error }, 'error detaching existing audio input in agent activity'); + }); } /** @@ -452,6 +455,8 @@ export class AgentActivity implements RecognitionHooks { * This is important because teeing the original stream directly makes it very difficult—if not * impossible—to implement stream unlock logic cleanly. */ + // Recreate the deferred stream each attach because tee() locks the underlying readable stream. + this.audioStream = new DeferredReadableStream(); this.audioStream.setSource(audioStream); const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee(); @@ -465,7 +470,9 @@ export class AgentActivity implements RecognitionHooks { } detachAudioInput(): void { - this.audioStream.detachSource(); + void this.audioStream.detachSource().catch((error) => { + this.logger.debug({ error }, 'error detaching audio input in agent activity'); + }); } commitUserTurn( From f92fa9fbf0db8b2a21df21ffc3c2652b080534aa Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Thu, 12 Feb 2026 14:44:01 -0800 Subject: [PATCH 06/21] Create curvy-coins-boil.md --- .changeset/curvy-coins-boil.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .changeset/curvy-coins-boil.md diff --git a/.changeset/curvy-coins-boil.md b/.changeset/curvy-coins-boil.md new file mode 100644 index 000000000..07f78b1b9 --- /dev/null +++ b/.changeset/curvy-coins-boil.md @@ -0,0 +1,9 @@ +--- +"@livekit/agents": patch +"@livekit/agents-plugin-google": patch +"@livekit/agents-plugin-livekit": patch +"@livekit/agents-plugin-openai": patch +"livekit-agents-examples": patch +--- + +Implement AgentTask feature From 8eca1abad56c7a6838a7f35e4ede8c3fcd075b27 Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Thu, 12 Feb 2026 14:56:58 -0800 Subject: [PATCH 07/21] Update multilingual.ts --- plugins/livekit/src/turn_detector/multilingual.ts | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/plugins/livekit/src/turn_detector/multilingual.ts b/plugins/livekit/src/turn_detector/multilingual.ts index 592f1c7c4..318e72134 100644 --- a/plugins/livekit/src/turn_detector/multilingual.ts +++ b/plugins/livekit/src/turn_detector/multilingual.ts @@ -135,12 +135,11 @@ export class MultilingualModel extends EOUModel { } function remoteInferenceUrl() { - return undefined; - // const urlBase = process.env.LIVEKIT_REMOTE_EOT_URL; - // if (!urlBase) { - // return undefined; - // } - // return `${urlBase}/eot/multi`; + const urlBase = process.env.LIVEKIT_REMOTE_EOT_URL; + if (!urlBase) { + return undefined; + } + return `${urlBase}/eot/multi`; } export default EUORunnerMultilingual; From 4a813dbc85707c9c26346b0e2b13241f29328025 Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Thu, 12 Feb 2026 15:52:57 -0800 Subject: [PATCH 08/21] save --- examples/src/{agent_task_survey.ts => basic_agent_task.ts} | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) rename examples/src/{agent_task_survey.ts => basic_agent_task.ts} (95%) diff --git a/examples/src/agent_task_survey.ts b/examples/src/basic_agent_task.ts similarity index 95% rename from examples/src/agent_task_survey.ts rename to examples/src/basic_agent_task.ts index b00caa3e7..09a6b2153 100644 --- a/examples/src/agent_task_survey.ts +++ b/examples/src/basic_agent_task.ts @@ -42,7 +42,9 @@ class IntroTask extends voice.AgentTask { } async onEnter() { - this.session.generateReply(); + this.session.generateReply({ + userInput: 'Greet user and ask the user for their name and role', + }); } } From 26f0d7110ae48e80ed07f1f442b990827b562238 Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Thu, 12 Feb 2026 21:49:16 -0800 Subject: [PATCH 09/21] improve task + test utils --- agents/src/voice/agent.ts | 2 +- agents/src/voice/agent_session.ts | 18 ++++- agents/src/voice/testing/run_result.ts | 7 ++ examples/src/basic_agent_task.ts | 4 +- examples/src/testing/agent_task.test.ts | 87 +++++++++++++++++++++++++ examples/src/testing/run_result.test.ts | 5 +- 6 files changed, 117 insertions(+), 6 deletions(-) diff --git a/agents/src/voice/agent.ts b/agents/src/voice/agent.ts index 630c4e440..0aa1f0733 100644 --- a/agents/src/voice/agent.ts +++ b/agents/src/voice/agent.ts @@ -513,7 +513,7 @@ export class AgentTask extends Agent): Promise { + async run(): Promise { if (this.started) { throw new Error( `Task ${this.constructor.name} has already started and cannot be awaited multiple times`, diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index 942e65187..4d2822e74 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -618,7 +618,23 @@ export class AgentSession< }); this._globalRunState = runState; - this.generateReply({ userInput }); + + // Defer generateReply through the activityLock to ensure any in-progress + // activity transition (e.g. AgentTask started from onEnter) completes first. + // Unlike Python's asyncio.create_task (which defers onEnter to the event loop), + // TS Task.from starts onEnter synchronously, so the transition may already be + // mid-flight by the time run() is called after session.start() resolves. + // Acquiring and immediately releasing the lock guarantees FIFO ordering: + // the transition's lock section finishes before we route generateReply. + (async () => { + try { + const unlock = await this.activityLock.lock(); + unlock(); + this.generateReply({ userInput }); + } catch (e) { + runState._reject(e instanceof Error ? e : new Error(String(e))); + } + })(); return runState; } diff --git a/agents/src/voice/testing/run_result.ts b/agents/src/voice/testing/run_result.ts index 6887c500c..e0c318241 100644 --- a/agents/src/voice/testing/run_result.ts +++ b/agents/src/voice/testing/run_result.ts @@ -208,6 +208,13 @@ export class RunResult { } } + /** @internal – Reject the run with an error (e.g. when deferred generateReply fails). */ + _reject(error: Error): void { + if (!this.doneFut.done) { + this.doneFut.reject(error); + } + } + /** @internal */ _markDoneIfNeeded(handle?: SpeechHandle | Task | null): void { if (isSpeechHandle(handle)) { diff --git a/examples/src/basic_agent_task.ts b/examples/src/basic_agent_task.ts index 09a6b2153..8e1a2038c 100644 --- a/examples/src/basic_agent_task.ts +++ b/examples/src/basic_agent_task.ts @@ -57,8 +57,8 @@ class SurveyAgent extends voice.Agent { } async onEnter() { - // Ref: python examples/survey/survey_agent.py - 284-327 lines. - const result = await new IntroTask().run(this.session); + const task = new IntroTask(); + const result = await task.run(); await this.session.say( `Great to meet you ${result.name}. I noted your role as ${result.role}. We can continue now.`, { addToChatCtx: true }, diff --git a/examples/src/testing/agent_task.test.ts b/examples/src/testing/agent_task.test.ts index f8e6820de..cfb10612f 100644 --- a/examples/src/testing/agent_task.test.ts +++ b/examples/src/testing/agent_task.test.ts @@ -8,6 +8,25 @@ import { z } from 'zod'; initializeLogger({ pretty: true, level: 'warn' }); +/** + * AgentTask scenario coverage: + * + * 1. Agent -> onEnter -> AgentTask -> onEnter -> self.complete + * COVERED: "agent calls a task in onEnter" (WelcomeTask) + * + * 2. Agent -> onEnter -> AgentTask -> onEnter -> generateReply -> User -> Tool -> self.complete + * NOT TESTABLE: session.run() rejects with "speech scheduling draining" when task is started + * from onEnter. Works in production (basic_agent_task.ts) with real voice/STT. + * Tool-triggered variant COVERED: "LLM-powered IntroTask", "LLM-powered GetEmailTask" + * + * 3. Agent -> Tool Call -> AgentTask -> User message -> Tool Call -> self.complete + * COVERED: "LLM-powered IntroTask", "LLM-powered GetEmailTask" + * + * 4. Agent -> Tool handoff -> onExit -> AgentTask -> self.complete -> handoff target + * DEADLOCK: AgentTask.run() from onExit during updateAgent transition holds activity lock. + * onExit + AgentTask COVERED via harness: "agent calls a task in onExit" (createSpeechTask). + */ + function asError(error: unknown): Error { return error instanceof Error ? error : new Error(String(error)); } @@ -112,6 +131,69 @@ describe('AgentTask examples', { timeout: 120_000 }, () => { const itIfOpenAI = process.env.OPENAI_API_KEY ? it : it.skip; + // Scenario 2: Agent onEnter -> AgentTask -> onEnter -> generateReply -> User -> Tool -> self.complete + itIfOpenAI( + 'scenario 2: onEnter AgentTask with generateReply then user input via run()', + async () => { + const done = new Future<{ name: string; role: string }>(); + + class IntroTask extends voice.AgentTask<{ name: string; role: string }> { + constructor() { + super({ + instructions: + 'You are collecting a name and role. Extract both from user input and call recordIntro.', + tools: { + recordIntro: llm.tool({ + description: 'Record the name and role', + parameters: z.object({ + name: z.string().describe('User name'), + role: z.string().describe('User role'), + }), + execute: async ({ name, role }) => { + this.complete({ name, role }); + return 'recorded'; + }, + }), + }, + }); + } + + async onEnter() { + this.session.generateReply({ + instructions: 'Ask the user for their name and role.', + }); + } + } + + class ParentAgent extends voice.Agent { + constructor() { + super({ instructions: 'Parent agent that launches IntroTask on enter.' }); + } + + async onEnter() { + try { + const result = await new IntroTask().run(); + done.resolve(result); + } catch (error) { + done.reject(asError(error)); + } + } + } + + const llmModel = new openai.LLM({ model: 'gpt-4o-mini', temperature: 0 }); + const session = await startSession(new ParentAgent(), { llm: llmModel }); + + const result = session.run({ + userInput: "I'm Sam and I'm a frontend engineer.", + }); + await result.wait(); + + const taskResult = await done.await; + expect(taskResult.name.toLowerCase()).toContain('sam'); + expect(taskResult.role.toLowerCase()).toMatch(/frontend/); + }, + ); + itIfOpenAI( 'agent calls a task in a tool; resuming previous activity does not execute onEnter again', async () => { @@ -326,6 +408,11 @@ describe('AgentTask examples', { timeout: 120_000 }, () => { }, ); + // Scenario: Agent -> Tool handoff -> onExit -> AgentTask -> self.complete -> handoff target + // Known to deadlock: AgentTask.run() from onExit during updateAgent/handoff transition holds + // the activity lock. Use createSpeechTask harness (see "agent calls a task in onExit") to run + // AgentTask in onExit outside the handoff path instead. + it('agent calls a task in onExit', async () => { const done = new Future(); let oldAgentOnEnterCount = 0; diff --git a/examples/src/testing/run_result.test.ts b/examples/src/testing/run_result.test.ts index 66af0e322..583cbaffa 100644 --- a/examples/src/testing/run_result.test.ts +++ b/examples/src/testing/run_result.test.ts @@ -230,8 +230,9 @@ describe('RunResult', { timeout: 120_000 }, () => { const result = session.run({ userInput: "What's the weather in London?" }); await result.wait(); - // Skip function_call and function_call_output - result.expect.skipNext(2); + // Skip all events except the last (assistant message); LLM may emit 1+ function_call pairs + const n = result.events.length; + result.expect.skipNext(n - 1); result.expect.nextEvent().isMessage({ role: 'assistant' }); result.expect.noMoreEvents(); }); From 99491e88406358ebdc5e312e0ea48d3b2721fbdd Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Fri, 13 Feb 2026 17:18:29 -0800 Subject: [PATCH 10/21] refine tests and fix edge cases --- agents/src/voice/agent.ts | 11 +- agents/src/voice/testing/run_result.ts | 8 +- examples/src/testing/agent_task.test.ts | 241 ++++++------------------ 3 files changed, 70 insertions(+), 190 deletions(-) diff --git a/agents/src/voice/agent.ts b/agents/src/voice/agent.ts index 0aa1f0733..06a59e8eb 100644 --- a/agents/src/voice/agent.ts +++ b/agents/src/voice/agent.ts @@ -574,13 +574,16 @@ export class AgentTask extends Agent 1) { + runState._unwatchHandle(speechHandle); + } // it is OK to call _markDoneIfNeeded here, the above _updateActivity will call onEnter - // so handles added inside the onEnter will make sure we're not completing the runState too early. + // and newly added handles keep the run alive. runState._markDoneIfNeeded(); } diff --git a/agents/src/voice/testing/run_result.ts b/agents/src/voice/testing/run_result.ts index e0c318241..cc8cb026a 100644 --- a/agents/src/voice/testing/run_result.ts +++ b/agents/src/voice/testing/run_result.ts @@ -208,6 +208,11 @@ export class RunResult { } } + /** @internal */ + _watchedHandleCount(): number { + return this.handles.size; + } + /** @internal – Reject the run with an error (e.g. when deferred generateReply fails). */ _reject(error: Error): void { if (!this.doneFut.done) { @@ -221,7 +226,8 @@ export class RunResult { this.lastSpeechHandle = handle; } - if ([...this.handles].every((h) => (isSpeechHandle(h) ? h.done() : h.done))) { + const allDone = [...this.handles].every((h) => (isSpeechHandle(h) ? h.done() : h.done)); + if (allDone) { this._markDone(); } } diff --git a/examples/src/testing/agent_task.test.ts b/examples/src/testing/agent_task.test.ts index cfb10612f..03c3ecdb9 100644 --- a/examples/src/testing/agent_task.test.ts +++ b/examples/src/testing/agent_task.test.ts @@ -24,13 +24,31 @@ initializeLogger({ pretty: true, level: 'warn' }); * * 4. Agent -> Tool handoff -> onExit -> AgentTask -> self.complete -> handoff target * DEADLOCK: AgentTask.run() from onExit during updateAgent transition holds activity lock. - * onExit + AgentTask COVERED via harness: "agent calls a task in onExit" (createSpeechTask). + * NOT COVERED in this suite due to known deadlock limitation. */ function asError(error: unknown): Error { return error instanceof Error ? error : new Error(String(error)); } +async function withFutureResolution(done: Future, fn: () => Promise): Promise { + try { + done.resolve(await fn()); + } catch (error) { + done.reject(asError(error)); + } +} + +function createOpenAILLM(): openai.LLM { + return new openai.LLM({ model: 'gpt-4o-mini', temperature: 0 }); +} + +async function runAndWait(session: voice.AgentSession, userInput: string) { + const result = session.run({ userInput }); + await result.wait(); + return result; +} + describe('AgentTask examples', { timeout: 120_000 }, () => { const sessions: voice.AgentSession[] = []; @@ -49,7 +67,6 @@ describe('AgentTask examples', { timeout: 120_000 }, () => { it('agent calls a task in onEnter', async () => { const done = new Future(); - // Ref: python livekit-agents/livekit/agents/voice/agent.py - 739-841 lines. class WelcomeTask extends voice.AgentTask { constructor() { super({ instructions: 'Collect a welcome token and finish quickly.' }); @@ -66,12 +83,7 @@ describe('AgentTask examples', { timeout: 120_000 }, () => { } async onEnter() { - try { - const result = await new WelcomeTask().run(); - done.resolve(result); - } catch (error) { - done.reject(asError(error)); - } + await withFutureResolution(done, async () => new WelcomeTask().run()); } } @@ -108,16 +120,14 @@ describe('AgentTask examples', { timeout: 120_000 }, () => { } async onEnter() { - try { + await withFutureResolution(done, async () => { const order: string[] = []; const first = await new FirstTask().run(); order.push('first'); const second = await new SecondTask().run(); order.push('second'); - done.resolve({ first, second, order }); - } catch (error) { - done.reject(asError(error)); - } + return { first, second, order }; + }); } } @@ -171,26 +181,25 @@ describe('AgentTask examples', { timeout: 120_000 }, () => { } async onEnter() { - try { - const result = await new IntroTask().run(); - done.resolve(result); - } catch (error) { - done.reject(asError(error)); - } + await withFutureResolution(done, async () => new IntroTask().run()); } } - const llmModel = new openai.LLM({ model: 'gpt-4o-mini', temperature: 0 }); + const llmModel = createOpenAILLM(); const session = await startSession(new ParentAgent(), { llm: llmModel }); - const result = session.run({ - userInput: "I'm Sam and I'm a frontend engineer.", - }); - await result.wait(); + let result = await runAndWait(session, "I'm Sam and I'm a frontend engineer."); const taskResult = await done.await; + result.expect.containsFunctionCall({ name: 'recordIntro' }); expect(taskResult.name.toLowerCase()).toContain('sam'); expect(taskResult.role.toLowerCase()).toMatch(/frontend/); + + result = await runAndWait(session, 'What is my name and role?'); + result.expect + .nextEvent() + .isMessage({ role: 'assistant' }) + .judge(llmModel, { intent: 'should answer name as Sam and role as frontend engineer' }); }, ); @@ -240,26 +249,30 @@ describe('AgentTask examples', { timeout: 120_000 }, () => { } } - const llmModel = new openai.LLM({ model: 'gpt-4o-mini', temperature: 0 }); + const llmModel = createOpenAILLM(); const session = await startSession(new ToolAgent(), { llm: llmModel }); - const result = session.run({ userInput: 'Please capture my email using your tool.' }); - await result.wait(); + let result = await runAndWait(session, 'Please capture my email using your tool.'); result.expect.containsFunctionCall({ name: 'captureEmail' }); + result.expect.containsAgentHandoff({ newAgentType: GetEmailAddressTask }); + result.expect.containsFunctionCallOutput({ + isError: false, + }); + result.expect.containsMessage({ role: 'assistant' }).judge(llmModel, { + intent: 'should answer email captured, not necessarily need to state the email address', + }); expect(toolCallCount).toBe(1); expect(taskOnEnterCount).toBe(1); - // Critical parity check: resume path must not run parent onEnter again. expect(parentOnEnterCount).toBe(1); }, ); - itIfOpenAI('LLM-powered IntroTask (python survey parity) records intro details', async () => { + itIfOpenAI('IntroTask records intro details', async () => { let introTaskResult: { name: string; intro: string } | undefined; let runIntroTaskCalls = 0; let recordIntroToolCalls = 0; - // Ref: python examples/survey/survey_agent.py - 248-274 lines. class IntroTask extends voice.AgentTask<{ name: string; intro: string }> { constructor() { super({ @@ -312,16 +325,19 @@ describe('AgentTask examples', { timeout: 120_000 }, () => { } } - const llmModel = new openai.LLM({ model: 'gpt-4o-mini', temperature: 0 }); + const llmModel = createOpenAILLM(); const session = await startSession(new ParentAgent(), { llm: llmModel }); - const triggerRun = session.run({ userInput: 'Please run the intro task.' }); - await triggerRun.wait(); + const triggerRun = await runAndWait(session, 'Please run the intro task.'); triggerRun.expect.containsFunctionCall({ name: 'collectIntroWithTask' }); - - const answerRun = session.run({ - userInput: "I'm Morgan, and I'm a backend engineer focused on APIs.", + triggerRun.expect.containsMessage({ role: 'assistant' }).judge(llmModel, { + intent: 'Ask the user for name and intro', }); - await answerRun.wait(); + + const answerRun = await runAndWait( + session, + "I'm Morgan, and I'm a backend engineer focused on APIs.", + ); + answerRun.expect.containsAgentHandoff({ newAgentType: ParentAgent }); expect(runIntroTaskCalls).toBe(1); expect(recordIntroToolCalls).toBeGreaterThanOrEqual(1); @@ -330,149 +346,6 @@ describe('AgentTask examples', { timeout: 120_000 }, () => { expect(introTaskResult!.intro.toLowerCase()).toMatch(/backend|api/); }); - itIfOpenAI( - 'LLM-powered GetEmailTask (python workflow parity) captures email in AgentTask', - async () => { - let capturedEmail = ''; - let runEmailTaskCalls = 0; - let updateEmailToolCalls = 0; - - // Ref: python livekit-agents/livekit/agents/beta/workflows/email_address.py - 27-131 lines. - class GetEmailTask extends voice.AgentTask { - constructor() { - super({ - instructions: - 'You are responsible only for capturing an email address. ' + - 'Extract the email from the latest user message and call updateEmailAddress exactly once.', - tools: { - updateEmailAddress: llm.tool({ - description: 'Store the user email address and complete the task.', - parameters: z.object({ - email: z.string().describe('The user email address'), - }), - execute: async ({ email }) => { - updateEmailToolCalls += 1; - const normalized = email - .trim() - .toLowerCase() - .replace(/[.,!?;:]+$/g, ''); - this.complete(normalized); - return `Email captured: ${normalized}`; - }, - }), - }, - }); - } - - async onEnter() { - this.session.generateReply({ - instructions: - 'Ask for the email briefly if needed, then call updateEmailAddress after receiving it.', - }); - } - } - - class ParentAgent extends voice.Agent { - constructor() { - super({ - instructions: - 'When user asks to capture email via task, ALWAYS call collectEmailWithTask exactly once.', - tools: { - collectEmailWithTask: llm.tool({ - description: 'Run GetEmailTask and return the captured email.', - parameters: z.object({}), - execute: async () => { - runEmailTaskCalls += 1; - const result = await new GetEmailTask().run(); - capturedEmail = result; - return result; - }, - }), - }, - }); - } - } - - const llmModel = new openai.LLM({ model: 'gpt-4o-mini', temperature: 0 }); - const session = await startSession(new ParentAgent(), { llm: llmModel }); - const triggerRun = session.run({ userInput: 'Please capture my email with the task.' }); - await triggerRun.wait(); - triggerRun.expect.containsFunctionCall({ name: 'collectEmailWithTask' }); - - const answerRun = session.run({ userInput: 'My email is jordan.smith@example.com.' }); - await answerRun.wait(); - - expect(runEmailTaskCalls).toBe(1); - expect(updateEmailToolCalls).toBeGreaterThanOrEqual(1); - expect(capturedEmail).toBe('jordan.smith@example.com'); - }, - ); - - // Scenario: Agent -> Tool handoff -> onExit -> AgentTask -> self.complete -> handoff target - // Known to deadlock: AgentTask.run() from onExit during updateAgent/handoff transition holds - // the activity lock. Use createSpeechTask harness (see "agent calls a task in onExit") to run - // AgentTask in onExit outside the handoff path instead. - - it('agent calls a task in onExit', async () => { - const done = new Future(); - let oldAgentOnEnterCount = 0; - - class ExitTask extends voice.AgentTask { - constructor() { - super({ instructions: 'Return on-exit marker.' }); - } - - async onEnter() { - this.complete('exit-task-finished'); - } - } - - class OldAgent extends voice.Agent { - constructor() { - super({ instructions: 'Old agent that runs an AgentTask in onExit.' }); - } - - async onEnter() { - oldAgentOnEnterCount += 1; - } - - async onExit() { - if (done.done) { - return; - } - try { - const result = await new ExitTask().run(); - done.resolve(result); - } catch (error) { - done.reject(asError(error)); - } - } - } - - const oldAgent = new OldAgent(); - const session = await startSession(oldAgent); - const currentActivity = (session as any).activity as { - createSpeechTask: (options: { - taskFn: () => Promise; - inlineTask?: boolean; - name?: string; - }) => { result: Promise }; - }; - // Non-parity note: Python/JS both hold the session activity lock while draining on updateAgent, - // so invoking AgentTask.run() from onExit during that lock path can deadlock. - // This harness triggers onExit via an inline speech task outside updateAgent's lock scope - // to validate AgentTask behavior in onExit itself (the scenario requested by this test). - await currentActivity.createSpeechTask({ - taskFn: async () => oldAgent.onExit(), - inlineTask: true, - name: 'AgentActivity_onExit_testHarness', - }).result; - - await expect(done.await).resolves.toBe('exit-task-finished'); - expect(oldAgentOnEnterCount).toBe(1); - expect((session as any).agent).toBe(oldAgent); - }); - it('AgentTask instance is non-reentrant (edge case)', async () => { const done = new Future<{ first: string; secondRunError: string }>(); @@ -492,7 +365,7 @@ describe('AgentTask examples', { timeout: 120_000 }, () => { } async onEnter() { - try { + await withFutureResolution(done, async () => { const task = new SingleUseTask(); const first = await task.run(); let secondRunError = ''; @@ -503,10 +376,8 @@ describe('AgentTask examples', { timeout: 120_000 }, () => { secondRunError = error instanceof Error ? error.message : String(error); } - done.resolve({ first, secondRunError }); - } catch (error) { - done.reject(asError(error)); - } + return { first, secondRunError }; + }); } } From 29d2f0f2a40e0d120ac711a2fde46ca743d2d726 Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Fri, 13 Feb 2026 17:24:34 -0800 Subject: [PATCH 11/21] Update speech_handle.ts --- agents/src/voice/speech_handle.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/agents/src/voice/speech_handle.ts b/agents/src/voice/speech_handle.ts index b5e241f40..a3cde5aa6 100644 --- a/agents/src/voice/speech_handle.ts +++ b/agents/src/voice/speech_handle.ts @@ -170,6 +170,10 @@ export class SpeechHandle { } addDoneCallback(callback: (sh: SpeechHandle) => void) { + if (this.done()) { + queueMicrotask(() => callback(this)); + return; + } this.doneCallbacks.add(callback); } From 828a9b8b83e24d18aa7239662ce0fd009f0cc11a Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Fri, 13 Feb 2026 17:26:00 -0800 Subject: [PATCH 12/21] cleanup & lint --- agents/src/voice/agent_activity.ts | 2 -- agents/src/voice/agent_session.ts | 3 +-- examples/src/testing/agent_task.test.ts | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index 0343a9d58..117ab05b6 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -123,7 +123,6 @@ export class AgentActivity implements RecognitionHooks { _onExitTask?: Task; _userTurnCompletedTask?: Task; - // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 703-739 lines. private readonly onRealtimeGenerationCreated = (ev: GenerationCreatedEvent) => this.onGenerationCreated(ev); private readonly onRealtimeInputSpeechStarted = (ev: InputSpeechStartedEvent) => @@ -2472,7 +2471,6 @@ export class AgentActivity implements RecognitionHooks { try { if (this._schedulingPaused) return; - // Ref: python agent_activity.py 629-632 this._onExitTask = this.createSpeechTask({ taskFn: () => tracer.startActiveSpan(async () => this.agent.onExit(), { diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index 4d2822e74..10d638368 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -8,7 +8,7 @@ import type { Context, Span } from '@opentelemetry/api'; import { ROOT_CONTEXT, context as otelContext, trace } from '@opentelemetry/api'; import { EventEmitter } from 'node:events'; import type { ReadableStream } from 'node:stream/web'; -import { z } from 'zod'; +import type { z } from 'zod'; import { LLM as InferenceLLM, STT as InferenceSTT, @@ -501,7 +501,6 @@ export class AgentSession< throw new Error('AgentSession is not running'); } - // Ref: python agent_session.py 907-927 const doSay = (activity: AgentActivity, nextActivity?: AgentActivity) => { if (activity.schedulingPaused) { if (!nextActivity) { diff --git a/examples/src/testing/agent_task.test.ts b/examples/src/testing/agent_task.test.ts index 03c3ecdb9..636a8e21d 100644 --- a/examples/src/testing/agent_task.test.ts +++ b/examples/src/testing/agent_task.test.ts @@ -251,7 +251,7 @@ describe('AgentTask examples', { timeout: 120_000 }, () => { const llmModel = createOpenAILLM(); const session = await startSession(new ToolAgent(), { llm: llmModel }); - let result = await runAndWait(session, 'Please capture my email using your tool.'); + const result = await runAndWait(session, 'Please capture my email using your tool.'); result.expect.containsFunctionCall({ name: 'captureEmail' }); result.expect.containsAgentHandoff({ newAgentType: GetEmailAddressTask }); From 6dcf791735019239ff6a3c96143fd4672981b6ef Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Fri, 13 Feb 2026 17:27:45 -0800 Subject: [PATCH 13/21] cleanup --- agents/src/voice/README.md | 224 ------------------------------ agents/src/voice/agent_session.ts | 1 - 2 files changed, 225 deletions(-) delete mode 100644 agents/src/voice/README.md diff --git a/agents/src/voice/README.md b/agents/src/voice/README.md deleted file mode 100644 index 15e1b2e6b..000000000 --- a/agents/src/voice/README.md +++ /dev/null @@ -1,224 +0,0 @@ -# AgentTask Runtime Flow (Python Reference) - -This document explains how Python `AgentTask` works at runtime so you can read related code and trace behavior confidently. - -Primary reference files: - -- `livekit-agents/livekit/agents/voice/agent.py` -- `livekit-agents/livekit/agents/voice/agent_session.py` -- `livekit-agents/livekit/agents/voice/agent_activity.py` -- `livekit-agents/livekit/agents/voice/generation.py` -- `livekit-agents/livekit/agents/llm/chat_context.py` -- `livekit-agents/livekit/agents/beta/workflows/task_group.py` - ---- - -## 1) Mental model - -`AgentTask[T]` is a temporary, awaitable sub-agent that: - -1. pauses the currently active agent activity, -2. runs its own activity (`on_enter`, LLM/tools, speech), -3. resolves a typed result via `complete(...)`, -4. resumes the caller activity, -5. merges useful chat history back to the caller. - -Treat it as an inline conversational coroutine that borrows the session and returns. - ---- - -## 2) Two activity transition modes (must distinguish) - -### Mode A: close/start (normal handoff) - -Used by `session.update_agent(new_agent)` and handoff returns. - -- old activity: `drain()` + `aclose()` -- new activity: `start()` -- no implicit return to the old agent - -### Mode B: pause/resume (inline AgentTask) - -Used by `await some_task`. - -- old activity: `pause()` (kept alive) -- task activity: `start()` -- task finishes (`complete(...)`) -- task activity: `drain()` + `aclose()` -- old activity: `resume()` - -`AgentTask` relies on Mode B. - ---- - -## 3) End-to-end sequence - -```mermaid -sequenceDiagram - participant callerAgent as CallerAgent - participant agentTask as AgentTask - participant session as AgentSession - participant callerActivity as CallerActivity - participant taskActivity as TaskActivity - - callerAgent->>agentTask: await task - agentTask->>session: _update_activity(task, previous_activity="pause") - session->>callerActivity: pause() - session->>taskActivity: start() - taskActivity->>agentTask: on_enter() - Note over taskActivity: LLM/tools/speech execution - agentTask->>agentTask: complete(result_or_exception) - agentTask-->>callerAgent: await returns or raises - agentTask->>session: _update_activity(old_agent, new_activity="resume") - session->>taskActivity: drain()+aclose() - session->>callerActivity: resume() -``` - ---- - -## 4) What `AgentTask` adds over `Agent` - -In `agent.py`, `AgentTask` extends `Agent` and introduces: - -- internal future (`__fut`) to represent task completion, -- non-reentrancy guard (`__started`), -- `done()` state, -- `complete(value_or_exception)`, -- `__await__`/`__await_impl`. - -Without `complete(...)`, `await task` never resolves. - ---- - -## 5) `__await_impl` control flow (core) - -When caller does `result = await task`: - -1. validate usage context and reentrancy, -2. capture old activity/agent, -3. switch to task activity with `previous_activity="pause"`, -4. await task future, -5. in `finally`, if session is still on this task: - - merge task chat context into old agent chat context - - resume old activity with `new_activity="resume"` - -This `finally` resume logic is the stack-like return behavior. - ---- - -## 6) `complete(...)` semantics - -- `complete(value)` -> awaiter receives `value` -- `complete(exception)` -> awaiter raises - -It also updates current speech-handle final-output plumbing when present. - ---- - -## 7) `AgentSession._update_activity` behavior matrix - -Key params: - -- `previous_activity`: `"close"` | `"pause"` -- `new_activity`: `"start"` | `"resume"` - -Meaning: - -- `close + start`: full handoff to new activity -- `pause + start`: enter inline task -- `close + resume`: return to previously paused activity - -For resume path, an existing `agent._activity` is required. - ---- - -## 8) `AgentActivity.pause()` vs `resume()` - -`pause()`: - -- pauses scheduling/draining logic, -- closes runtime session resources/listeners, -- preserves activity object for later resume. - -`resume()`: - -- re-establishes runtime session resources/listeners, -- restarts scheduling, -- does **not** run `on_enter()` again. - -This is why caller state can continue seamlessly. - ---- - -## 9) Hook execution model - -`on_enter` and `on_exit` run as speech tasks in activity runtime. -They are inline-task-compatible, so nested `await AgentTask(...)` is valid. - ---- - -## 10) Tools, instructions, models during task - -While task is active: - -- instructions are applied like any `Agent`, -- tools are resolved from session + task (+ mcp tools), -- model resolution is task-first, session-fallback. - -So tasks can temporarily override LLM/STT/TTS/VAD/tool behavior. - ---- - -## 11) Chat context merge on return - -On task completion, caller chat context merges task context with rules: - -- dedupe by `id`, -- insert by chronological `created_at`, -- exclude function-call internals, -- exclude instructions (`system`/`developer`) for resume path. - -This preserves useful conversation outcomes without tool noise. - ---- - -## 12) Difference from tool handoff returns - -Tool return handoff (`Agent` return) and `await AgentTask` both switch agents, but: - -- handoff return -> close/start semantics (role transfer), -- `await AgentTask` -> pause/resume semantics (inline subroutine + return). - -Do not conflate these paths while debugging. - ---- - -## 13) Canonical Python usage patterns - -- Survey workflow (`examples/survey/survey_agent.py`): staged typed tasks with `TaskGroup`. -- IVR workflow (`examples/bank-ivr/ivr_system_agent.py`): direct inline `await task` in menu loops. -- `TaskGroup` itself (`beta/workflows/task_group.py`) is implemented on top of `AgentTask`. - ---- - -## 14) Common pitfalls - -- calling `complete(...)` twice -> error, -- awaiting same task instance twice -> error, -- missing `complete(...)` path -> hang, -- concurrent external `update_agent(...)` during task may bypass normal resume path. - ---- - -## 15) Practical tracing checklist - -When reading AgentTask code, confirm: - -1. where task is created, -2. where `await task` happens, -3. where `complete(...)` is guaranteed, -4. whether transition mode is pause/resume vs close/start, -5. how merge filters are configured, -6. whether concurrent handoff can race with task return. - -If all six are clear, your runtime mental simulation is correct. diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index 10d638368..f8a48cc4c 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -620,7 +620,6 @@ export class AgentSession< // Defer generateReply through the activityLock to ensure any in-progress // activity transition (e.g. AgentTask started from onEnter) completes first. - // Unlike Python's asyncio.create_task (which defers onEnter to the event loop), // TS Task.from starts onEnter synchronously, so the transition may already be // mid-flight by the time run() is called after session.start() resolves. // Acquiring and immediately releasing the lock guarantees FIFO ordering: From 6cf9d6ae2a585a487ee98bc1e533885331bd0d04 Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Fri, 13 Feb 2026 17:28:41 -0800 Subject: [PATCH 14/21] Update run_result.ts --- agents/src/voice/testing/run_result.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/agents/src/voice/testing/run_result.ts b/agents/src/voice/testing/run_result.ts index cc8cb026a..4ee0ccc56 100644 --- a/agents/src/voice/testing/run_result.ts +++ b/agents/src/voice/testing/run_result.ts @@ -249,8 +249,6 @@ export class RunResult { } if (this.outputType) { - // Python does `isinstance(final_output, output_type)`. - // JS uses zod schema validation and takes the parsed value as the typed final output. const result = this.outputType.safeParse(finalOutput); if (!result.success) { this.doneFut.reject( From 1f1033f9df7876e74d42bfca88477706b017d8d9 Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Fri, 13 Feb 2026 17:30:59 -0800 Subject: [PATCH 15/21] more cleanup --- agents/src/voice/agent_activity.test.ts | 144 ----------------- agents/src/voice/agent_session.test.ts | 171 -------------------- agents/src/voice/testing/run_result.test.ts | 76 --------- 3 files changed, 391 deletions(-) delete mode 100644 agents/src/voice/agent_activity.test.ts delete mode 100644 agents/src/voice/agent_session.test.ts delete mode 100644 agents/src/voice/testing/run_result.test.ts diff --git a/agents/src/voice/agent_activity.test.ts b/agents/src/voice/agent_activity.test.ts deleted file mode 100644 index 26357eea1..000000000 --- a/agents/src/voice/agent_activity.test.ts +++ /dev/null @@ -1,144 +0,0 @@ -// SPDX-FileCopyrightText: 2026 LiveKit, Inc. -// -// SPDX-License-Identifier: Apache-2.0 -import { describe, expect, it, vi } from 'vitest'; -import { FunctionCall, FunctionCallOutput } from '../llm/chat_context.js'; -import { initializeLogger } from '../log.js'; -import { Future, Task } from '../utils.js'; -import { Agent, _setActivityTaskInfo } from './agent.js'; -import { AgentActivity } from './agent_activity.js'; -import { ToolExecutionOutput } from './generation.js'; -import { SpeechHandle } from './speech_handle.js'; - -initializeLogger({ pretty: false, level: 'error' }); - -function createActivityForTests(): AgentActivity { - const agent = new Agent({ instructions: 'test agent' }); - const sessionMock = { - options: { - allowInterruptions: true, - discardAudioIfUninterruptible: true, - minInterruptionDuration: 500, - minInterruptionWords: 0, - minEndpointingDelay: 500, - maxEndpointingDelay: 6000, - maxToolSteps: 3, - preemptiveGeneration: false, - userAwayTimeout: 15, - useTtsAlignedTranscript: true, - }, - turnDetection: undefined, - vad: undefined, - stt: undefined, - llm: undefined, - tts: undefined, - output: { - audio: null, - audioEnabled: false, - }, - rootSpanContext: undefined, - useTtsAlignedTranscript: true, - agentState: 'listening', - emit: vi.fn(), - _updateAgentState: vi.fn(), - _conversationItemAdded: vi.fn(), - _toolItemsAdded: vi.fn(), - updateAgent: vi.fn(), - }; - - return new AgentActivity(agent, sessionMock as any); -} - -describe('AgentActivity parity behaviors', () => { - it('summarizes tool outputs with symmetric function call metadata', () => { - const activity = createActivityForTests(); - const speechHandle = SpeechHandle.create(); - - const toolCall = FunctionCall.create({ - callId: 'call_1', - name: 'lookup', - args: JSON.stringify({ city: 'SF' }), - }); - const toolCallOutput = FunctionCallOutput.create({ - callId: 'call_1', - name: 'lookup', - output: 'sunny', - isError: false, - }); - - const toolOutput = { - output: [ - ToolExecutionOutput.create({ - toolCall, - toolCallOutput, - rawOutput: 'sunny', - replyRequired: true, - }), - ], - firstToolStartedFuture: new Future(), - }; - - const summary = (activity as any).summarizeToolExecutionOutput(toolOutput, speechHandle); - expect(summary.functionToolsExecutedEvent.functionCalls).toHaveLength(1); - expect(summary.functionToolsExecutedEvent.functionCallOutputs).toHaveLength(1); - expect(summary.shouldGenerateToolReply).toBe(true); - expect(summary.newAgentTask).toBeNull(); - expect(summary.ignoreTaskSwitch).toBe(false); - }); - - it('blocks scheduleSpeech while scheduling is paused unless force=true', () => { - const activity = createActivityForTests(); - const handle = SpeechHandle.create(); - - (activity as any)._schedulingPaused = true; - - expect(() => - (activity as any).scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL), - ).toThrow('cannot schedule new speech, the speech scheduling is draining/pausing'); - - expect(() => - (activity as any).scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL, true), - ).not.toThrow(); - }); - - it('filters drain pending tasks by blocked speech handles', async () => { - const activity = createActivityForTests(); - const gate = new Future(); - - const blockedSpeechHandle = SpeechHandle.create(); - const siblingSpeechHandle = blockedSpeechHandle; - - const blockedTask = Task.from(async () => { - await gate.await; - }); - const siblingTask = Task.from(async () => { - await gate.await; - }); - - _setActivityTaskInfo(blockedTask, { speechHandle: blockedSpeechHandle }); - _setActivityTaskInfo(siblingTask, { speechHandle: siblingSpeechHandle }); - - (activity as any).speechTasks = new Set([blockedTask, siblingTask]); - (activity as any)._drainBlockedTasks = [blockedTask]; - (activity as any)._schedulingPaused = true; - - const toWait = (activity as any).getDrainPendingSpeechTasks() as Task[]; - expect(toWait).toEqual([]); - - gate.resolve(); - await Promise.allSettled([blockedTask.result, siblingTask.result]); - }); - - it('interrupt cancels preemptive generation first', () => { - const activity = createActivityForTests(); - const preemptiveSpeech = SpeechHandle.create(); - - (activity as any)._preemptiveGeneration = { speechHandle: preemptiveSpeech } as any; - - const fut = activity.interrupt(); - - expect(preemptiveSpeech.interrupted).toBe(true); - expect((activity as any)._preemptiveGeneration).toBeUndefined(); - expect(fut.done).toBe(true); - }); -}); diff --git a/agents/src/voice/agent_session.test.ts b/agents/src/voice/agent_session.test.ts deleted file mode 100644 index aa3c664a8..000000000 --- a/agents/src/voice/agent_session.test.ts +++ /dev/null @@ -1,171 +0,0 @@ -// SPDX-FileCopyrightText: 2026 LiveKit, Inc. -// -// SPDX-License-Identifier: Apache-2.0 -import { describe, expect, it, vi } from 'vitest'; -import { initializeLogger } from '../log.js'; -import { Future, Task } from '../utils.js'; -import { Agent } from './agent.js'; -import { AgentActivity } from './agent_activity.js'; -import { AgentSession } from './agent_session.js'; -import { CloseReason } from './events.js'; -import { SpeechHandle } from './speech_handle.js'; - -initializeLogger({ pretty: false, level: 'error' }); - -describe('AgentSession', () => { - it('serializes updateAgent transitions and watches run-state tasks', async () => { - const session = new AgentSession({}); - const agent1 = new Agent({ instructions: 'agent one' }); - const agent2 = new Agent({ instructions: 'agent two' }); - - (session as any).started = true; - const order: string[] = []; - - let firstCall = true; - (session as any)._updateActivity = vi.fn(async (agent: Agent) => { - order.push(`start:${agent.id}`); - if (firstCall) { - firstCall = false; - await new Promise((resolve) => setTimeout(resolve, 20)); - } - order.push(`end:${agent.id}`); - }); - - const watchHandle = vi.fn(); - (session as any)._globalRunState = { _watchHandle: watchHandle }; - - session.updateAgent(agent1); - session.updateAgent(agent2); - - await ((session as any).updateActivityTask as { result: Promise }).result; - - expect(order).toEqual([ - `start:${agent1.id}`, - `end:${agent1.id}`, - `start:${agent2.id}`, - `end:${agent2.id}`, - ]); - expect(watchHandle).toHaveBeenCalledTimes(2); - }); - - it('routes say() to nextActivity when current activity is paused', () => { - const session = new AgentSession({}); - const handle = SpeechHandle.create(); - - const pausedActivity = { - schedulingPaused: true, - say: vi.fn(() => { - throw new Error('should not call paused activity say()'); - }), - }; - const nextActivity = { - say: vi.fn(() => handle), - }; - - const watchHandle = vi.fn(); - - (session as any).activity = pausedActivity; - (session as any).nextActivity = nextActivity; - (session as any)._globalRunState = { _watchHandle: watchHandle }; - - const result = session.say('hello'); - - expect(result).toBe(handle); - expect(nextActivity.say).toHaveBeenCalledTimes(1); - expect(pausedActivity.say).not.toHaveBeenCalled(); - expect(watchHandle).toHaveBeenCalledWith(handle); - }); - - it('forces interrupt and commits user turn during non-error close', async () => { - const session = new AgentSession({}); - (session as any).started = true; - - const interruptFuture = new Future(); - interruptFuture.resolve(); - - const activity = { - interrupt: vi.fn(() => interruptFuture), - drain: vi.fn(async () => {}), - currentSpeech: { waitForPlayout: vi.fn(async () => {}) }, - commitUserTurn: vi.fn(), - detachAudioInput: vi.fn(), - close: vi.fn(async () => {}), - }; - - (session as any).activity = activity; - await (session as any).closeImplInner(CloseReason.USER_INITIATED, null, false); - - expect(activity.interrupt).toHaveBeenCalledWith({ force: true }); - expect(activity.commitUserTurn).toHaveBeenCalledWith({ - audioDetached: true, - throwIfNotReady: false, - }); - expect(activity.drain).toHaveBeenCalledTimes(1); - expect(activity.close).toHaveBeenCalledTimes(1); - }); - - it('does not commit user turn during error close', async () => { - const session = new AgentSession({}); - (session as any).started = true; - - const interruptFuture = new Future(); - interruptFuture.resolve(); - - const activity = { - interrupt: vi.fn(() => interruptFuture), - drain: vi.fn(async () => {}), - currentSpeech: { waitForPlayout: vi.fn(async () => {}) }, - commitUserTurn: vi.fn(), - detachAudioInput: vi.fn(), - close: vi.fn(async () => {}), - }; - - (session as any).activity = activity; - await (session as any).closeImplInner(CloseReason.ERROR, null, false); - - expect(activity.commitUserTurn).not.toHaveBeenCalled(); - }); - - it('forwards force option through session interrupt()', () => { - const session = new AgentSession({}); - const interruptFuture = new Future(); - const activity = { - interrupt: vi.fn(() => interruptFuture), - }; - - (session as any).activity = activity; - const returned = session.interrupt({ force: true }); - - expect(returned).toBe(interruptFuture); - expect(activity.interrupt).toHaveBeenCalledWith({ force: true }); - }); - - it('honors waitOnEnter by awaiting onEnter task completion', async () => { - const session = new AgentSession({}); - const agent = new Agent({ instructions: 'wait on enter agent' }); - const previousAgent = new Agent({ instructions: 'previous agent' }); - - (session as any).activity = { - agent: previousAgent, - drain: vi.fn(async () => {}), - close: vi.fn(async () => {}), - }; - - const startSpy = vi.spyOn(AgentActivity.prototype, 'start').mockImplementation(async function ( - this: AgentActivity, - ) { - this._onEnterTask = Task.from(async () => { - await new Promise((resolve) => setTimeout(resolve, 20)); - }); - }); - - const startedAt = Date.now(); - await (session as any)._updateActivity(agent, { waitOnEnter: true }); - const elapsed = Date.now() - startedAt; - - expect(startSpy).toHaveBeenCalledTimes(1); - expect(elapsed).toBeGreaterThanOrEqual(15); - - startSpy.mockRestore(); - }); -}); diff --git a/agents/src/voice/testing/run_result.test.ts b/agents/src/voice/testing/run_result.test.ts deleted file mode 100644 index ffb40c315..000000000 --- a/agents/src/voice/testing/run_result.test.ts +++ /dev/null @@ -1,76 +0,0 @@ -// SPDX-FileCopyrightText: 2026 LiveKit, Inc. -// -// SPDX-License-Identifier: Apache-2.0 -import { describe, expect, it } from 'vitest'; -import { z } from 'zod'; -import { ChatMessage } from '../../llm/chat_context.js'; -import { initializeLogger } from '../../log.js'; -import { SpeechHandle } from '../speech_handle.js'; -import { RunResult } from './run_result.js'; - -initializeLogger({ pretty: false, level: 'error' }); - -describe('RunResult', () => { - it('removes speech item callback when unwatching a handle', () => { - const result = new RunResult(); - const handle = SpeechHandle.create(); - - result._watchHandle(handle); - result._unwatchHandle(handle); - - const message = ChatMessage.create({ - role: 'assistant', - content: 'hello', - }); - handle._itemAdded([message]); - - expect(result.events).toHaveLength(0); - - // Done callback is removed too, so run should not complete automatically. - handle._markDone(); - expect(result.done()).toBe(false); - - // Mirrors AgentTask.run() calling _markDoneIfNeeded() after unwatch. - result._markDoneIfNeeded(); - expect(result.done()).toBe(true); - }); - - it('exposes finalOutput when output type matches', async () => { - const result = new RunResult({ outputType: z.string() }); - const handle = SpeechHandle.create(); - - result._watchHandle(handle); - handle._maybeRunFinalOutput = 'ok'; - handle._markDone(); - - await result.wait(); - expect(result.finalOutput).toBe('ok'); - }); - - it('rejects run when final output type mismatches expected outputType', async () => { - const result = new RunResult({ outputType: z.number() }); - const handle = SpeechHandle.create(); - - result._watchHandle(handle); - handle._maybeRunFinalOutput = 'not a number'; - handle._markDone(); - - await expect(result.wait()).rejects.toThrow('Expected output matching provided zod schema'); - }); - - it('rejects run when final output is an error', async () => { - const result = new RunResult(); - const handle = SpeechHandle.create(); - - result._watchHandle(handle); - handle._maybeRunFinalOutput = new Error('boom'); - handle._markDone(); - - await expect(result.wait()).rejects.toThrow('boom'); - }); - - it('throws when accessing finalOutput before completion', () => { - const result = new RunResult(); - expect(() => result.finalOutput).toThrow('cannot retrieve finalOutput, RunResult is not done'); - }); -}); From 5f2173f74669d976935cf328292ba8e67c6c22ef Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Fri, 13 Feb 2026 18:08:47 -0800 Subject: [PATCH 16/21] Update basic_agent_task.ts --- examples/src/basic_agent_task.ts | 86 ++++++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 22 deletions(-) diff --git a/examples/src/basic_agent_task.ts b/examples/src/basic_agent_task.ts index 8e1a2038c..95a5140ff 100644 --- a/examples/src/basic_agent_task.ts +++ b/examples/src/basic_agent_task.ts @@ -15,26 +15,20 @@ import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; -type SurveyResult = { - name: string; - role: string; -}; - -class IntroTask extends voice.AgentTask { - constructor() { +class InfoTask extends voice.AgentTask { + constructor(info: string) { super({ - instructions: - 'Collect the user name and role. Ask concise follow-up questions if information is missing.', + instructions: `Collect the user's information. around ${info}`, + tts: 'elevenlabs/eleven_turbo_v2_5', tools: { - completeIntro: llm.tool({ - description: 'Call this after collecting the user name and role.', + saveUserInfo: llm.tool({ + description: `Save the user's ${info} to database`, parameters: z.object({ - name: z.string().describe('User name'), - role: z.string().describe('User role'), + [info]: z.string(), }), - execute: async ({ name, role }) => { - this.complete({ name, role }); - return 'Thanks, collected successfully.'; + execute: async (args) => { + this.complete(args[info] as string); + return `Thanks, collected ${info} successfully: ${args[info]}`; }, }), }, @@ -43,7 +37,7 @@ class IntroTask extends voice.AgentTask { async onEnter() { this.session.generateReply({ - userInput: 'Greet user and ask the user for their name and role', + userInput: 'Ask the user for their ${info}', }); } } @@ -53,15 +47,63 @@ class SurveyAgent extends voice.Agent { super({ instructions: 'You orchestrate a short intro survey. Speak naturally and keep the interaction brief.', + tools: { + collectUserInfo: llm.tool({ + description: 'Call this when user want to provide some information to you', + parameters: z.object({ + key: z + .string() + .describe( + 'The key of the information to collect, e.g. "name" or "role" should be no space and underscore separated', + ), + }), + execute: async ({ key }) => { + const value = await new InfoTask(key).run(); + return `Collected ${key} successfully: ${value}`; + }, + }), + transferToWeatherAgent: llm.tool({ + description: 'Call this immediately after user want to know the weather', + execute: async () => { + const agent = new voice.Agent({ + instructions: + 'You are a weather agent. You are responsible for providing the weather information to the user.', + tts: 'deepgram/aura-2', + tools: { + getWeather: llm.tool({ + description: 'Get the weather for a given location', + parameters: z.object({ + location: z.string().describe('The location to get the weather for'), + }), + execute: async ({ location }) => { + return `The weather in ${location} is sunny today.`; + }, + }), + finishWeatherConversation: llm.tool({ + description: 'Call this when you want to finish the weather conversation', + execute: async () => { + return llm.handoff({ + agent: new SurveyAgent(), + returns: 'Transfer to survey agent successfully!', + }); + }, + }), + }, + }); + + return llm.handoff({ agent, returns: "Let's start the weather conversation!" }); + }, + }), + }, }); } async onEnter() { - const task = new IntroTask(); - const result = await task.run(); + const name = await new InfoTask('name').run(); + const role = await new InfoTask('role').run(); + await this.session.say( - `Great to meet you ${result.name}. I noted your role as ${result.role}. We can continue now.`, - { addToChatCtx: true }, + `Great to meet you ${name}. I noted your role as ${role}. We can continue now.`, ); } } @@ -74,7 +116,7 @@ export default defineAgent({ const session = new voice.AgentSession({ vad: ctx.proc.userData.vad as silero.VAD, stt: new inference.STT({ model: 'deepgram/nova-3' }), - llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }), + llm: new inference.LLM({ model: 'openai/gpt-5.2' }), tts: new inference.TTS({ model: 'cartesia/sonic-3', voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc', From a4f051f92e4dd7c4afcba96d7b0d7f0e2b103892 Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Tue, 17 Feb 2026 16:17:09 -0800 Subject: [PATCH 17/21] Update basic_agent_task.ts --- examples/src/basic_agent_task.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/src/basic_agent_task.ts b/examples/src/basic_agent_task.ts index 95a5140ff..3bad8eef8 100644 --- a/examples/src/basic_agent_task.ts +++ b/examples/src/basic_agent_task.ts @@ -4,7 +4,7 @@ import { type JobContext, type JobProcess, - WorkerOptions, + ServerOptions, cli, defineAgent, inference, @@ -130,4 +130,4 @@ export default defineAgent({ }, }); -cli.runApp(new WorkerOptions({ agent: fileURLToPath(import.meta.url) })); +cli.runApp(new ServerOptions({ agent: fileURLToPath(import.meta.url) })); From f03509284d871d62b9f7861f0573d10a9840e875 Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Thu, 19 Feb 2026 23:06:48 -0800 Subject: [PATCH 18/21] use multi input stream --- agents/src/voice/agent_activity.ts | 35 ++++++++----------- plugins/phonic/src/realtime/realtime_model.ts | 1 + 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index e895888cd..61e4481d9 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -35,7 +35,7 @@ import type { TTSMetrics, VADMetrics, } from '../metrics/base.js'; -import { DeferredReadableStream } from '../stream/deferred_stream.js'; +import { MultiInputStream } from '../stream/multi_input_stream.js'; import { STT, type STTError, type SpeechEvent } from '../stt/stt.js'; import { recordRealtimeMetrics, traceTypes, tracer } from '../telemetry/index.js'; import { splitWords } from '../tokenize/basic/word.js'; @@ -113,7 +113,9 @@ export class AgentActivity implements RecognitionHooks { private q_updated: Future; private speechTasks: Set> = new Set(); private lock = new Mutex(); - private audioStream = new DeferredReadableStream(); + private audioStream = new MultiInputStream(); + private audioStreamId?: string; + // default to null as None, which maps to the default provider tool choice value private toolChoice: ToolChoice | null = null; private _preemptiveGeneration?: PreemptiveGeneration; @@ -452,23 +454,10 @@ export class AgentActivity implements RecognitionHooks { } attachAudioInput(audioStream: ReadableStream): void { - const currentDeferredStream = this.audioStream; - if (currentDeferredStream.isSourceSet) { - this.logger.debug('detaching existing audio input in agent activity'); - void currentDeferredStream.detachSource().catch((error) => { - this.logger.debug({ error }, 'error detaching existing audio input in agent activity'); - }); - } + void this.audioStream.close(); + this.audioStream = new MultiInputStream(); - /** - * We need to add a deferred ReadableStream layer on top of the audioStream from the agent session. - * The tee() operation should be applied to the deferred stream, not the original audioStream. - * This is important because teeing the original stream directly makes it very difficult—if not - * impossible—to implement stream unlock logic cleanly. - */ - // Recreate the deferred stream each attach because tee() locks the underlying readable stream. - this.audioStream = new DeferredReadableStream(); - this.audioStream.setSource(audioStream); + this.audioStreamId = this.audioStream.addInputStream(audioStream); const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee(); if (this.realtimeSession) { @@ -481,9 +470,13 @@ export class AgentActivity implements RecognitionHooks { } detachAudioInput(): void { - void this.audioStream.detachSource().catch((error) => { - this.logger.debug({ error }, 'error detaching audio input in agent activity'); - }); + if (this.audioStreamId === undefined) { + return; + } + + void this.audioStream.close(); + this.audioStream = new MultiInputStream(); + this.audioStreamId = undefined; } commitUserTurn( diff --git a/plugins/phonic/src/realtime/realtime_model.ts b/plugins/phonic/src/realtime/realtime_model.ts index 96c9a72d5..d17c9cb2a 100644 --- a/plugins/phonic/src/realtime/realtime_model.ts +++ b/plugins/phonic/src/realtime/realtime_model.ts @@ -128,6 +128,7 @@ export class RealtimeModel extends llm.RealtimeModel { // TODO @Phonic-Co: Implement tool support // Phonic has automatic tool reply generation, but tools are not supported with LiveKit Agents yet. autoToolReplyGeneration: true, + manualFunctionCalls: false, audioOutput: true, }); From 5e21fb577768435dfbcc9a4f78c52abcfb2d19a4 Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Fri, 20 Feb 2026 16:14:09 -0800 Subject: [PATCH 19/21] fix duplicated tool calls --- agents/src/llm/provider_format/utils.ts | 10 ++++++---- agents/src/voice/agent_activity.ts | 13 ++++++------- examples/src/basic_agent_task.ts | 2 +- plugins/livekit/src/turn_detector/multilingual.ts | 11 ++++++----- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/agents/src/llm/provider_format/utils.ts b/agents/src/llm/provider_format/utils.ts index 20dd8fe93..dea9e3abe 100644 --- a/agents/src/llm/provider_format/utils.ts +++ b/agents/src/llm/provider_format/utils.ts @@ -56,12 +56,14 @@ class ChatItemGroup { } removeInvalidToolCalls() { - if (this.toolCalls.length === this.toolOutputs.length) { - return; - } - const toolCallIds = new Set(this.toolCalls.map((call) => call.callId)); const toolOutputIds = new Set(this.toolOutputs.map((output) => output.callId)); + const sameIds = + toolCallIds.size === toolOutputIds.size && + [...toolCallIds].every((id) => toolOutputIds.has(id)); + if (this.toolCalls.length === this.toolOutputs.length && sameIds) { + return; + } // intersection of tool call ids and tool output ids const validCallIds = intersection(toolCallIds, toolOutputIds); diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index 61e4481d9..d3580b1ec 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -1651,13 +1651,15 @@ export class AgentActivity implements RecognitionHooks { for (const msg of toolsMessages) { msg.createdAt = replyStartedAt; } - this.agent._chatCtx.insert(toolsMessages); - // Only add FunctionCallOutput items to session history since FunctionCall items - // were already added by onToolExecutionStarted when the tool execution began + // Only insert FunctionCallOutput items into agent._chatCtx since FunctionCall items + // were already added by onToolExecutionStarted when the tool execution began. + // Inserting function_calls again would create duplicates that break provider APIs + // (e.g. Google's "function response parts != function call parts" error). const toolCallOutputs = toolsMessages.filter( (m): m is FunctionCallOutput => m.type === 'function_call_output', ); if (toolCallOutputs.length > 0) { + this.agent._chatCtx.insert(toolCallOutputs); this.agentSession._toolItemsAdded(toolCallOutputs); } } @@ -1819,15 +1821,12 @@ export class AgentActivity implements RecognitionHooks { msg.createdAt = replyStartedAt; } - this.agent._chatCtx.insert(toolMessages); - - // Only add FunctionCallOutput items to session history since FunctionCall items - // were already added by onToolExecutionStarted when the tool execution began const toolCallOutputs = toolMessages.filter( (m): m is FunctionCallOutput => m.type === 'function_call_output', ); if (toolCallOutputs.length > 0) { + this.agent._chatCtx.insert(toolCallOutputs); this.agentSession._toolItemsAdded(toolCallOutputs); } } diff --git a/examples/src/basic_agent_task.ts b/examples/src/basic_agent_task.ts index 3bad8eef8..60218b159 100644 --- a/examples/src/basic_agent_task.ts +++ b/examples/src/basic_agent_task.ts @@ -18,7 +18,7 @@ import { z } from 'zod'; class InfoTask extends voice.AgentTask { constructor(info: string) { super({ - instructions: `Collect the user's information. around ${info}`, + instructions: `Collect the user's information. around ${info}. Once you have the information, call the saveUserInfo tool to save the information to the database IMMEDIATELY. DO NOT have chitchat with the user, just collect the information and call the saveUserInfo tool.`, tts: 'elevenlabs/eleven_turbo_v2_5', tools: { saveUserInfo: llm.tool({ diff --git a/plugins/livekit/src/turn_detector/multilingual.ts b/plugins/livekit/src/turn_detector/multilingual.ts index 318e72134..592f1c7c4 100644 --- a/plugins/livekit/src/turn_detector/multilingual.ts +++ b/plugins/livekit/src/turn_detector/multilingual.ts @@ -135,11 +135,12 @@ export class MultilingualModel extends EOUModel { } function remoteInferenceUrl() { - const urlBase = process.env.LIVEKIT_REMOTE_EOT_URL; - if (!urlBase) { - return undefined; - } - return `${urlBase}/eot/multi`; + return undefined; + // const urlBase = process.env.LIVEKIT_REMOTE_EOT_URL; + // if (!urlBase) { + // return undefined; + // } + // return `${urlBase}/eot/multi`; } export default EUORunnerMultilingual; From 23e2f7a537bbcce0bc387df1d5605901805b6fb8 Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Fri, 20 Feb 2026 17:09:22 -0800 Subject: [PATCH 20/21] fix minor bugs & improve stabilities --- agents/src/cli.ts | 53 +++++++++++----------------- agents/src/ipc/job_proc_lazy_main.ts | 21 ++++++++--- agents/src/voice/agent_activity.ts | 3 ++ examples/src/basic_agent_task.ts | 3 +- 4 files changed, 41 insertions(+), 39 deletions(-) diff --git a/agents/src/cli.ts b/agents/src/cli.ts index 2cc354f7d..1e53c16c0 100644 --- a/agents/src/cli.ts +++ b/agents/src/cli.ts @@ -77,16 +77,16 @@ const runServer = async (args: CliArgs) => { * ``` */ export const runApp = (opts: ServerOptions) => { + const logLevelOption = (defaultLevel: string) => + new Option('--log-level ', 'Set the logging level') + .choices(['trace', 'debug', 'info', 'warn', 'error', 'fatal']) + .default(defaultLevel) + .env('LOG_LEVEL'); + const program = new Command() .name('agents') .description('LiveKit Agents CLI') .version(version) - .addOption( - new Option('--log-level ', 'Set the logging level') - .choices(['trace', 'debug', 'info', 'warn', 'error', 'fatal']) - .default('info') - .env('LOG_LEVEL'), - ) .addOption( new Option('--url ', 'LiveKit server or Cloud project websocket URL').env( 'LIVEKIT_URL', @@ -120,13 +120,15 @@ export const runApp = (opts: ServerOptions) => { program .command('start') .description('Start the worker in production mode') - .action(() => { - const options = program.optsWithGlobals(); - opts.wsURL = options.url || opts.wsURL; - opts.apiKey = options.apiKey || opts.apiKey; - opts.apiSecret = options.apiSecret || opts.apiSecret; - opts.logLevel = options.logLevel || opts.logLevel; - opts.workerToken = options.workerToken || opts.workerToken; + .addOption(logLevelOption('info')) + .action((...[, command]) => { + const globalOptions = program.optsWithGlobals(); + const commandOptions = command.opts(); + opts.wsURL = globalOptions.url || opts.wsURL; + opts.apiKey = globalOptions.apiKey || opts.apiKey; + opts.apiSecret = globalOptions.apiSecret || opts.apiSecret; + opts.logLevel = commandOptions.logLevel; + opts.workerToken = globalOptions.workerToken || opts.workerToken; runServer({ opts, production: true, @@ -137,19 +139,14 @@ export const runApp = (opts: ServerOptions) => { program .command('dev') .description('Start the worker in development mode') - .addOption( - new Option('--log-level ', 'Set the logging level') - .choices(['trace', 'debug', 'info', 'warn', 'error', 'fatal']) - .default('debug') - .env('LOG_LEVEL'), - ) + .addOption(logLevelOption('debug')) .action((...[, command]) => { const globalOptions = program.optsWithGlobals(); const commandOptions = command.opts(); opts.wsURL = globalOptions.url || opts.wsURL; opts.apiKey = globalOptions.apiKey || opts.apiKey; opts.apiSecret = globalOptions.apiSecret || opts.apiSecret; - opts.logLevel = commandOptions.logLevel || globalOptions.logLevel || opts.logLevel; + opts.logLevel = commandOptions.logLevel; opts.workerToken = globalOptions.workerToken || opts.workerToken; runServer({ opts, @@ -163,19 +160,14 @@ export const runApp = (opts: ServerOptions) => { .description('Connect to a specific room') .requiredOption('--room ', 'Room name to connect to') .option('--participant-identity ', 'Identity of user to listen to') - .addOption( - new Option('--log-level ', 'Set the logging level') - .choices(['trace', 'debug', 'info', 'warn', 'error', 'fatal']) - .default('debug') - .env('LOG_LEVEL'), - ) + .addOption(logLevelOption('info')) .action((...[, command]) => { const globalOptions = program.optsWithGlobals(); const commandOptions = command.opts(); opts.wsURL = globalOptions.url || opts.wsURL; opts.apiKey = globalOptions.apiKey || opts.apiKey; opts.apiSecret = globalOptions.apiSecret || opts.apiSecret; - opts.logLevel = commandOptions.logLevel || globalOptions.logLevel || opts.logLevel; + opts.logLevel = commandOptions.logLevel; opts.workerToken = globalOptions.workerToken || opts.workerToken; runServer({ opts, @@ -189,12 +181,7 @@ export const runApp = (opts: ServerOptions) => { program .command('download-files') .description('Download plugin dependency files') - .addOption( - new Option('--log-level ', 'Set the logging level') - .choices(['trace', 'debug', 'info', 'warn', 'error', 'fatal']) - .default('debug') - .env('LOG_LEVEL'), - ) + .addOption(logLevelOption('debug')) .action((...[, command]) => { const commandOptions = command.opts(); initializeLogger({ pretty: true, level: commandOptions.logLevel }); diff --git a/agents/src/ipc/job_proc_lazy_main.ts b/agents/src/ipc/job_proc_lazy_main.ts index 11fe2a0c9..2448614ee 100644 --- a/agents/src/ipc/job_proc_lazy_main.ts +++ b/agents/src/ipc/job_proc_lazy_main.ts @@ -15,6 +15,14 @@ import type { IPCMessage } from './message.js'; const ORPHANED_TIMEOUT = 15 * 1000; +const safeSend = (msg: IPCMessage): boolean => { + if (process.connected && process.send) { + process.send(msg); + return true; + } + return false; +}; + type JobTask = { ctx: JobContext; task: Promise; @@ -50,7 +58,10 @@ class InfClient implements InferenceExecutor { async doInference(method: string, data: unknown): Promise { const requestId = shortuuid('inference_job_'); - process.send!({ case: 'inferenceRequest', value: { requestId, method, data } }); + if (!safeSend({ case: 'inferenceRequest', value: { requestId, method, data } })) { + throw new Error('IPC channel closed'); + } + this.#requests[requestId] = new PendingInference(); const resp = await this.#requests[requestId]!.promise; if (resp.error) { @@ -117,7 +128,7 @@ const startJob = ( await once(closeEvent, 'close').then((close) => { logger.debug('shutting down'); shutdown = true; - process.send!({ case: 'exiting', value: { reason: close[1] } }); + safeSend({ case: 'exiting', value: { reason: close[1] } }); }); // Close the primary agent session if it exists @@ -139,7 +150,7 @@ const startJob = ( logger.error({ error }, 'error while shutting down the job'), ); - process.send!({ case: 'done' }); + safeSend({ case: 'done', value: undefined }); joinFuture.resolve(); })(); @@ -199,7 +210,7 @@ const startJob = ( logger.debug('initializing job runner'); await agent.prewarm(proc); logger.debug('job runner initialized'); - process.send({ case: 'initializeResponse' }); + safeSend({ case: 'initializeResponse', value: undefined }); let job: JobTask | undefined = undefined; const closeEvent = new EventEmitter(); @@ -213,7 +224,7 @@ const startJob = ( switch (msg.case) { case 'pingRequest': { orphanedTimeout.refresh(); - process.send!({ + safeSend({ case: 'pongResponse', value: { lastTimestamp: msg.value.timestamp, timestamp: Date.now() }, }); diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index d3580b1ec..9310459e5 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -342,6 +342,9 @@ export class AgentActivity implements RecognitionHooks { minEndpointingDelay: this.agentSession.options.minEndpointingDelay, maxEndpointingDelay: this.agentSession.options.maxEndpointingDelay, rootSpanContext: this.agentSession.rootSpanContext, + sttModel: this.stt?.label, + sttProvider: this.getSttProvider(), + getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant, }); this.audioRecognition.start(); this.started = true; diff --git a/examples/src/basic_agent_task.ts b/examples/src/basic_agent_task.ts index 60218b159..e01d24752 100644 --- a/examples/src/basic_agent_task.ts +++ b/examples/src/basic_agent_task.ts @@ -11,6 +11,7 @@ import { llm, voice, } from '@livekit/agents'; +import * as openai from '@livekit/agents-plugin-openai'; import * as silero from '@livekit/agents-plugin-silero'; import { fileURLToPath } from 'node:url'; import { z } from 'zod'; @@ -116,7 +117,7 @@ export default defineAgent({ const session = new voice.AgentSession({ vad: ctx.proc.userData.vad as silero.VAD, stt: new inference.STT({ model: 'deepgram/nova-3' }), - llm: new inference.LLM({ model: 'openai/gpt-5.2' }), + llm: new openai.responses.LLM(), tts: new inference.TTS({ model: 'cartesia/sonic-3', voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc', From 5bb9f770ef314fd6ed4cafe286741e092af9a14c Mon Sep 17 00:00:00 2001 From: Brian Yin Date: Fri, 20 Feb 2026 17:09:55 -0800 Subject: [PATCH 21/21] Update multilingual.ts --- plugins/livekit/src/turn_detector/multilingual.ts | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/plugins/livekit/src/turn_detector/multilingual.ts b/plugins/livekit/src/turn_detector/multilingual.ts index 592f1c7c4..318e72134 100644 --- a/plugins/livekit/src/turn_detector/multilingual.ts +++ b/plugins/livekit/src/turn_detector/multilingual.ts @@ -135,12 +135,11 @@ export class MultilingualModel extends EOUModel { } function remoteInferenceUrl() { - return undefined; - // const urlBase = process.env.LIVEKIT_REMOTE_EOT_URL; - // if (!urlBase) { - // return undefined; - // } - // return `${urlBase}/eot/multi`; + const urlBase = process.env.LIVEKIT_REMOTE_EOT_URL; + if (!urlBase) { + return undefined; + } + return `${urlBase}/eot/multi`; } export default EUORunnerMultilingual;