diff --git a/.changeset/brave-nights-shout.md b/.changeset/brave-nights-shout.md
new file mode 100644
index 00000000..0098d79d
--- /dev/null
+++ b/.changeset/brave-nights-shout.md
@@ -0,0 +1,6 @@
+---
+'@tanstack/ai-client': patch
+'@tanstack/ai': patch
+---
+
+feat: Add multimodal UIMessage support for images, audio, video, and documents
diff --git a/packages/typescript/ai-client/src/index.ts b/packages/typescript/ai-client/src/index.ts
index 5bc664c0..0e9834fe 100644
--- a/packages/typescript/ai-client/src/index.ts
+++ b/packages/typescript/ai-client/src/index.ts
@@ -7,6 +7,11 @@ export type {
   ToolCallPart,
   ToolResultPart,
   ThinkingPart,
+  // Multimodal message parts
+  ImageMessagePart,
+  AudioMessagePart,
+  VideoMessagePart,
+  DocumentMessagePart,
   // Client configuration types
   ChatClientOptions,
   ChatRequestBody,
diff --git a/packages/typescript/ai-client/src/types.ts b/packages/typescript/ai-client/src/types.ts
index 4f83debb..12bb9e5a 100644
--- a/packages/typescript/ai-client/src/types.ts
+++ b/packages/typescript/ai-client/src/types.ts
@@ -1,6 +1,7 @@
 import type {
   AnyClientTool,
   ChunkStrategy,
+  ContentPartSource,
   InferToolInput,
   InferToolOutput,
   ModelMessage,
@@ -114,11 +115,63 @@ export interface ThinkingPart {
   content: string
 }
 
+/**
+ * Image content part for UIMessage.
+ * Preserves image data during ModelMessage <-> UIMessage conversions.
+ * @template TMetadata - Provider-specific metadata type (e.g., OpenAI's detail level)
+ */
+export interface ImageMessagePart<TMetadata = unknown> {
+  type: 'image'
+  source: ContentPartSource
+  /** Provider-specific metadata (e.g., OpenAI's detail: 'auto' | 'low' | 'high') */
+  metadata?: TMetadata
+}
+
+/**
+ * Audio content part for UIMessage.
+ * Preserves audio data during ModelMessage <-> UIMessage conversions.
+ * @template TMetadata - Provider-specific metadata type
+ */
+export interface AudioMessagePart<TMetadata = unknown> {
+  type: 'audio'
+  source: ContentPartSource
+  /** Provider-specific metadata (e.g., format, sample rate) */
+  metadata?: TMetadata
+}
+
+/**
+ * Video content part for UIMessage.
+ * Preserves video data during ModelMessage <-> UIMessage conversions.
+ * @template TMetadata - Provider-specific metadata type
+ */
+export interface VideoMessagePart<TMetadata = unknown> {
+  type: 'video'
+  source: ContentPartSource
+  /** Provider-specific metadata (e.g., duration, resolution) */
+  metadata?: TMetadata
+}
+
+/**
+ * Document content part for UIMessage (e.g., PDFs).
+ * Preserves document data during ModelMessage <-> UIMessage conversions.
+ * @template TMetadata - Provider-specific metadata type (e.g., Anthropic's media_type)
+ */
+export interface DocumentMessagePart<TMetadata = unknown> {
+  type: 'document'
+  source: ContentPartSource
+  /** Provider-specific metadata (e.g., media_type for PDFs) */
+  metadata?: TMetadata
+}
+
 export type MessagePart<TTools extends ReadonlyArray<AnyClientTool> = any> =
   | TextPart
   | ToolCallPart
   | ToolResultPart
   | ThinkingPart
+  | ImageMessagePart
+  | AudioMessagePart
+  | VideoMessagePart
+  | DocumentMessagePart
 
 /**
  * UIMessage - Domain-specific message format optimized for building chat UIs
diff --git a/packages/typescript/ai/src/activities/chat/messages.ts b/packages/typescript/ai/src/activities/chat/messages.ts
index 14c8dc62..607cb742 100644
--- a/packages/typescript/ai/src/activities/chat/messages.ts
+++ b/packages/typescript/ai/src/activities/chat/messages.ts
@@ -1,11 +1,15 @@
 import type {
+  AudioMessagePart,
   ContentPart,
+  DocumentMessagePart,
+  ImageMessagePart,
   MessagePart,
   ModelMessage,
   TextPart,
   ToolCallPart,
   ToolResultPart,
   UIMessage,
+  VideoMessagePart,
 } from '../../types'
 // ===========================
 // Message Converters
@@ -29,6 +33,58 @@ function getTextContent(content: string | null | Array<ContentPart>): string {
     .join('')
 }
 
+/**
+ * Convert ContentPart array to MessagePart array
+ * Preserves all multimodal content (text, image, audio, video, document)
+ */
+function contentPartsToMessageParts(
+  contentParts: Array<ContentPart>,
+): Array<MessagePart> {
+  const messageParts: Array<MessagePart> = []
+
+  for (const part of contentParts) {
+    switch (part.type) {
+      case 'text':
+        messageParts.push({
+          type: 'text',
+          content: part.content,
+          ...(part.metadata !== undefined && { metadata: part.metadata }),
+        } as TextPart)
+        break
+      case 'image':
+        messageParts.push({
+          type: 'image',
+          source: part.source,
+          ...(part.metadata !== undefined && { metadata: part.metadata }),
+        } as ImageMessagePart)
+        break
+      case 'audio':
+        messageParts.push({
+          type: 'audio',
+          source: part.source,
+          ...(part.metadata !== undefined && { metadata: part.metadata }),
+        } as AudioMessagePart)
+        break
+      case 'video':
+        messageParts.push({
+          type: 'video',
+          source: part.source,
+          ...(part.metadata !== undefined && { metadata: part.metadata }),
+        } as VideoMessagePart)
+        break
+      case 'document':
+        messageParts.push({
+          type: 'document',
+          source: part.source,
+          ...(part.metadata !== undefined && { metadata: part.metadata }),
+        } as DocumentMessagePart)
+        break
+    }
+  }
+
+  return messageParts
+}
+
 /**
  * Convert UIMessages or ModelMessages to ModelMessages
  */
@@ -52,7 +108,8 @@ export function convertMessagesToModelMessages(
  * Convert a UIMessage to ModelMessage(s)
  *
  * This conversion handles the parts-based structure:
- * - Text parts → content field
+ * - Text parts → content field (string or ContentPart[])
+ * - Multimodal parts (image, audio, video, document) → ContentPart[]
  * - ToolCall parts → toolCalls array
  * - ToolResult parts → separate role="tool" messages
  *
@@ -72,12 +129,24 @@ export function uiMessageToModelMessages(
   // Separate parts by type
   // Note: thinking parts are UI-only and not included in ModelMessages
   const textParts: Array<TextPart> = []
+  const imageParts: Array<ImageMessagePart> = []
+  const audioParts: Array<AudioMessagePart> = []
+  const videoParts: Array<VideoMessagePart> = []
+  const documentParts: Array<DocumentMessagePart> = []
   const toolCallParts: Array<ToolCallPart> = []
   const toolResultParts: Array<ToolResultPart> = []
 
   for (const part of uiMessage.parts) {
     if (part.type === 'text') {
       textParts.push(part)
+    } else if (part.type === 'image') {
+      imageParts.push(part)
+    } else if (part.type === 'audio') {
+      audioParts.push(part)
+    } else if (part.type === 'video') {
+      videoParts.push(part)
+    } else if (part.type === 'document') {
+      documentParts.push(part)
     } else if (part.type === 'tool-call') {
       toolCallParts.push(part)
     } else if (part.type === 'tool-result') {
@@ -86,8 +155,55 @@
     // thinking parts are skipped - they're UI-only
   }
 
-  // Build the main message (user or assistant)
-  const content = textParts.map((p) => p.content).join('') || null
+  const hasMultimodalContent =
+    imageParts.length > 0 ||
+    audioParts.length > 0 ||
+    videoParts.length > 0 ||
+    documentParts.length > 0
+
+  // Build the content field - use ContentPart[] if multimodal, string otherwise
+  let content: string | null | Array<ContentPart>
+  if (hasMultimodalContent) {
+    const contentParts: Array<ContentPart> = []
+    for (const part of uiMessage.parts) {
+      if (part.type === 'text') {
+        contentParts.push({
+          type: 'text',
+          content: part.content,
+          ...(part.metadata !== undefined && { metadata: part.metadata }),
+        })
+      } else if (part.type === 'image') {
+        contentParts.push({
+          type: 'image',
+          source: part.source,
+          ...(part.metadata !== undefined && { metadata: part.metadata }),
+        })
+      } else if (part.type === 'audio') {
+        contentParts.push({
+          type: 'audio',
+          source: part.source,
+          ...(part.metadata !== undefined && { metadata: part.metadata }),
+        })
+      } else if (part.type === 'video') {
+        contentParts.push({
+          type: 'video',
+          source: part.source,
+          ...(part.metadata !== undefined && { metadata: part.metadata }),
+        })
+      } else if (part.type === 'document') {
+        contentParts.push({
+          type: 'document',
+          source: part.source,
+          ...(part.metadata !== undefined && { metadata: part.metadata }),
+        })
+      }
+    }
+    content = contentParts.length > 0 ? contentParts : null
+  } else {
+    // Text-only: use simple string
+    content = textParts.map((p) => p.content).join('') || null
+  }
+
   const toolCalls =
     toolCallParts.length > 0
       ? toolCallParts
@@ -144,7 +260,7 @@
  * Convert a ModelMessage to UIMessage
  *
  * This conversion creates a parts-based structure:
- * - content field → TextPart
+ * - content field → TextPart (for string) or multimodal MessageParts (for ContentPart[])
  * - toolCalls array → ToolCallPart[]
  * - role="tool" messages should be converted separately and merged
  *
@@ -158,13 +274,18 @@ export function modelMessageToUIMessage(
 ): UIMessage {
   const parts: Array<MessagePart> = []
 
-  // Handle content (convert multimodal content to text for UI)
-  const textContent = getTextContent(modelMessage.content)
-  if (textContent) {
-    parts.push({
-      type: 'text',
-      content: textContent,
-    })
+  // Handle content - preserve multimodal content
+  if (modelMessage.content !== null) {
+    if (typeof modelMessage.content === 'string') {
+      if (modelMessage.content) {
+        parts.push({
+          type: 'text',
+          content: modelMessage.content,
+        })
+      }
+    } else if (Array.isArray(modelMessage.content)) {
+      parts.push(...contentPartsToMessageParts(modelMessage.content))
+    }
   }
 
   // Handle tool calls
diff --git a/packages/typescript/ai/src/types.ts b/packages/typescript/ai/src/types.ts
index 7c49d995..a4151f91 100644
--- a/packages/typescript/ai/src/types.ts
+++ b/packages/typescript/ai/src/types.ts
@@ -280,11 +280,43 @@ export interface ThinkingPart {
   content: string
 }
 
+export interface ImageMessagePart<TMetadata = unknown> {
+  type: 'image'
+  source: ContentPartSource
+  /** Provider-specific metadata (e.g., OpenAI's detail: 'auto' | 'low' | 'high') */
+  metadata?: TMetadata
+}
+
+export interface AudioMessagePart<TMetadata = unknown> {
+  type: 'audio'
+  source: ContentPartSource
+  /** Provider-specific metadata (e.g., format, sample rate) */
+  metadata?: TMetadata
+}
+
+export interface VideoMessagePart<TMetadata = unknown> {
+  type: 'video'
+  source: ContentPartSource
+  /** Provider-specific metadata (e.g., duration, resolution) */
+  metadata?: TMetadata
+}
+
+export interface DocumentMessagePart<TMetadata = unknown> {
+  type: 'document'
+  source: ContentPartSource
+  /** Provider-specific metadata (e.g., media_type for PDFs) */
+  metadata?: TMetadata
+}
+
 export type MessagePart =
   | TextPart
   | ToolCallPart
   | ToolResultPart
   | ThinkingPart
+  | ImageMessagePart
+  | AudioMessagePart
+  | VideoMessagePart
+  | DocumentMessagePart
 
 /**
  * UIMessage - Domain-specific message format optimized for building chat UIs
diff --git a/packages/typescript/ai/tests/messages.test.ts b/packages/typescript/ai/tests/messages.test.ts
new file mode 100644
index 00000000..4cde2b60
--- /dev/null
+++ b/packages/typescript/ai/tests/messages.test.ts
@@ -0,0 +1,190 @@
+import { describe, expect, it } from 'vitest'
+import {
+  modelMessageToUIMessage,
+  uiMessageToModelMessages,
+} from '../src/activities/chat/messages'
+import type { ModelMessage, UIMessage } from '../src/types'
+
+describe('message converters', () => {
+  describe('modelMessageToUIMessage', () => {
+    it('should preserve text content', () => {
+      const modelMessage: ModelMessage = {
+        role: 'user',
+        content: 'Hello world',
+      }
+
+      const uiMessage = modelMessageToUIMessage(modelMessage)
+
+      expect(uiMessage.parts).toHaveLength(1)
+      expect(uiMessage.parts[0]).toEqual({
+        type: 'text',
+        content: 'Hello world',
+      })
+    })
+
+    it('should preserve multimodal content', () => {
+      const modelMessage: ModelMessage = {
+        role: 'user',
+        content: [
+          { type: 'text', content: 'What is in this image?' },
+          {
+            type: 'image',
+            source: { type: 'url', value: 'https://example.com/image.jpg' },
+          },
+        ],
+      }
+
+      const uiMessage = modelMessageToUIMessage(modelMessage)
+
+      expect(uiMessage.parts).toHaveLength(2)
+      expect(uiMessage.parts[0]).toEqual({
+        type: 'text',
+        content: 'What is in this image?',
+      })
+      expect(uiMessage.parts[1]).toEqual({
+        type: 'image',
+        source: { type: 'url', value: 'https://example.com/image.jpg' },
+      })
+    })
+
+    it('should preserve all multimodal types', () => {
+      const modelMessage: ModelMessage = {
+        role: 'user',
+        content: [
+          { type: 'text', content: 'Check these:' },
+          { type: 'image', source: { type: 'data', value: 'base64img' } },
+          {
+            type: 'audio',
+            source: { type: 'url', value: 'https://example.com/audio.mp3' },
+          },
+          {
+            type: 'video',
+            source: { type: 'url', value: 'https://example.com/video.mp4' },
+          },
+          { type: 'document', source: { type: 'data', value: 'base64pdf' } },
+        ],
+      }
+
+      const uiMessage = modelMessageToUIMessage(modelMessage)
+
+      expect(uiMessage.parts).toHaveLength(5)
+      expect(uiMessage.parts[0]?.type).toBe('text')
+      expect(uiMessage.parts[1]?.type).toBe('image')
+      expect(uiMessage.parts[2]?.type).toBe('audio')
+      expect(uiMessage.parts[3]?.type).toBe('video')
+      expect(uiMessage.parts[4]?.type).toBe('document')
+    })
+
+    it('should preserve metadata', () => {
+      const modelMessage: ModelMessage = {
+        role: 'user',
+        content: [
+          {
+            type: 'image',
+            source: { type: 'url', value: 'https://example.com/img.jpg' },
+            metadata: { detail: 'high' },
+          },
+        ],
+      }
+
+      const uiMessage = modelMessageToUIMessage(modelMessage)
+
+      expect(uiMessage.parts[0]).toEqual({
+        type: 'image',
+        source: { type: 'url', value: 'https://example.com/img.jpg' },
+        metadata: { detail: 'high' },
+      })
+    })
+  })
+
+  describe('uiMessageToModelMessages', () => {
+    it('should convert text-only UIMessage to string content', () => {
+      const uiMessage: UIMessage = {
+        id: 'msg-1',
+        role: 'user',
+        parts: [{ type: 'text', content: 'Hello world' }],
+      }
+
+      const modelMessages = uiMessageToModelMessages(uiMessage)
+
+      expect(modelMessages).toHaveLength(1)
+      expect(modelMessages[0]?.content).toBe('Hello world')
+    })
+
+    it('should convert multimodal UIMessage to ContentPart array', () => {
+      const uiMessage: UIMessage = {
+        id: 'msg-1',
+        role: 'user',
+        parts: [
+          { type: 'text', content: 'What is this?' },
+          {
+            type: 'image',
+            source: { type: 'url', value: 'https://example.com/img.jpg' },
+          },
+        ],
+      }
+
+      const modelMessages = uiMessageToModelMessages(uiMessage)
+
+      expect(modelMessages).toHaveLength(1)
+      expect(Array.isArray(modelMessages[0]?.content)).toBe(true)
+      expect(modelMessages[0]?.content).toHaveLength(2)
+    })
+
+    it('should preserve part order during conversion', () => {
+      const uiMessage: UIMessage = {
+        id: 'msg-1',
+        role: 'user',
+        parts: [
+          {
+            type: 'image',
+            source: { type: 'url', value: 'https://example.com/1.jpg' },
+          },
+          { type: 'text', content: 'Middle text' },
+          {
+            type: 'image',
+            source: { type: 'url', value: 'https://example.com/2.jpg' },
+          },
+        ],
+      }
+
+      const modelMessages = uiMessageToModelMessages(uiMessage)
+      const content = modelMessages[0]?.content as Array<any>
+
+      expect(content[0]?.type).toBe('image')
+      expect(content[1]?.type).toBe('text')
+      expect(content[2]?.type).toBe('image')
+    })
+  })
+
+  describe('round-trip conversion', () => {
+    it('should preserve multimodal content through round-trip', () => {
+      const original: ModelMessage = {
+        role: 'user',
+        content: [
+          { type: 'text', content: 'Describe this image' },
+          {
+            type: 'image',
+            source: { type: 'url', value: 'https://example.com/photo.jpg' },
+          },
+        ],
+      }
+
+      const uiMessage = modelMessageToUIMessage(original)
+      const [converted] = uiMessageToModelMessages(uiMessage)
+
+      expect(converted?.role).toBe('user')
+      expect(Array.isArray(converted?.content)).toBe(true)
+      const content = converted?.content as Array<any>
+      expect(content).toHaveLength(2)
+      expect(content[0]).toEqual({
+        type: 'text',
+        content: 'Describe this image',
+      })
+      expect(content[1]).toEqual({
+        type: 'image',
+        source: { type: 'url', value: 'https://example.com/photo.jpg' },
+      })
+    })
+  })
+})
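
Usage sketch (illustrative, not part of the diff): with these changes, `uiMessageToModelMessages` emits a `ContentPart` array whenever a non-text part is present, and keeps the plain-string `content` for text-only messages. The import path mirrors the test file above rather than the package's public exports; the base64 value and `media_type` metadata are hypothetical placeholders.

```ts
import { uiMessageToModelMessages } from '../src/activities/chat/messages'
import type { UIMessage } from '../src/types'

const multimodal: UIMessage = {
  id: 'msg-1',
  role: 'user',
  parts: [
    { type: 'text', content: 'Summarize this document' },
    {
      type: 'document',
      source: { type: 'data', value: 'base64pdf' }, // hypothetical base64 payload
      metadata: { media_type: 'application/pdf' }, // provider-specific, per the comments above
    },
  ],
}

// A non-text part is present, so content becomes ContentPart[] (order preserved)
const [modelMessage] = uiMessageToModelMessages(multimodal)
// modelMessage.content => [{ type: 'text', ... }, { type: 'document', ... }]

// Text-only messages still collapse to a joined string
const [textOnly] = uiMessageToModelMessages({
  id: 'msg-2',
  role: 'user',
  parts: [{ type: 'text', content: 'Hello world' }],
})
// textOnly.content === 'Hello world'
```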
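And the reverse direction (also illustrative): `modelMessageToUIMessage` now maps each `ContentPart` to its corresponding message part instead of flattening everything to text, so provider metadata such as OpenAI's image `detail` survives a round trip.

```ts
import {
  modelMessageToUIMessage,
  uiMessageToModelMessages,
} from '../src/activities/chat/messages'
import type { ModelMessage } from '../src/types'

const original: ModelMessage = {
  role: 'user',
  content: [
    { type: 'text', content: 'What is in this image?' },
    {
      type: 'image',
      source: { type: 'url', value: 'https://example.com/image.jpg' },
      metadata: { detail: 'high' }, // e.g., OpenAI detail level
    },
  ],
}

const ui = modelMessageToUIMessage(original)
// ui.parts => [TextPart, ImageMessagePart], metadata intact

// Round-tripping yields an equivalent ModelMessage (thinking parts aside,
// which are UI-only and dropped by design)
const [roundTripped] = uiMessageToModelMessages(ui)
// roundTripped.content deep-equals original.content
```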