6 changes: 6 additions & 0 deletions .changeset/brave-nights-shout.md
@@ -0,0 +1,6 @@
---
'@tanstack/ai-client': patch
'@tanstack/ai': patch
---

feat: Add multimodal UIMessage support for images, audio, video, and documents
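
For context, a minimal sketch of the kind of part this patch makes expressible. The `ContentPartSource` stand-in below is an assumption; the real type is defined in `@tanstack/ai` and a data-URL string is used here purely for illustration:

```ts
// Sketch only: a local stand-in mirroring the new ImageMessagePart from this
// PR. The real ContentPartSource shape lives in @tanstack/ai; a data-URL
// string is assumed here purely for illustration.
type ContentPartSource = string

interface ImageMessagePart<TMetadata = unknown> {
  type: 'image'
  source: ContentPartSource
  metadata?: TMetadata
}

// OpenAI-style detail metadata, as referenced in the PR's doc comments.
const imagePart: ImageMessagePart<{ detail: 'auto' | 'low' | 'high' }> = {
  type: 'image',
  source: 'data:image/png;base64,iVBORw0KGgo...',
  metadata: { detail: 'high' },
}
```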
5 changes: 5 additions & 0 deletions packages/typescript/ai-client/src/index.ts
@@ -7,6 +7,11 @@ export type {
ToolCallPart,
ToolResultPart,
ThinkingPart,
// Multimodal message parts
ImageMessagePart,
AudioMessagePart,
VideoMessagePart,
DocumentMessagePart,
// Client configuration types
ChatClientOptions,
ChatRequestBody,
53 changes: 53 additions & 0 deletions packages/typescript/ai-client/src/types.ts
@@ -1,6 +1,7 @@
import type {
AnyClientTool,
ChunkStrategy,
ContentPartSource,
InferToolInput,
InferToolOutput,
ModelMessage,
@@ -114,11 +115,63 @@ export interface ThinkingPart {
content: string
}

/**
* Image content part for UIMessage.
* Preserves image data during ModelMessage <-> UIMessage conversions.
* @template TMetadata - Provider-specific metadata type (e.g., OpenAI's detail level)
*/
export interface ImageMessagePart<TMetadata = unknown> {
type: 'image'
source: ContentPartSource
/** Provider-specific metadata (e.g., OpenAI's detail: 'auto' | 'low' | 'high') */
metadata?: TMetadata
}

/**
* Audio content part for UIMessage.
* Preserves audio data during ModelMessage <-> UIMessage conversions.
* @template TMetadata - Provider-specific metadata type
*/
export interface AudioMessagePart<TMetadata = unknown> {
type: 'audio'
source: ContentPartSource
/** Provider-specific metadata (e.g., format, sample rate) */
metadata?: TMetadata
}

/**
* Video content part for UIMessage.
* Preserves video data during ModelMessage <-> UIMessage conversions.
* @template TMetadata - Provider-specific metadata type
*/
export interface VideoMessagePart<TMetadata = unknown> {
type: 'video'
source: ContentPartSource
/** Provider-specific metadata (e.g., duration, resolution) */
metadata?: TMetadata
}

/**
* Document content part for UIMessage (e.g., PDFs).
* Preserves document data during ModelMessage <-> UIMessage conversions.
* @template TMetadata - Provider-specific metadata type (e.g., Anthropic's media_type)
*/
export interface DocumentMessagePart<TMetadata = unknown> {
type: 'document'
source: ContentPartSource
/** Provider-specific metadata (e.g., media_type for PDFs) */
metadata?: TMetadata
}

export type MessagePart<TTools extends ReadonlyArray<AnyClientTool> = any> =
| TextPart
| ToolCallPart<TTools>
| ToolResultPart
| ThinkingPart
| ImageMessagePart
| AudioMessagePart
| VideoMessagePart
| DocumentMessagePart

/**
* UIMessage - Domain-specific message format optimized for building chat UIs
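With the union widened, downstream UIs can branch on the new tags. A sketch, assuming `MessagePart` is exported from the package index alongside the part types above (the `'tool-call'` and `'tool-result'` tags appear in the conversion code below):

```ts
import type { MessagePart } from '@tanstack/ai-client'

// Minimal renderer sketch over the widened union; attachment parts are
// summarized rather than rendered.
function describePart(part: MessagePart): string {
  switch (part.type) {
    case 'text':
      return part.content
    case 'image':
      return '[image attachment]'
    case 'audio':
      return '[audio attachment]'
    case 'video':
      return '[video attachment]'
    case 'document':
      return '[document attachment]'
    case 'tool-call':
      return '[tool call]'
    case 'tool-result':
      return '[tool result]'
    default:
      // thinking parts (and any future tags) fall through here
      return '[other part]'
  }
}
```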
143 changes: 132 additions & 11 deletions packages/typescript/ai/src/activities/chat/messages.ts
@@ -1,11 +1,15 @@
import type {
AudioMessagePart,
ContentPart,
DocumentMessagePart,
ImageMessagePart,
MessagePart,
ModelMessage,
TextPart,
ToolCallPart,
ToolResultPart,
UIMessage,
VideoMessagePart,
} from '../../types'
// ===========================
// Message Converters
@@ -29,6 +33,58 @@ function getTextContent(content: string | null | Array<ContentPart>): string {
.join('')
}

/**
* Convert ContentPart array to MessagePart array
* Preserves all multimodal content (text, image, audio, video, document)
*/
function contentPartsToMessageParts(
Contributor: I'm a bit confused, don't these two types match identically? From what I see, you're just copying the old data into the new one?
contentParts: Array<ContentPart>,
): Array<MessagePart> {
const messageParts: Array<MessagePart> = []

for (const part of contentParts) {
switch (part.type) {
case 'text':
messageParts.push({
type: 'text',
content: part.content,
...(part.metadata !== undefined && { metadata: part.metadata }),
} as TextPart)
break
case 'image':
messageParts.push({
type: 'image',
source: part.source,
...(part.metadata !== undefined && { metadata: part.metadata }),
} as ImageMessagePart)
break
case 'audio':
messageParts.push({
type: 'audio',
source: part.source,
...(part.metadata !== undefined && { metadata: part.metadata }),
} as AudioMessagePart)
break
case 'video':
messageParts.push({
type: 'video',
source: part.source,
...(part.metadata !== undefined && { metadata: part.metadata }),
} as VideoMessagePart)
break
case 'document':
messageParts.push({
type: 'document',
source: part.source,
...(part.metadata !== undefined && { metadata: part.metadata }),
} as DocumentMessagePart)
break
}
}

return messageParts
}

/**
* Convert UIMessages or ModelMessages to ModelMessages
*/
@@ -52,7 +108,8 @@ export function convertMessagesToModelMessages(
* Convert a UIMessage to ModelMessage(s)
*
* This conversion handles the parts-based structure:
* - Text parts → content field
* - Text parts → content field (string or ContentPart[])
* - Multimodal parts (image, audio, video, document) → ContentPart[]
* - ToolCall parts → toolCalls array
* - ToolResult parts → separate role="tool" messages
*
@@ -72,12 +129,24 @@ export function uiMessageToModelMessages(
// Separate parts by type
// Note: thinking parts are UI-only and not included in ModelMessages
const textParts: Array<TextPart> = []
const imageParts: Array<ImageMessagePart> = []
const audioParts: Array<AudioMessagePart> = []
const videoParts: Array<VideoMessagePart> = []
const documentParts: Array<DocumentMessagePart> = []
const toolCallParts: Array<ToolCallPart> = []
const toolResultParts: Array<ToolResultPart> = []

for (const part of uiMessage.parts) {
if (part.type === 'text') {
textParts.push(part)
} else if (part.type === 'image') {
imageParts.push(part)
} else if (part.type === 'audio') {
audioParts.push(part)
} else if (part.type === 'video') {
videoParts.push(part)
} else if (part.type === 'document') {
documentParts.push(part)
} else if (part.type === 'tool-call') {
toolCallParts.push(part)
} else if (part.type === 'tool-result') {
@@ -86,8 +155,55 @@
// thinking parts are skipped - they're UI-only
}

// Build the main message (user or assistant)
const content = textParts.map((p) => p.content).join('') || null
const hasMultimodalContent =
imageParts.length > 0 ||
audioParts.length > 0 ||
videoParts.length > 0 ||
documentParts.length > 0

// Build the content field - use ContentPart[] if multimodal, string otherwise
let content: string | null | Array<ContentPart>
if (hasMultimodalContent) {
const contentParts: Array<ContentPart> = []
for (const part of uiMessage.parts) {
Contributor: This could've used the utility function you created? Is there some weird drift in types that causes it to fail due to some undefined check? It should be mappable 1:1.
if (part.type === 'text') {
contentParts.push({
type: 'text',
content: part.content,
...(part.metadata !== undefined && { metadata: part.metadata }),
})
} else if (part.type === 'image') {
contentParts.push({
type: 'image',
source: part.source,
...(part.metadata !== undefined && { metadata: part.metadata }),
})
} else if (part.type === 'audio') {
contentParts.push({
type: 'audio',
source: part.source,
...(part.metadata !== undefined && { metadata: part.metadata }),
})
} else if (part.type === 'video') {
contentParts.push({
type: 'video',
source: part.source,
...(part.metadata !== undefined && { metadata: part.metadata }),
})
} else if (part.type === 'document') {
contentParts.push({
type: 'document',
source: part.source,
...(part.metadata !== undefined && { metadata: part.metadata }),
})
}
}
content = contentParts.length > 0 ? contentParts : null
} else {
// Text-only: use simple string
content = textParts.map((p) => p.content).join('') || null
}

const toolCalls =
toolCallParts.length > 0
? toolCallParts
@@ -144,7 +260,7 @@ export function uiMessageToModelMessages(
* Convert a ModelMessage to UIMessage
*
* This conversion creates a parts-based structure:
* - content field → TextPart
* - content field → TextPart (for string) or multimodal MessageParts (for ContentPart[])
* - toolCalls array → ToolCallPart[]
* - role="tool" messages should be converted separately and merged
*
@@ -158,13 +274,18 @@ export function modelMessageToUIMessage(
): UIMessage {
const parts: Array<MessagePart> = []

// Handle content (convert multimodal content to text for UI)
const textContent = getTextContent(modelMessage.content)
if (textContent) {
parts.push({
type: 'text',
content: textContent,
})
// Handle content - preserve multimodal content
if (modelMessage.content !== null) {
if (typeof modelMessage.content === 'string') {
if (modelMessage.content) {
parts.push({
type: 'text',
content: modelMessage.content,
})
}
} else if (Array.isArray(modelMessage.content)) {
parts.push(...contentPartsToMessageParts(modelMessage.content))
}
}

// Handle tool calls
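To make the new mapping concrete, a small before/after sketch as data. Field names beyond `role`, `parts`, and `content` are omitted, and the string source is a stand-in for a real `ContentPartSource`:

```ts
// A UIMessage mixing text and an image...
const uiMessage = {
  role: 'user',
  parts: [
    { type: 'text', content: 'What is in this picture?' },
    { type: 'image', source: 'data:image/png;base64,iVBORw0KGgo...' },
  ],
}

// ...now converts to a ModelMessage whose content is a ContentPart[] rather
// than a flattened string, so the image survives the round trip:
const expectedModelMessage = {
  role: 'user',
  content: [
    { type: 'text', content: 'What is in this picture?' },
    { type: 'image', source: 'data:image/png;base64,iVBORw0KGgo...' },
  ],
}
```

Text-only messages keep the old behavior: their parts are joined into a single string, or null when empty.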
32 changes: 32 additions & 0 deletions packages/typescript/ai/src/types.ts
@@ -280,11 +280,43 @@ export interface ThinkingPart {
content: string
}

export interface ImageMessagePart<TMetadata = unknown> {
type: 'image'
source: ContentPartSource
/** Provider-specific metadata (e.g., OpenAI's detail: 'auto' | 'low' | 'high') */
metadata?: TMetadata
}

export interface AudioMessagePart<TMetadata = unknown> {
type: 'audio'
source: ContentPartSource
/** Provider-specific metadata (e.g., format, sample rate) */
metadata?: TMetadata
}

export interface VideoMessagePart<TMetadata = unknown> {
type: 'video'
source: ContentPartSource
/** Provider-specific metadata (e.g., duration, resolution) */
metadata?: TMetadata
}

export interface DocumentMessagePart<TMetadata = unknown> {
type: 'document'
source: ContentPartSource
/** Provider-specific metadata (e.g., media_type for PDFs) */
metadata?: TMetadata
}

export type MessagePart =
| TextPart
| ToolCallPart
| ToolResultPart
| ThinkingPart
| ImageMessagePart
| AudioMessagePart
| VideoMessagePart
| DocumentMessagePart

/**
* UIMessage - Domain-specific message format optimized for building chat UIs
Expand Down
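Finally, a sketch of how the metadata generics are meant to be instantiated per provider. The metadata shapes below are assumptions drawn from the doc comments, and re-export of these types from the `@tanstack/ai` root is also assumed:

```ts
import type {
  ContentPartSource,
  DocumentMessagePart,
  ImageMessagePart,
} from '@tanstack/ai'

// Assumed provider metadata shapes, for illustration only.
type OpenAIImageMeta = { detail: 'auto' | 'low' | 'high' }
type AnthropicDocMeta = { media_type: 'application/pdf' }

// declare-only: constructing a ContentPartSource is out of scope here.
declare const source: ContentPartSource

const image: ImageMessagePart<OpenAIImageMeta> = {
  type: 'image',
  source,
  metadata: { detail: 'low' },
}

const pdf: DocumentMessagePart<AnthropicDocMeta> = {
  type: 'document',
  source,
  metadata: { media_type: 'application/pdf' },
}
```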