From 1d908bd3732a1188b1d2fe965a283d6aa8fa6682 Mon Sep 17 00:00:00 2001 From: OpenClaw Date: Mon, 9 Feb 2026 23:24:55 +0800 Subject: [PATCH] feat: add vision/image support via stream-json stdin + CLI isolation Adds multimodal (text + image) support for OpenAI-compatible clients like Browser Use that send base64 screenshots via the chat completions endpoint. Changes: - Support OpenAI content arrays with text and image_url parts - Auto-detect images and switch to --input-format stream-json mode (text-only requests still use the fast CLI argument path) - Convert data URI images to Claude CLI base64 format via stdin piping - Strip code fences from model responses (Claude wraps JSON in fences) - Isolate CLI subprocess: --tools "", --disable-slash-commands, --setting-sources "", --system-prompt override, cwd /tmp - Bump body limit to 50mb for base64 image payloads Tested with Browser Use v0.11.9 running vision-enabled browser automation tasks (screenshots piped as base64 PNG). Co-Authored-By: Claude Opus 4.6 --- src/adapter/cli-to-openai.ts | 29 +++++- src/adapter/openai-to-cli.ts | 183 +++++++++++++++++++++++++++++++---- src/server/index.ts | 2 +- src/server/routes.ts | 2 + src/subprocess/manager.ts | 54 ++++++++--- src/types/openai.ts | 8 +- 6 files changed, 243 insertions(+), 35 deletions(-) diff --git a/src/adapter/cli-to-openai.ts b/src/adapter/cli-to-openai.ts index 1e43eab..20d6efc 100644 --- a/src/adapter/cli-to-openai.ts +++ b/src/adapter/cli-to-openai.ts @@ -5,6 +5,33 @@ import type { ClaudeCliAssistant, ClaudeCliResult } from "../types/claude-cli.js"; import type { OpenAIChatResponse, OpenAIChatChunk } from "../types/openai.js"; +/** + * Extract JSON content from model response. + * Claude often outputs prose before/after JSON in code fences. + * This extracts the last JSON code fence block, or falls back to + * finding a raw JSON object/array in the text. + */ +function stripCodeFences(text: string): string { + // Find all code fence blocks and take the last one (most likely the JSON) + const fenceMatches = [...text.matchAll(/```(?:\w*)\n([\s\S]*?)\n```/g)]; + if (fenceMatches.length > 0) { + return fenceMatches[fenceMatches.length - 1][1]; + } + + // Try to extract a raw JSON object or array + const jsonMatch = text.match(/(\{[\s\S]*\}|\[[\s\S]*\])/); + if (jsonMatch) { + try { + JSON.parse(jsonMatch[1]); + return jsonMatch[1]; + } catch { + // Not valid JSON, return original + } + } + + return text; +} + /** * Extract text content from Claude CLI assistant message */ @@ -84,7 +111,7 @@ export function cliResultToOpenai( index: 0, message: { role: "assistant", - content: result.result, + content: stripCodeFences(result.result), }, finish_reason: "stop", }, diff --git a/src/adapter/openai-to-cli.ts b/src/adapter/openai-to-cli.ts index c8ecaa1..1fae8cf 100644 --- a/src/adapter/openai-to-cli.ts +++ b/src/adapter/openai-to-cli.ts @@ -1,13 +1,52 @@ /** * Converts OpenAI chat request format to Claude CLI input + * + * Supports two modes: + * - Text-only: prompt passed as CLI argument (legacy) + * - Stream-JSON: NDJSON piped to stdin with full multimodal support (images) */ -import type { OpenAIChatRequest } from "../types/openai.js"; +import type { OpenAIChatRequest, OpenAIChatContentPart } from "../types/openai.js"; export type ClaudeModel = "opus" | "sonnet" | "haiku"; +/** + * Claude CLI stream-json content block types + */ +interface CliTextContent { + type: "text"; + text: string; +} + +interface CliImageContent { + type: "image"; + source: { + type: "base64"; + media_type: string; + data: string; + }; +} + +type CliContentBlock = CliTextContent | CliImageContent; + +/** + * NDJSON message format for Claude CLI --input-format stream-json + */ +export interface CliStreamMessage { + type: "user"; + message: { + role: "user"; + content: CliContentBlock[]; + }; +} + export interface CliInput { + /** Single prompt string (legacy text-only mode) */ prompt: string; + /** NDJSON lines for stdin piping (stream-json mode with image support) */ + stdinMessages: string[]; + /** Whether the request contains images and needs stream-json mode */ + hasImages: boolean; model: ClaudeModel; sessionId?: string; } @@ -31,45 +70,146 @@ const MODEL_MAP: Record = { * Extract Claude model alias from request model string */ export function extractModel(model: string): ClaudeModel { - // Try direct lookup if (MODEL_MAP[model]) { return MODEL_MAP[model]; } - - // Try stripping provider prefix const stripped = model.replace(/^claude-code-cli\//, ""); if (MODEL_MAP[stripped]) { return MODEL_MAP[stripped]; } - - // Default to opus (Claude Max subscription) return "opus"; } /** - * Convert OpenAI messages array to a single prompt string for Claude CLI + * Check if any message in the request contains images + */ +function requestHasImages(messages: OpenAIChatRequest["messages"]): boolean { + return messages.some((msg) => { + if (typeof msg.content === "string") return false; + return msg.content.some((part) => part.type === "image_url"); + }); +} + +/** + * Extract text from content (string or array of content parts). + */ +function extractText(content: string | OpenAIChatContentPart[]): string { + if (typeof content === "string") { + return content; + } + return content + .filter((part) => part.type === "text" && part.text) + .map((part) => part.text!) + .join("\n"); +} + +/** + * Convert an OpenAI image_url to a Claude CLI base64 image block. + * Handles data URIs (data:image/png;base64,...) and passes through the data. + */ +function convertImagePart(part: OpenAIChatContentPart): CliImageContent | null { + if (part.type !== "image_url" || !part.image_url) return null; + + const url = part.image_url.url; + + // Parse data URI: data:image/png;base64,iVBOR... + const match = url.match(/^data:(image\/[a-z+]+);base64,(.+)$/i); + if (!match) { + // Non-data-URI images (http URLs) are not supported by CLI + console.error("[openai-to-cli] Skipping non-data-URI image:", url.slice(0, 60)); + return null; + } + + return { + type: "image", + source: { + type: "base64", + media_type: match[1], + data: match[2], + }, + }; +} + +/** + * Convert OpenAI content parts to Claude CLI content blocks + */ +function convertContentParts(content: string | OpenAIChatContentPart[]): CliContentBlock[] { + if (typeof content === "string") { + return [{ type: "text", text: content }]; + } + + const blocks: CliContentBlock[] = []; + for (const part of content) { + if (part.type === "text" && part.text) { + blocks.push({ type: "text", text: part.text }); + } else if (part.type === "image_url") { + const img = convertImagePart(part); + if (img) blocks.push(img); + } + } + return blocks; +} + +/** + * Convert OpenAI messages to Claude CLI stream-json NDJSON lines. * - * Claude Code CLI in --print mode expects a single prompt, not a conversation. - * We format the messages into a readable format that preserves context. + * Claude CLI stream-json only accepts "user" role messages. + * System and assistant messages are inlined as tagged text blocks + * within the user message content. + */ +function messagesToStreamJson(messages: OpenAIChatRequest["messages"]): string[] { + // Collect all content blocks into a single user message + // (Claude CLI stream-json expects user-role messages only) + const allBlocks: CliContentBlock[] = []; + + for (const msg of messages) { + switch (msg.role) { + case "system": { + const text = extractText(msg.content); + allBlocks.push({ type: "text", text: `\n${text}\n` }); + break; + } + case "assistant": { + const text = extractText(msg.content); + allBlocks.push({ type: "text", text: `\n${text}\n` }); + break; + } + case "user": { + const blocks = convertContentParts(msg.content); + allBlocks.push(...blocks); + break; + } + } + } + + const stdinMsg: CliStreamMessage = { + type: "user", + message: { + role: "user", + content: allBlocks, + }, + }; + + return [JSON.stringify(stdinMsg)]; +} + +/** + * Convert OpenAI messages to a single prompt string (legacy text-only mode) */ export function messagesToPrompt(messages: OpenAIChatRequest["messages"]): string { const parts: string[] = []; for (const msg of messages) { + const text = extractText(msg.content); switch (msg.role) { case "system": - // System messages become context instructions - parts.push(`\n${msg.content}\n\n`); + parts.push(`\n${text}\n\n`); break; - case "user": - // User messages are the main prompt - parts.push(msg.content); + parts.push(text); break; - case "assistant": - // Previous assistant responses for context - parts.push(`\n${msg.content}\n\n`); + parts.push(`\n${text}\n\n`); break; } } @@ -78,12 +218,17 @@ export function messagesToPrompt(messages: OpenAIChatRequest["messages"]): strin } /** - * Convert OpenAI chat request to CLI input format + * Convert OpenAI chat request to CLI input format. + * Automatically chooses stream-json mode when images are present. */ export function openaiToCli(request: OpenAIChatRequest): CliInput { + const hasImages = requestHasImages(request.messages); + return { prompt: messagesToPrompt(request.messages), + stdinMessages: messagesToStreamJson(request.messages), + hasImages, model: extractModel(request.model), - sessionId: request.user, // Use OpenAI's user field for session mapping + sessionId: request.user, }; } diff --git a/src/server/index.ts b/src/server/index.ts index de8b73d..bafdbd5 100644 --- a/src/server/index.ts +++ b/src/server/index.ts @@ -22,7 +22,7 @@ function createApp(): Express { const app = express(); // Middleware - app.use(express.json({ limit: "10mb" })); + app.use(express.json({ limit: "50mb" })); // Large limit for base64 images // Request logging (debug mode) app.use((req: Request, _res: Response, next: NextFunction) => { diff --git a/src/server/routes.ts b/src/server/routes.ts index ffe2e5b..02583f2 100644 --- a/src/server/routes.ts +++ b/src/server/routes.ts @@ -179,6 +179,7 @@ async function handleStreamingResponse( subprocess.start(cliInput.prompt, { model: cliInput.model, sessionId: cliInput.sessionId, + stdinMessages: cliInput.hasImages ? cliInput.stdinMessages : undefined, }).catch((err) => { console.error("[Streaming] Subprocess start error:", err); reject(err); @@ -234,6 +235,7 @@ async function handleNonStreamingResponse( .start(cliInput.prompt, { model: cliInput.model, sessionId: cliInput.sessionId, + stdinMessages: cliInput.hasImages ? cliInput.stdinMessages : undefined, }) .catch((error) => { res.status(500).json({ diff --git a/src/subprocess/manager.ts b/src/subprocess/manager.ts index 6551a81..5bdd9af 100644 --- a/src/subprocess/manager.ts +++ b/src/subprocess/manager.ts @@ -43,17 +43,25 @@ export class ClaudeSubprocess extends EventEmitter { private isKilled: boolean = false; /** - * Start the Claude CLI subprocess with the given prompt + * Start the Claude CLI subprocess. + * + * Two modes: + * - Text mode (stdinMessages omitted): prompt passed as CLI argument + * - Stream-JSON mode (stdinMessages provided): messages piped via stdin (supports images) */ - async start(prompt: string, options: SubprocessOptions): Promise { - const args = this.buildArgs(prompt, options); + async start( + prompt: string, + options: SubprocessOptions & { stdinMessages?: string[] } + ): Promise { + const useStreamInput = !!options.stdinMessages?.length; + const args = this.buildArgs(useStreamInput ? null : prompt, options, useStreamInput); const timeout = options.timeout || DEFAULT_TIMEOUT; return new Promise((resolve, reject) => { try { // Use spawn() for security - no shell interpretation this.process = spawn("claude", args, { - cwd: options.cwd || process.cwd(), + cwd: options.cwd || "/tmp", // Use neutral dir to avoid loading CLAUDE.md env: { ...process.env }, stdio: ["pipe", "pipe", "pipe"], }); @@ -81,10 +89,18 @@ export class ClaudeSubprocess extends EventEmitter { } }); - // Close stdin since we pass prompt as argument + // Pipe messages via stdin or close it + if (useStreamInput && options.stdinMessages) { + for (const line of options.stdinMessages) { + this.process.stdin?.write(line + "\n"); + } + } this.process.stdin?.end(); - console.error(`[Subprocess] Process spawned with PID: ${this.process.pid}`); + console.error( + `[Subprocess] Process spawned with PID: ${this.process.pid}` + + ` (mode: ${useStreamInput ? "stream-json" : "text"})` + ); // Parse JSON stream from stdout this.process.stdout?.on("data", (chunk: Buffer) => { @@ -98,8 +114,6 @@ export class ClaudeSubprocess extends EventEmitter { this.process.stderr?.on("data", (chunk: Buffer) => { const errorText = chunk.toString().trim(); if (errorText) { - // Don't emit as error unless it's actually an error - // Claude CLI may write debug info to stderr console.error("[Subprocess stderr]:", errorText.slice(0, 200)); } }); @@ -108,14 +122,12 @@ export class ClaudeSubprocess extends EventEmitter { this.process.on("close", (code) => { console.error(`[Subprocess] Process closed with code: ${code}`); this.clearTimeout(); - // Process any remaining buffer if (this.buffer.trim()) { this.processBuffer(); } this.emit("close", code); }); - // Resolve immediately since we're streaming resolve(); } catch (err) { this.clearTimeout(); @@ -125,9 +137,16 @@ export class ClaudeSubprocess extends EventEmitter { } /** - * Build CLI arguments array + * Build CLI arguments array. + * @param prompt - Text prompt (null when using stream-json input) + * @param options - Subprocess options + * @param streamInput - Whether to use --input-format stream-json */ - private buildArgs(prompt: string, options: SubprocessOptions): string[] { + private buildArgs( + prompt: string | null, + options: SubprocessOptions, + streamInput: boolean + ): string[] { const args = [ "--print", // Non-interactive mode "--output-format", @@ -137,9 +156,18 @@ export class ClaudeSubprocess extends EventEmitter { "--model", options.model, // Model alias (opus/sonnet/haiku) "--no-session-persistence", // Don't save sessions - prompt, // Pass prompt as argument (more reliable than stdin) + "--tools", "", // Disable all tools — act as a plain chat model + "--disable-slash-commands", // Disable skills + "--setting-sources", "", // Don't load any settings files (CLAUDE.md etc.) + "--system-prompt", "You are a helpful assistant. Follow the user's instructions precisely. When asked to output JSON, output ONLY raw JSON without markdown code fences.", ]; + if (streamInput) { + args.push("--input-format", "stream-json"); + } else if (prompt) { + args.push(prompt); + } + if (options.sessionId) { args.push("--session-id", options.sessionId); } diff --git a/src/types/openai.ts b/src/types/openai.ts index c116658..7455fa1 100644 --- a/src/types/openai.ts +++ b/src/types/openai.ts @@ -3,9 +3,15 @@ * Used for Clawdbot integration */ +export interface OpenAIChatContentPart { + type: "text" | "image_url"; + text?: string; + image_url?: { url: string; detail?: string }; +} + export interface OpenAIChatMessage { role: "system" | "user" | "assistant"; - content: string; + content: string | OpenAIChatContentPart[]; } export interface OpenAIChatRequest {