From 1d908bd3732a1188b1d2fe965a283d6aa8fa6682 Mon Sep 17 00:00:00 2001
From: OpenClaw <openclaw@192.168.254.102>
Date: Mon, 9 Feb 2026 23:24:55 +0800
Subject: [PATCH] feat: add vision/image support via stream-json stdin + CLI
 isolation

Adds multimodal (text + image) support for OpenAI-compatible clients
like Browser Use that send base64 screenshots via the chat completions
endpoint.

Changes:
- Support OpenAI content arrays with text and image_url parts
- Auto-detect images and switch to --input-format stream-json mode
  (text-only requests still use the fast CLI argument path)
- Convert data URI images to Claude CLI base64 format via stdin piping
- Unwrap fenced or embedded JSON in non-streaming responses (Claude tends
  to wrap JSON in code fences); streamed chunks are not changed by this patch
- Isolate CLI subprocess: --tools "", --disable-slash-commands,
  --setting-sources "", --system-prompt override, cwd /tmp
- Bump body limit to 50mb for base64 image payloads

Tested with Browser Use v0.11.9 running vision-enabled browser
automation tasks (screenshots piped as base64 PNG).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/adapter/cli-to-openai.ts |  29 +++++-
 src/adapter/openai-to-cli.ts | 183 +++++++++++++++++++++++++++++++----
 src/server/index.ts          |   2 +-
 src/server/routes.ts         |   2 +
 src/subprocess/manager.ts    |  54 ++++++++---
 src/types/openai.ts          |   8 +-
 6 files changed, 243 insertions(+), 35 deletions(-)

diff --git a/src/adapter/cli-to-openai.ts b/src/adapter/cli-to-openai.ts
index 1e43eab..20d6efc 100644
--- a/src/adapter/cli-to-openai.ts
+++ b/src/adapter/cli-to-openai.ts
@@ -5,6 +5,33 @@
 import type { ClaudeCliAssistant, ClaudeCliResult } from "../types/claude-cli.js";
 import type { OpenAIChatResponse, OpenAIChatChunk } from "../types/openai.js";
 
+/**
+ * Unwrap JSON that the model embedded in a longer response. Tried in order:
+ * the last fenced block whose body parses to a JSON object or array, then a
+ * raw JSON object/array surrounded by prose, then the text unchanged.
+ * Fenced blocks that are not JSON (ordinary code samples) are left in place
+ * so that normal chat answers are not truncated to a single snippet.
+ */
+function stripCodeFences(text: string): string {
+  const isJsonContainer = (candidate: string): boolean => {
+    try {
+      const parsed: unknown = JSON.parse(candidate);
+      return typeof parsed === "object" && parsed !== null;
+    } catch {
+      return false; // not JSON: caller keeps looking or returns the original
+    }
+  };
+
+  // Optional language tag; LF or CRLF line endings around the body.
+  const fenced = [...text.matchAll(/```[\w-]*[ \t]*\r?\n([\s\S]*?)\r?\n[ \t]*```/g)]
+    .map((match) => match[1] ?? "")
+    .filter(isJsonContainer);
+  if (fenced.length > 0) return fenced[fenced.length - 1] ?? text;
+
+  const raw = /\{[\s\S]*\}|\[[\s\S]*\]/.exec(text)?.[0];
+  return raw !== undefined && isJsonContainer(raw) ? raw : text;
+}
+
 /**
  * Extract text content from Claude CLI assistant message
  */
@@ -84,7 +111,7 @@ export function cliResultToOpenai(
         index: 0,
         message: {
           role: "assistant",
-          content: result.result,
+          content: stripCodeFences(result.result),
         },
         finish_reason: "stop",
       },
diff --git a/src/adapter/openai-to-cli.ts b/src/adapter/openai-to-cli.ts
index c8ecaa1..1fae8cf 100644
--- a/src/adapter/openai-to-cli.ts
+++ b/src/adapter/openai-to-cli.ts
@@ -1,13 +1,52 @@
 /**
  * Converts OpenAI chat request format to Claude CLI input
+ *
+ * Supports two modes:
+ * - Text-only: prompt passed as CLI argument (legacy)
+ * - Stream-JSON: NDJSON piped to stdin with full multimodal support (images)
  */
 
-import type { OpenAIChatRequest } from "../types/openai.js";
+import type { OpenAIChatRequest, OpenAIChatContentPart } from "../types/openai.js";
 
 export type ClaudeModel = "opus" | "sonnet" | "haiku";
 
+/**
+ * Content block shapes used in Claude CLI stream-json input messages.
+ */
+interface CliTextContent {
+  type: "text"; // discriminant: marks this block as plain text
+  text: string;
+}
+
+interface CliImageContent {
+  type: "image"; // discriminant: marks this block as an inline image
+  source: {
+    type: "base64"; // the only source encoding this adapter produces
+    media_type: string; // MIME type parsed from the data URI, e.g. "image/png"
+    data: string; // base64 payload: the data URI text after "base64,"
+  };
+}
+
+type CliContentBlock = CliTextContent | CliImageContent; // narrow on `.type`
+
+/**
+ * One NDJSON line (before JSON.stringify) for Claude CLI --input-format stream-json
+ */
+export interface CliStreamMessage {
+  type: "user"; // envelope kind; "user" is the only one modelled here
+  message: {
+    role: "user";
+    content: CliContentBlock[]; // text/image blocks in conversation order
+  };
+}
+
 export interface CliInput {
+  /** Flattened text prompt (CLI argument in text-only mode); built for every request */
   prompt: string;
+  /** NDJSON lines for stdin in stream-json mode; the routes send them only when hasImages */
+  stdinMessages: string[];
+  /** True when any message contains an image_url part */
+  hasImages: boolean;
   model: ClaudeModel;
   sessionId?: string;
 }
@@ -31,45 +70,146 @@ const MODEL_MAP: Record<string, ClaudeModel> = {
  * Extract Claude model alias from request model string
  */
 export function extractModel(model: string): ClaudeModel {
-  // Try direct lookup
   if (MODEL_MAP[model]) {
     return MODEL_MAP[model];
   }
-
-  // Try stripping provider prefix
   const stripped = model.replace(/^claude-code-cli\//, "");
   if (MODEL_MAP[stripped]) {
     return MODEL_MAP[stripped];
   }
-
-  // Default to opus (Claude Max subscription)
   return "opus";
 }
 
 /**
- * Convert OpenAI messages array to a single prompt string for Claude CLI
+ * Check if any message in the request contains images
+ */
+function requestHasImages(messages: OpenAIChatRequest["messages"]): boolean {
+  // Request bodies are unvalidated JSON and OpenAI permits `content: null`,
+  // so test for an array instead of assuming "not a string" means array.
+  return messages.some(({ content }) =>
+    Array.isArray(content) && content.some((part) => part.type === "image_url"));
+}
+
+/**
+ * Flatten message content to plain text: a string is returned as-is, text
+ * parts are joined with newlines, image parts are ignored, anything else → "".
+ */
+function extractText(content: string | OpenAIChatContentPart[]): string {
+  if (typeof content === "string") return content;
+  // Unvalidated input: OpenAI allows `content: null` (e.g. tool-call turns).
+  if (!Array.isArray(content)) return "";
+  return content
+    .flatMap((part) => (part.type === "text" && part.text ? [part.text] : []))
+    .join("\n");
+}
+
+/**
+ * Convert an OpenAI image_url part into a Claude CLI base64 image block.
+ * Only base64 data URIs (data:image/<subtype>;base64,<payload>) convert;
+ * anything else (e.g. http URLs) is logged and yields null. Whitespace in
+ * the payload (line-wrapped base64) is stripped.
+ */
+function convertImagePart(part: OpenAIChatContentPart): CliImageContent | null {
+  if (part.type !== "image_url" || typeof part.image_url?.url !== "string") return null;
+
+  const url = part.image_url.url;
+  // Split at the first comma so the pattern only ever scans the short
+  // header, never the (possibly multi-megabyte) payload.
+  const comma = url.indexOf(",");
+  const header = /^data:(image\/[a-z0-9.+-]+);base64$/i.exec(url.slice(0, Math.max(comma, 0)));
+  const data = url.slice(comma + 1).replace(/\s+/g, "");
+  const mediaType = header?.[1];
+  if (!mediaType || !data) {
+    // Non-data-URI images (http URLs) are not supported by CLI
+    console.error("[openai-to-cli] Skipping non-data-URI image:", url.slice(0, 60));
+    return null;
+  }
+  return {
+    type: "image",
+    source: { type: "base64", media_type: mediaType, data },
+  };
+}
+
+/**
+ * Map OpenAI message content onto Claude CLI content blocks.
+ * A plain string becomes one text block; in an array, non-empty text parts
+ * become text blocks and image parts go through convertImagePart (parts it
+ * rejects are omitted).
+ */
+function convertContentParts(content: string | OpenAIChatContentPart[]): CliContentBlock[] {
+  if (typeof content === "string") {
+    return [{ type: "text", text: content }];
+  }
+
+  return content.flatMap((part): CliContentBlock[] => {
+    if (part.type === "text") {
+      return part.text ? [{ type: "text", text: part.text }] : [];
+    }
+    const image = part.type === "image_url" ? convertImagePart(part) : null;
+    return image ? [image] : [];
+  });
+}
+
+/**
+ * Convert OpenAI messages to Claude CLI stream-json NDJSON lines.
  *
- * Claude Code CLI in --print mode expects a single prompt, not a conversation.
- * We format the messages into a readable format that preserves context.
+ * Claude CLI stream-json only accepts "user" role messages.
+ * System and assistant messages are inlined as tagged text blocks
+ * within the user message content.
+ */
+function messagesToStreamJson(messages: OpenAIChatRequest["messages"]): string[] {
+  // Per this module's notes, stream-json input takes user-role messages
+  // only, so the whole conversation is folded, in order, into the content
+  // of one user message.
+  const content: CliContentBlock[] = [];
+  for (const { role, content: raw } of messages) {
+    if (role === "system") {
+      // System text is inlined as a tagged text block.
+      content.push({
+        type: "text",
+        text: `<system>\n${extractText(raw)}\n</system>`,
+      });
+      continue;
+    }
+    if (role === "assistant") {
+      // Earlier model turns are inlined the same way under their own tag.
+      content.push({
+        type: "text",
+        text: `<previous_response>\n${extractText(raw)}\n</previous_response>`,
+      });
+      continue;
+    }
+    if (role === "user") {
+      // User turns keep their structure (text and image blocks).
+      content.push(...convertContentParts(raw));
+    }
+    // Messages with any other role contribute nothing.
+  }
+
+  const envelope: CliStreamMessage = {
+    type: "user",
+    message: { role: "user", content },
+  };
+  return [JSON.stringify(envelope)];
+}
+
+/**
+ * Convert OpenAI messages to a single prompt string (legacy text-only mode)
  */
 export function messagesToPrompt(messages: OpenAIChatRequest["messages"]): string {
   const parts: string[] = [];
 
   for (const msg of messages) {
+    const text = extractText(msg.content);
     switch (msg.role) {
       case "system":
-        // System messages become context instructions
-        parts.push(`<system>\n${msg.content}\n</system>\n`);
+        parts.push(`<system>\n${text}\n</system>\n`);
         break;
-
       case "user":
-        // User messages are the main prompt
-        parts.push(msg.content);
+        parts.push(text);
         break;
-
       case "assistant":
-        // Previous assistant responses for context
-        parts.push(`<previous_response>\n${msg.content}\n</previous_response>\n`);
+        parts.push(`<previous_response>\n${text}\n</previous_response>\n`);
         break;
     }
   }
@@ -78,12 +218,17 @@ export function messagesToPrompt(messages: OpenAIChatRequest["messages"]): strin
 }
 
 /**
- * Convert OpenAI chat request to CLI input format
+ * Convert OpenAI chat request to CLI input format.
+ * Builds both the text prompt and the stream-json lines; hasImages tells callers which to send.
  */
 export function openaiToCli(request: OpenAIChatRequest): CliInput {
+  const hasImages = requestHasImages(request.messages);
+
   return {
     prompt: messagesToPrompt(request.messages),
+    stdinMessages: messagesToStreamJson(request.messages),
+    hasImages,
     model: extractModel(request.model),
-    sessionId: request.user, // Use OpenAI's user field for session mapping
+    sessionId: request.user, // OpenAI "user" field is reused as the session id
   };
 }
diff --git a/src/server/index.ts b/src/server/index.ts
index de8b73d..bafdbd5 100644
--- a/src/server/index.ts
+++ b/src/server/index.ts
@@ -22,7 +22,7 @@ function createApp(): Express {
   const app = express();
 
   // Middleware
-  app.use(express.json({ limit: "10mb" }));
+  app.use(express.json({ limit: "50mb" })); // Large limit for base64 images
 
   // Request logging (debug mode)
   app.use((req: Request, _res: Response, next: NextFunction) => {
diff --git a/src/server/routes.ts b/src/server/routes.ts
index ffe2e5b..02583f2 100644
--- a/src/server/routes.ts
+++ b/src/server/routes.ts
@@ -179,6 +179,7 @@ async function handleStreamingResponse(
     subprocess.start(cliInput.prompt, {
       model: cliInput.model,
       sessionId: cliInput.sessionId,
+      stdinMessages: cliInput.hasImages ? cliInput.stdinMessages : undefined,
     }).catch((err) => {
       console.error("[Streaming] Subprocess start error:", err);
       reject(err);
@@ -234,6 +235,7 @@ async function handleNonStreamingResponse(
       .start(cliInput.prompt, {
         model: cliInput.model,
         sessionId: cliInput.sessionId,
+        stdinMessages: cliInput.hasImages ? cliInput.stdinMessages : undefined,
       })
       .catch((error) => {
         res.status(500).json({
diff --git a/src/subprocess/manager.ts b/src/subprocess/manager.ts
index 6551a81..5bdd9af 100644
--- a/src/subprocess/manager.ts
+++ b/src/subprocess/manager.ts
@@ -43,17 +43,25 @@ export class ClaudeSubprocess extends EventEmitter {
   private isKilled: boolean = false;
 
   /**
-   * Start the Claude CLI subprocess with the given prompt
+   * Start the Claude CLI subprocess.
+   *
+   * Two modes:
+   * - Text mode (stdinMessages omitted): prompt passed as CLI argument
+   * - Stream-JSON mode (stdinMessages provided): messages piped via stdin (supports images)
    */
-  async start(prompt: string, options: SubprocessOptions): Promise<void> {
-    const args = this.buildArgs(prompt, options);
+  async start(
+    prompt: string,
+    options: SubprocessOptions & { stdinMessages?: string[] }
+  ): Promise<void> {
+    const useStreamInput = !!options.stdinMessages?.length;
+    const args = this.buildArgs(useStreamInput ? null : prompt, options, useStreamInput);
     const timeout = options.timeout || DEFAULT_TIMEOUT;
 
     return new Promise((resolve, reject) => {
       try {
         // Use spawn() for security - no shell interpretation
         this.process = spawn("claude", args, {
-          cwd: options.cwd || process.cwd(),
+          cwd: options.cwd || "/tmp", // Use neutral dir to avoid loading CLAUDE.md
           env: { ...process.env },
           stdio: ["pipe", "pipe", "pipe"],
         });
@@ -81,10 +89,18 @@ export class ClaudeSubprocess extends EventEmitter {
           }
         });
 
-        // Close stdin since we pass prompt as argument
+        // Pipe messages via stdin or close it
+        if (useStreamInput && options.stdinMessages) {
+          for (const line of options.stdinMessages) {
+            this.process.stdin?.write(line + "\n");
+          }
+        }
         this.process.stdin?.end();
 
-        console.error(`[Subprocess] Process spawned with PID: ${this.process.pid}`);
+        console.error(
+          `[Subprocess] Process spawned with PID: ${this.process.pid}` +
+          ` (mode: ${useStreamInput ? "stream-json" : "text"})`
+        );
 
         // Parse JSON stream from stdout
         this.process.stdout?.on("data", (chunk: Buffer) => {
@@ -98,8 +114,6 @@ export class ClaudeSubprocess extends EventEmitter {
         this.process.stderr?.on("data", (chunk: Buffer) => {
           const errorText = chunk.toString().trim();
           if (errorText) {
-            // Don't emit as error unless it's actually an error
-            // Claude CLI may write debug info to stderr
             console.error("[Subprocess stderr]:", errorText.slice(0, 200));
           }
         });
@@ -108,14 +122,12 @@ export class ClaudeSubprocess extends EventEmitter {
         this.process.on("close", (code) => {
           console.error(`[Subprocess] Process closed with code: ${code}`);
           this.clearTimeout();
-          // Process any remaining buffer
           if (this.buffer.trim()) {
             this.processBuffer();
           }
           this.emit("close", code);
         });
 
-        // Resolve immediately since we're streaming
         resolve();
       } catch (err) {
         this.clearTimeout();
@@ -125,9 +137,16 @@ export class ClaudeSubprocess extends EventEmitter {
   }
 
   /**
-   * Build CLI arguments array
+   * Build CLI arguments array.
+   * @param prompt - Text prompt (null when using stream-json input)
+   * @param options - Subprocess options
+   * @param streamInput - Whether to use --input-format stream-json
    */
-  private buildArgs(prompt: string, options: SubprocessOptions): string[] {
+  private buildArgs(
+    prompt: string | null,
+    options: SubprocessOptions,
+    streamInput: boolean
+  ): string[] {
     const args = [
       "--print", // Non-interactive mode
       "--output-format",
@@ -137,9 +156,18 @@ export class ClaudeSubprocess extends EventEmitter {
       "--model",
       options.model, // Model alias (opus/sonnet/haiku)
       "--no-session-persistence", // Don't save sessions
-      prompt, // Pass prompt as argument (more reliable than stdin)
+      "--tools", "", // Disable all tools — act as a plain chat model
+      "--disable-slash-commands", // Disable skills
+      "--setting-sources", "", // Don't load any settings files (CLAUDE.md etc.)
+      "--system-prompt", "You are a helpful assistant. Follow the user's instructions precisely. When asked to output JSON, output ONLY raw JSON without markdown code fences.",
     ];
 
+    if (streamInput) {
+      args.push("--input-format", "stream-json");
+    } else if (prompt) {
+      args.push(prompt);
+    }
+
     if (options.sessionId) {
       args.push("--session-id", options.sessionId);
     }
diff --git a/src/types/openai.ts b/src/types/openai.ts
index c116658..7455fa1 100644
--- a/src/types/openai.ts
+++ b/src/types/openai.ts
@@ -3,9 +3,15 @@
  * Used for Clawdbot integration
  */
 
+export interface OpenAIChatContentPart {
+  type: "text" | "image_url"; // selects which optional field below is meaningful
+  text?: string; // read when type is "text"
+  image_url?: { url: string; detail?: string }; // read when type is "image_url"
+}
+
 export interface OpenAIChatMessage {
   role: "system" | "user" | "assistant";
-  content: string;
+  content: string | OpenAIChatContentPart[]; // plain text, or a list of text/image parts
 }
 
 export interface OpenAIChatRequest {