diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2719674..2b0c50d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,10 +103,12 @@ add_library(rcli STATIC
     src/engines/metalrt_engine.cpp
     src/engines/metalrt_stt_engine.cpp
     src/engines/metalrt_tts_engine.cpp
+    src/engines/metalrt_vlm_engine.cpp
     src/engines/vlm_engine.cpp
     src/audio/audio_io.cpp
     src/audio/mic_permission.mm
     src/audio/camera_capture.mm
+    src/audio/camera_preview.mm
     src/audio/screen_capture.mm
     src/pipeline/orchestrator.cpp
     src/pipeline/sentence_detector.cpp
@@ -140,7 +142,7 @@ add_library(rcli STATIC
     src/api/rcli_api.cpp
 )
 
-set_source_files_properties(src/audio/mic_permission.mm src/audio/camera_capture.mm src/audio/screen_capture.mm
+set_source_files_properties(src/audio/mic_permission.mm src/audio/camera_capture.mm src/audio/camera_preview.mm src/audio/screen_capture.mm
     PROPERTIES LANGUAGE CXX)
 
 target_include_directories(rcli PUBLIC
@@ -221,6 +223,32 @@ set_target_properties(rcli_overlay PROPERTIES
     RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
 )
 
+# =============================================================================
+# rcli_camera_preview — standalone Cocoa helper for live camera preview window
+# =============================================================================
+add_executable(rcli_camera_preview
+    src/audio/rcli_camera_preview.m
+)
+
+set_source_files_properties(src/audio/rcli_camera_preview.m PROPERTIES LANGUAGE CXX)
+
+target_compile_options(rcli_camera_preview PRIVATE -x objective-c++)
+
+target_link_libraries(rcli_camera_preview PRIVATE
+    "-framework AppKit"
+    "-framework AVFoundation"
+    "-framework CoreMedia"
+    "-framework CoreVideo"
+    "-framework CoreImage"
+    "-framework ImageIO"
+    "-framework UniformTypeIdentifiers"
+)
+
+set_target_properties(rcli_camera_preview PROPERTIES
+    OUTPUT_NAME "rcli_camera_preview"
+    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+)
+
 # =============================================================================
 # rcli_test — test executable
 # =============================================================================
diff --git a/Formula/rcli.rb b/Formula/rcli.rb
index 7937734..aad98ef 100644
--- a/Formula/rcli.rb
+++ b/Formula/rcli.rb
@@ -12,6 +12,7 @@ class Rcli < Formula
   def install
     bin.install "bin/rcli"
     bin.install "bin/rcli_overlay" if File.exist? "bin/rcli_overlay"
+    bin.install "bin/rcli_camera_preview" if File.exist? "bin/rcli_camera_preview"
     lib.install Dir["lib/*.dylib"]
   end
 
diff --git a/install.sh b/install.sh
index bd5880b..12ca047 100755
--- a/install.sh
+++ b/install.sh
@@ -70,6 +70,7 @@ else
     mkdir -p "$CELLAR/bin" "$CELLAR/lib" 2>/dev/null || sudo mkdir -p "$CELLAR/bin" "$CELLAR/lib"
     cp "$WORKDIR"/rcli-*/bin/rcli "$CELLAR/bin/" 2>/dev/null || sudo cp "$WORKDIR"/rcli-*/bin/rcli "$CELLAR/bin/"
     cp "$WORKDIR"/rcli-*/bin/rcli_overlay "$CELLAR/bin/" 2>/dev/null || sudo cp "$WORKDIR"/rcli-*/bin/rcli_overlay "$CELLAR/bin/" 2>/dev/null || true
+    cp "$WORKDIR"/rcli-*/bin/rcli_camera_preview "$CELLAR/bin/" 2>/dev/null || sudo cp "$WORKDIR"/rcli-*/bin/rcli_camera_preview "$CELLAR/bin/" 2>/dev/null || true
     cp "$WORKDIR"/rcli-*/lib/*.dylib "$CELLAR/lib/" 2>/dev/null || sudo cp "$WORKDIR"/rcli-*/lib/*.dylib "$CELLAR/lib/"
 
     brew link --overwrite "$FORMULA" 2>/dev/null || sudo brew link --overwrite "$FORMULA"
diff --git a/scripts/package.sh b/scripts/package.sh
index e238737..1f29c9f 100755
--- a/scripts/package.sh
+++ b/scripts/package.sh
@@ -27,6 +27,10 @@ if [ -f "$BUILD_DIR/rcli_overlay" ]; then
     cp "$BUILD_DIR/rcli_overlay" "$DIST_DIR/bin/rcli_overlay"
     echo "  + bin/rcli_overlay"
 fi
+if [ -f "$BUILD_DIR/rcli_camera_preview" ]; then
+    cp "$BUILD_DIR/rcli_camera_preview" "$DIST_DIR/bin/rcli_camera_preview"
+    echo "  + bin/rcli_camera_preview"
+fi
 
 # --- Collect dylibs ---
 DYLIBS=(
@@ -155,6 +159,9 @@ codesign --force --sign - "$BINARY"
 if [ -f "$DIST_DIR/bin/rcli_overlay" ]; then
     codesign --force --sign - "$DIST_DIR/bin/rcli_overlay"
 fi
+if [ -f "$DIST_DIR/bin/rcli_camera_preview" ]; then
+    codesign --force --sign - "$DIST_DIR/bin/rcli_camera_preview"
+fi
 for lib in "$DIST_DIR/lib/"*.dylib; do
     codesign --force --sign - "$lib"
 done
diff --git a/src/api/rcli_api.cpp b/src/api/rcli_api.cpp
index f292c78..aaf2d1d 100644
--- a/src/api/rcli_api.cpp
+++ b/src/api/rcli_api.cpp
@@ -41,6 +41,7 @@ extern char** environ;
 #include "actions/action_registry.h"
 #include "actions/macos_actions.h"
 #include "engines/vlm_engine.h"
+#include "engines/metalrt_vlm_engine.h"
 #include "models/vlm_model_registry.h"
 
 using namespace rastack;
@@ -117,10 +118,12 @@ struct RCLIEngine {
     int ctx_main_prompt_tokens = 0;
 
     // VLM (Vision Language Model) subsystem
-    VlmEngine vlm_engine;
+    VlmEngine vlm_engine;                 // llama.cpp backend
+    MetalRTVlmEngine metalrt_vlm_engine;  // MetalRT backend
     bool vlm_initialized = false;
+    bool vlm_use_metalrt = false;         // which backend is active
     std::string last_vlm_response;
-    std::string vlm_backend_name;         // "llama.cpp (Metal GPU)" or "MetalRT"
+    std::string vlm_backend_name;         // "llama.cpp (Metal GPU)" or "MetalRT (Metal GPU)"
     std::string vlm_model_name;           // e.g. "Qwen3 VL 2B"
 
     std::mutex mutex;
@@ -1065,8 +1068,7 @@ static std::string handle_screen_intent(RCLIEngine* engine, const std::string& u
     if (!engine->vlm_initialized) {
         if (vlm_init_locked(engine) != 0) {
             return "I can see you're asking about your screen, but VLM isn't available. "
-                   "It requires the llama.cpp engine and a VLM model. "
-                   "Switch with: rcli engine llamacpp, then download a model: rcli models vlm";
+                   "Download a VLM model with: rcli models vlm";
         }
     }
 
@@ -1076,7 +1078,12 @@ static std::string handle_screen_intent(RCLIEngine* engine, const std::string& u
         vlm_prompt = "Describe what you see on this screen in detail.";
     }
 
-    std::string result = engine->vlm_engine.analyze_image(path, vlm_prompt, nullptr);
+    std::string result;
+    if (engine->vlm_use_metalrt) {
+        result = engine->metalrt_vlm_engine.analyze_image(path, vlm_prompt);
+    } else {
+        result = engine->vlm_engine.analyze_image(path, vlm_prompt, nullptr);
+    }
 
     if (result.empty()) {
         return "I captured your screen but the analysis failed. Please try again.";
@@ -2983,8 +2990,47 @@ static bool safe_download(const std::string& url, const std::string& dest) {
     return WIFEXITED(status) && WEXITSTATUS(status) == 0;
 }
 
+// For each known MLX VLM repo (in priority order), look at the newest snapshot in the
+// HuggingFace hub cache and return its directory if it holds model.safetensors, else "".
+static std::string find_metalrt_vlm_model_dir() {
+    const char* home = getenv("HOME");
+    if (!home) return "";
+
+    static const char* hf_repos[] = {
+        "models--mlx-community--Qwen3-VL-2B-Instruct-4bit",
+        "models--mlx-community--LFM2.5-VL-1.6B-MLX-6bit",
+    };
+
+    struct stat st;
+    std::string hf_base = std::string(home) + "/.cache/huggingface/hub";
+
+    for (const char* repo : hf_repos) {
+        std::string snapshots_dir = hf_base + "/" + repo + "/snapshots";
+        if (stat(snapshots_dir.c_str(), &st) != 0) continue;
+        // The path is handed to /bin/sh by popen(): escape embedded single quotes so
+        // a quote in $HOME cannot close the quoting and inject shell syntax.
+        std::string quoted;
+        for (char c : snapshots_dir) { if (c == '\'') quoted += "'\\''"; else quoted += c; }
+        FILE* p = popen(("ls -1t '" + quoted + "' 2>/dev/null | head -1").c_str(), "r");
+        if (!p) continue;
+        char buf[256];
+        const bool got_line = fgets(buf, sizeof(buf), p) != nullptr;
+        pclose(p);
+        if (!got_line) continue;
+        std::string snap(buf);
+        while (!snap.empty() && (snap.back() == '\n' || snap.back() == '\r'))
+            snap.pop_back();
+        if (snap.empty()) continue;
+        std::string model_dir = snapshots_dir + "/" + snap;
+        if (stat((model_dir + "/model.safetensors").c_str(), &st) == 0) {
+            LOG_DEBUG("VLM", "Found MetalRT VLM model at %s", model_dir.c_str());
+            return model_dir;
+        }
+    }
+    return "";
+}
+
 // Internal init (caller must hold engine->mutex)
-// VLM is only available on the llama.cpp engine. MetalRT VLM support coming soon.
 static int vlm_init_locked(RCLIEngine* engine) {
     if (engine->vlm_initialized) return 0;
 
@@ -2995,13 +3041,34 @@ static int vlm_init_locked(RCLIEngine* engine) {
             engine->models_dir = "./models";
     }
 
-    // VLM requires the llama.cpp engine
+    // --- Try MetalRT VLM backend first (when on MetalRT engine) ---
     if (engine->initialized && engine->pipeline.using_metalrt()) {
-        LOG_ERROR("VLM", "VLM is currently available with the llama.cpp engine. Switch with: rcli engine llamacpp");
-        return -1;
+        auto& loader = MetalRTLoader::instance();
+        if (loader.is_loaded() && loader.has_vision()) {
+            std::string model_dir = find_metalrt_vlm_model_dir();
+            if (!model_dir.empty()) {
+                MetalRTVlmConfig mrt_config;
+                mrt_config.model_dir = model_dir;
+                if (engine->metalrt_vlm_engine.init(mrt_config)) {
+                    engine->vlm_initialized = true;
+                    engine->vlm_use_metalrt = true;
+                    engine->vlm_backend_name = "MetalRT (Metal GPU)";
+                    engine->vlm_model_name = engine->metalrt_vlm_engine.model_name();
+                    if (engine->vlm_model_name.empty())
+                        engine->vlm_model_name = "Qwen3 VL 2B";
+                    LOG_INFO("VLM", "VLM engine ready — %s via MetalRT (Metal GPU)",
+                             engine->vlm_model_name.c_str());
+                    return 0;
+                }
+                LOG_WARN("VLM", "MetalRT VLM init failed, falling back to llama.cpp");
+            } else {
+                LOG_WARN("VLM", "No MetalRT VLM model found in HF cache, falling back to llama.cpp");
+            }
+        }
+        // Fall through to llama.cpp instead of hard-failing
     }
 
-    // Check if any VLM model is installed (on-demand, no auto-download)
+    // --- llama.cpp VLM backend ---
     auto vlm_models = rcli::all_vlm_models();
     rcli::VlmModelDef model_def;
     bool found = false;
@@ -3019,7 +3086,6 @@ static int vlm_init_locked(RCLIEngine* engine) {
         return -1;
     }
 
-    // Initialize VLM engine with the installed model
     VlmConfig config;
     config.model_path  = engine->models_dir + "/" + model_def.model_filename;
     config.mmproj_path = engine->models_dir + "/" + model_def.mmproj_filename;
@@ -3036,6 +3102,7 @@ static int vlm_init_locked(RCLIEngine* engine) {
     }
 
     engine->vlm_initialized = true;
+    engine->vlm_use_metalrt = false;
     engine->vlm_backend_name = "llama.cpp (Metal GPU)";
     engine->vlm_model_name = model_def.name;
     LOG_INFO("VLM", "VLM engine ready — %s via llama.cpp (Metal GPU)", model_def.name.c_str());
@@ -3056,7 +3123,7 @@ const char* rcli_vlm_analyze(RCLIHandle handle, const char* image_path, const ch
 
     if (!engine->vlm_initialized) {
         if (vlm_init_locked(engine) != 0) {
-            engine->last_vlm_response = "VLM not available. Requires llama.cpp engine (rcli engine llamacpp) and a VLM model (rcli models vlm).";
+            engine->last_vlm_response = "VLM not available. Download a VLM model with: rcli models vlm";
             return engine->last_vlm_response.c_str();
         }
     }
@@ -3065,16 +3132,17 @@ const char* rcli_vlm_analyze(RCLIHandle handle, const char* image_path, const ch
         ? std::string(prompt)
         : "Describe this image in detail.";
 
-    {
-        std::string result = engine->vlm_engine.analyze_image(
+    std::string result;
+    if (engine->vlm_use_metalrt) {
+        result = engine->metalrt_vlm_engine.analyze_image(
+            std::string(image_path), text_prompt);
+    } else {
+        result = engine->vlm_engine.analyze_image(
             std::string(image_path), text_prompt, nullptr);
-
-        if (result.empty()) {
-            engine->last_vlm_response = "Error: Failed to analyze image.";
-        } else {
-            engine->last_vlm_response = result;
-        }
     }
+
+    engine->last_vlm_response = result.empty()
+        ? "Error: Failed to analyze image." : result;
     return engine->last_vlm_response.c_str();
 }
 
@@ -3101,12 +3169,21 @@ int rcli_vlm_get_stats(RCLIHandle handle, RCLIVlmStats* out_stats) {
     auto* engine = static_cast<RCLIEngine*>(handle);
     if (!engine->vlm_initialized) return -1;
 
-    auto& s = engine->vlm_engine.last_stats();
-    out_stats->gen_tok_per_sec  = s.gen_tps();
-    out_stats->generated_tokens = static_cast<int>(s.generated_tokens);
-    out_stats->total_time_sec   = (s.image_encode_us + s.generation_us) / 1e6;
-    out_stats->image_encode_ms  = s.image_encode_us / 1000.0;
-    out_stats->first_token_ms   = s.first_token_us / 1000.0;
+    if (engine->vlm_use_metalrt) {
+        auto& s = engine->metalrt_vlm_engine.last_stats();
+        out_stats->gen_tok_per_sec  = s.tps;
+        out_stats->generated_tokens = s.generated_tokens;
+        out_stats->total_time_sec   = (s.vision_encode_ms + s.prefill_ms + s.decode_ms) / 1000.0;
+        out_stats->image_encode_ms  = s.vision_encode_ms;
+        out_stats->first_token_ms   = s.prefill_ms;
+    } else {
+        auto& s = engine->vlm_engine.last_stats();
+        out_stats->gen_tok_per_sec  = s.gen_tps();
+        out_stats->generated_tokens = static_cast<int>(s.generated_tokens);
+        out_stats->total_time_sec   = (s.image_encode_us + s.generation_us) / 1e6;
+        out_stats->image_encode_ms  = s.image_encode_us / 1000.0;
+        out_stats->first_token_ms   = s.first_token_us / 1000.0;
+    }
     return 0;
 }
 
@@ -3128,11 +3205,16 @@ int rcli_vlm_exit(RCLIHandle handle) {
     auto* engine = static_cast<RCLIEngine*>(handle);
     std::lock_guard<std::mutex> lock(engine->mutex);
 
-    if (engine->vlm_engine.is_initialized()) {
-        engine->vlm_engine.shutdown();
+    if (engine->vlm_use_metalrt) {
+        if (engine->metalrt_vlm_engine.is_initialized())
+            engine->metalrt_vlm_engine.shutdown();
+    } else {
+        if (engine->vlm_engine.is_initialized())
+            engine->vlm_engine.shutdown();
     }
 
     engine->vlm_initialized = false;
+    engine->vlm_use_metalrt = false;
     engine->vlm_backend_name.clear();
     engine->vlm_model_name.clear();
     LOG_INFO("VLM", "VLM unloaded");
@@ -3157,29 +3239,58 @@ int rcli_vlm_analyze_stream(RCLIHandle handle, const char* image_path,
     std::string text_prompt = (prompt && prompt[0])
         ? std::string(prompt) : "Describe this image in detail.";
 
-    // llama.cpp VLM streaming path
-    rastack::TokenCallback token_cb = nullptr;
-    if (callback) {
-        token_cb = [callback, user_data](const rastack::TokenOutput& tok) {
-            if (!tok.text.empty()) {
-                callback("token", tok.text.c_str(), user_data);
-            }
-        };
-    }
+    std::string result;
+
+    if (engine->vlm_use_metalrt) {
+        // MetalRT VLM streaming path
+        rastack::TokenCallback token_cb = nullptr;
+        if (callback) {
+            token_cb = [callback, user_data](const rastack::TokenOutput& tok) {
+                if (!tok.text.empty()) {
+                    callback("token", tok.text.c_str(), user_data);
+                }
+            };
+        }
 
-    std::string result = engine->vlm_engine.analyze_image(
-        std::string(image_path), text_prompt, token_cb);
+        result = engine->metalrt_vlm_engine.analyze_image_stream(
+            std::string(image_path), text_prompt, token_cb);
+
+        if (callback) {
+            auto& s = engine->metalrt_vlm_engine.last_stats();
+            char stats_buf[256];
+            snprintf(stats_buf, sizeof(stats_buf),
+                     "{\"tps\":%.1f,\"tokens\":%d,\"vision_encode_ms\":%.1f}",
+                     s.tps, s.generated_tokens, s.vision_encode_ms);
+            callback("stats", stats_buf, user_data);
+        }
+    } else {
+        // llama.cpp VLM streaming path
+        rastack::TokenCallback token_cb = nullptr;
+        if (callback) {
+            token_cb = [callback, user_data](const rastack::TokenOutput& tok) {
+                if (!tok.text.empty()) {
+                    callback("token", tok.text.c_str(), user_data);
+                }
+            };
+        }
+
+        result = engine->vlm_engine.analyze_image(
+            std::string(image_path), text_prompt, token_cb);
+
+        if (callback) {
+            auto& s = engine->vlm_engine.last_stats();
+            char stats_buf[256];
+            snprintf(stats_buf, sizeof(stats_buf),
+                     "{\"tps\":%.1f,\"tokens\":%lld,\"vision_encode_ms\":%.1f}",
+                     s.gen_tps(), s.generated_tokens, s.image_encode_us / 1000.0);
+            callback("stats", stats_buf, user_data);
+        }
+    }
 
     engine->last_vlm_response = result.empty() ? "Error: Failed to analyze image." : result;
 
     if (callback) {
         callback("response", engine->last_vlm_response.c_str(), user_data);
-        auto& s = engine->vlm_engine.last_stats();
-        char stats_buf[256];
-        snprintf(stats_buf, sizeof(stats_buf),
-                 "{\"tps\":%.1f,\"tokens\":%lld,\"vision_encode_ms\":%.1f}",
-                 s.gen_tps(), s.generated_tokens, s.image_encode_us / 1000.0);
-        callback("stats", stats_buf, user_data);
     }
 
     return engine->last_vlm_response.find("Error:") == 0 ? -1 : 0;
diff --git a/src/audio/camera_preview.h b/src/audio/camera_preview.h
new file mode 100644
index 0000000..fb822de
--- /dev/null
+++ b/src/audio/camera_preview.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Launch the camera preview window (floating PIP with live feed).
+// Returns 0 on success (including when the preview is already running), -1 on failure.
+int camera_preview_start(void);
+
+// Stop the camera preview window and clean up.
+void camera_preview_stop(void);
+
+// Returns 1 if the camera preview is currently running.
+int camera_preview_active(void);
+
+// Freeze the live feed and capture the current frame to a JPEG file.
+// Returns 0 on success, -1 on failure.
+int camera_preview_capture(const char* output_path);
+
+// Capture the current frame to a JPEG file WITHOUT freezing the live feed.
+// The camera keeps streaming. Ideal for auto-analysis loops.
+// Returns 0 on success, -1 on failure.
+int camera_preview_snap(const char* output_path);
+
+// Freeze the live feed (without capturing). Shows "FROZEN" badge.
+// Returns 0 on success, -1 on failure.
+int camera_preview_freeze(void);
+
+// Resume the live camera feed after a freeze.
+// Returns 0 on success, -1 on failure.
+int camera_preview_unfreeze(void);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/audio/camera_preview.mm b/src/audio/camera_preview.mm
new file mode 100644
index 0000000..ff7cb95
--- /dev/null
+++ b/src/audio/camera_preview.mm
@@ -0,0 +1,124 @@
+#include "camera_preview.h"
+
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <atomic>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <mach-o/dyld.h>
+
+static pid_t g_cam_pid = 0;
+static FILE *g_cam_stdin  = nullptr;
+static FILE *g_cam_stdout = nullptr;
+static std::atomic<bool> g_cam_active{false};
+
+// Resolve the helper binary: prefer an executable "rcli_camera_preview" located next
+// to the running executable; otherwise fall back to the bare program name.
+static std::string find_camera_preview_binary() {
+    static const char kHelperName[] = "rcli_camera_preview";
+    char exe_path[1024];
+    uint32_t capacity = sizeof(exe_path);
+    if (_NSGetExecutablePath(exe_path, &capacity) != 0) return kHelperName;
+    const std::string exe(exe_path);
+    const auto last_sep = exe.rfind('/');
+    if (last_sep == std::string::npos) return kHelperName;
+    const std::string sibling = exe.substr(0, last_sep + 1) + kHelperName;
+    return access(sibling.c_str(), X_OK) == 0 ? sibling : std::string(kHelperName);
+}
+
+// Send one command line to the helper and return its reply line ("" on any failure).
+static std::string cam_cmd(const char* cmd) {
+    if (!g_cam_stdin || !g_cam_stdout || !cmd) return "";
+    // The protocol is line-based: an embedded newline (e.g. inside a path) would be
+    // read by the helper as a second command and desynchronize the replies.
+    if (strchr(cmd, '\n')) return "";
+    // If the write fails the helper never saw the command, so no reply will come.
+    if (fprintf(g_cam_stdin, "%s\n", cmd) < 0 || fflush(g_cam_stdin) != 0) return "";
+    char buf[256] = {0};
+    if (!fgets(buf, sizeof(buf), g_cam_stdout)) return "";
+    buf[strcspn(buf, "\n")] = '\0';
+    return std::string(buf);
+}
+
+// Spawn the preview helper with piped stdin/stdout and wait for its first output
+// line (readiness handshake). Returns 0 if running or already started, else -1.
+int camera_preview_start(void) {
+    if (g_cam_pid > 0) return 0;
+    std::string binary = find_camera_preview_binary();
+    int pipe_in[2], pipe_out[2];
+    if (pipe(pipe_in) != 0) return -1;
+    if (pipe(pipe_out) != 0) { close(pipe_in[0]); close(pipe_in[1]); return -1; }
+    pid_t pid = fork();
+    if (pid < 0) {  // fork failed: release all four descriptors
+        close(pipe_in[0]); close(pipe_in[1]); close(pipe_out[0]); close(pipe_out[1]);
+        return -1;
+    }
+    if (pid == 0) {  // child: wire the pipes to stdin/stdout, silence stderr
+        close(pipe_in[1]); close(pipe_out[0]);
+        dup2(pipe_in[0], STDIN_FILENO); dup2(pipe_out[1], STDOUT_FILENO);
+        close(pipe_in[0]); close(pipe_out[1]);
+        int devnull = open("/dev/null", O_WRONLY);
+        if (devnull >= 0) { dup2(devnull, STDERR_FILENO); close(devnull); }
+        execlp(binary.c_str(), "rcli_camera_preview", nullptr);  // bare name -> PATH lookup
+        _exit(1);
+    }
+    close(pipe_in[0]); close(pipe_out[1]);
+    g_cam_stdin  = fdopen(pipe_in[1], "w");
+    g_cam_stdout = fdopen(pipe_out[0], "r");
+    if (!g_cam_stdin)  close(pipe_in[1]);   // fdopen failed: the raw fd is still ours
+    if (!g_cam_stdout) close(pipe_out[0]);
+    char buf[64] = {0};
+    const bool ready = g_cam_stdin && g_cam_stdout && fgets(buf, sizeof(buf), g_cam_stdout);
+    if (ready) { g_cam_pid = pid; g_cam_active.store(true); return 0; }
+    // No handshake: the helper is dead or broken, so do not write "quit" to it
+    // (SIGPIPE risk). Close the pipes, terminate it if still alive, and reap it.
+    if (g_cam_stdin)  { fclose(g_cam_stdin);  g_cam_stdin  = nullptr; }
+    if (g_cam_stdout) { fclose(g_cam_stdout); g_cam_stdout = nullptr; }
+    kill(pid, SIGTERM); waitpid(pid, nullptr, 0);
+    return -1;
+}
+
+// Ask the helper to quit, close both pipe streams, and reap the child process.
+void camera_preview_stop(void) {
+    if (g_cam_pid <= 0) return;
+    // Reap an already-dead helper first: "quit" on a reader-less pipe raises SIGPIPE.
+    const bool exited = waitpid(g_cam_pid, nullptr, WNOHANG) == g_cam_pid;
+    if (!exited) cam_cmd("quit");
+    if (g_cam_stdin)  { fclose(g_cam_stdin);  g_cam_stdin  = nullptr; }
+    if (g_cam_stdout) { fclose(g_cam_stdout); g_cam_stdout = nullptr; }
+    if (!exited) waitpid(g_cam_pid, nullptr, 0);
+    g_cam_pid = 0;
+    g_cam_active.store(false);
+}
+
+// 1 after a successful start handshake until camera_preview_stop(); the flag is
+// not updated if the helper process dies on its own.
+int camera_preview_active(void) { return g_cam_active.load() ? 1 : 0; }
+
+// Freeze the feed and save the frame as JPEG. Null/empty paths are rejected: null is UB
+// for std::string, and a bare "capture" line gets no reply from the helper (deadlock).
+int camera_preview_capture(const char* output_path) {
+    if (!g_cam_active.load() || !output_path || !*output_path) return -1;
+    return cam_cmd(("capture " + std::string(output_path)).c_str()) == "ok" ? 0 : -1;
+}
+
+// Save the current frame as JPEG without freezing. Null/empty paths are rejected: null is
+// UB for std::string, and a bare "snap" line gets no reply from the helper (deadlock).
+int camera_preview_snap(const char* output_path) {
+    if (!g_cam_active.load() || !output_path || !*output_path) return -1;
+    return cam_cmd(("snap " + std::string(output_path)).c_str()) == "ok" ? 0 : -1;
+}
+
+// Ask the helper to hold the current frame; 0 if it answers "ok", otherwise -1.
+int camera_preview_freeze(void) {
+    const bool running = g_cam_active.load();
+    return (running && cam_cmd("freeze") == "ok") ? 0 : -1;
+}
+
+// Ask the helper to resume the live feed; 0 if it answers "ok", otherwise -1.
+int camera_preview_unfreeze(void) {
+    const bool running = g_cam_active.load();
+    return (running && cam_cmd("unfreeze") == "ok") ? 0 : -1;
+}
diff --git a/src/audio/rcli_camera_preview.m b/src/audio/rcli_camera_preview.m
new file mode 100644
index 0000000..c6ce993
--- /dev/null
+++ b/src/audio/rcli_camera_preview.m
@@ -0,0 +1,337 @@
+// rcli_camera_preview — standalone Cocoa app showing a live camera preview
+// in a floating PIP-style window. Communicates with parent RCLI via stdin/stdout.
+//
+// Commands (one per line on stdin):
+//   capture <path>  → freezes frame, saves JPEG to <path>, replies "ok\n"
+//   snap <path>     → saves JPEG to <path> WITHOUT freezing, replies "ok\n"
+//   freeze          → pauses the live feed on current frame, replies "ok\n"
+//   unfreeze        → resumes live camera feed, replies "ok\n"
+//   quit            → exits
+
+#import <AppKit/AppKit.h>
+#import <AVFoundation/AVFoundation.h>
+#import <UniformTypeIdentifiers/UniformTypeIdentifiers.h>
+
+// ── Camera preview window ─────────────────────────────────────────────
+
+@interface CameraPreviewView : NSView <AVCaptureVideoDataOutputSampleBufferDelegate> {
+    AVCaptureSession *_session;
+    AVCaptureVideoDataOutput *_output;
+    dispatch_queue_t _captureQueue;
+    CIContext *_ciContext;
+    CGImageRef _currentFrame;
+    BOOL _frozen;
+    NSString *_pendingCapturePath;
+    NSLock *_frameLock;
+}
+@property (nonatomic, strong) NSTextField *statusLabel;
+@end
+
+@implementation CameraPreviewView
+
+// Set up the frame lock, Core Image context, rounded black backing layer and the
+// green "RCLI Camera" status pill, then start the camera.
+- (instancetype)initWithFrame:(NSRect)frame {
+    self = [super initWithFrame:frame];
+    if (!self) return nil;
+    _frameLock = [[NSLock alloc] init];
+    // +contextWithOptions: returns an autoreleased object and this class is manually
+    // reference-counted (see [super dealloc]), so the ivar must hold its own retain.
+    _ciContext = [[CIContext contextWithOptions:nil] retain];
+    _frozen = NO; _currentFrame = NULL;
+    self.wantsLayer = YES;
+    self.layer.cornerRadius = 12;
+    self.layer.masksToBounds = YES;
+    self.layer.backgroundColor = [NSColor blackColor].CGColor;
+    _statusLabel = [[NSTextField alloc] initWithFrame:NSZeroRect];
+    _statusLabel.stringValue = @"  RCLI Camera  ";
+    _statusLabel.font = [NSFont systemFontOfSize:11 weight:NSFontWeightHeavy];
+    _statusLabel.textColor = [NSColor blackColor];
+    _statusLabel.backgroundColor = [NSColor colorWithRed:0.1 green:0.85 blue:0.4 alpha:1.0];
+    _statusLabel.bezeled = NO;
+    _statusLabel.editable = NO;
+    _statusLabel.selectable = NO;
+    _statusLabel.alignment = NSTextAlignmentCenter;
+    _statusLabel.wantsLayer = YES;
+    _statusLabel.layer.cornerRadius = 8;
+    _statusLabel.layer.masksToBounds = YES;
+    [_statusLabel sizeToFit];
+    [self addSubview:_statusLabel];
+    [self startCamera];
+    return self;
+}
+
+- (void)layout {  // keep the status pill at the top center of the view
+    [super layout];
+    [_statusLabel sizeToFit];  // re-fit: padding the previous frame would grow the pill every pass
+    NSSize sz = _statusLabel.frame.size;
+    CGFloat x = (self.bounds.size.width - sz.width) / 2, y = self.bounds.size.height - sz.height - 8;
+    _statusLabel.frame = NSMakeRect(x, y, sz.width + 8, sz.height + 4);
+}
+
+- (void)startCamera {  // configure and start the capture session that feeds this view
+    _session = [[AVCaptureSession alloc] init];
+    _session.sessionPreset = AVCaptureSessionPresetHigh;
+    // Use the default video device; with none, return early and the view stays black.
+    AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+    if (!device) return;
+    // Wrap the device in a session input; give up silently if that fails (err is not reported).
+    NSError *err = nil;
+    AVCaptureDeviceInput *input = [AVCaptureDeviceInput deviceInputWithDevice:device error:&err];
+    if (!input) return;
+    if ([_session canAddInput:input]) [_session addInput:input];
+    // Deliver BGRA frames to self on a private serial queue, discarding late frames.
+    _output = [[AVCaptureVideoDataOutput alloc] init];
+    _output.videoSettings = @{(id)kCVPixelBufferPixelFormatTypeKey: @(kCVPixelFormatType_32BGRA)};
+    _output.alwaysDiscardsLateVideoFrames = YES;
+    _captureQueue = dispatch_queue_create("camera.preview", DISPATCH_QUEUE_SERIAL);
+    [_output setSampleBufferDelegate:self queue:_captureQueue];
+    if ([_session canAddOutput:_output]) [_session addOutput:_output];
+
+    [_session startRunning];
+}
+
+- (void)captureOutput:(AVCaptureOutput *)output
+didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer
+       fromConnection:(AVCaptureConnection *)connection {  // new-frame callback on _captureQueue
+    if (_frozen) return;  // while frozen, keep the last published frame untouched
+    // Render the sample buffer's pixel data into a CGImage through Core Image.
+    CVImageBufferRef imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer);
+    if (!imageBuffer) return;
+
+    CIImage *ciImage = [CIImage imageWithCVImageBuffer:imageBuffer];
+    CGRect extent = ciImage.extent;
+    CGImageRef cgImage = [_ciContext createCGImage:ciImage fromRect:extent];
+    if (!cgImage) return;
+    // Publish the new image under the lock and release the frame it replaces.
+    [_frameLock lock];
+    if (_currentFrame) CGImageRelease(_currentFrame);
+    _currentFrame = cgImage;
+    [_frameLock unlock];
+    // Views must be invalidated on the main thread.
+    dispatch_async(dispatch_get_main_queue(), ^{
+        [self setNeedsDisplay:YES];
+    });
+}
+
+- (void)drawRect:(NSRect)dirtyRect {  // black background, latest frame, frozen tint, green border
+    [[NSColor blackColor] set];
+    NSRectFill(dirtyRect);
+    // Retain the current frame under the lock so the capture queue may replace it while we draw.
+    [_frameLock lock];
+    CGImageRef frame = _currentFrame;
+    if (frame) CGImageRetain(frame);
+    [_frameLock unlock];
+
+    if (frame) {
+        NSGraphicsContext *ctx = [NSGraphicsContext currentContext];
+        CGContextRef cgctx = (CGContextRef)[ctx CGContext];
+
+        CGFloat imgW = CGImageGetWidth(frame);
+        CGFloat imgH = CGImageGetHeight(frame);
+        CGFloat viewW = self.bounds.size.width;
+        CGFloat viewH = self.bounds.size.height;
+        // Aspect-fill: scale by the larger ratio and center, so any overflow lies outside the view.
+        CGFloat scale = fmax(viewW / imgW, viewH / imgH);
+        CGFloat drawW = imgW * scale;
+        CGFloat drawH = imgH * scale;
+        CGFloat drawX = (viewW - drawW) / 2;
+        CGFloat drawY = (viewH - drawH) / 2;
+
+        CGContextDrawImage(cgctx, CGRectMake(drawX, drawY, drawW, drawH), frame);
+        CGImageRelease(frame);  // balances the CGImageRetain taken under the lock
+    }
+    // Faint red wash over the whole view while the feed is frozen.
+    if (_frozen) {
+        [[NSColor colorWithRed:1.0 green:0.3 blue:0.2 alpha:0.08] set];
+        NSRectFillUsingOperation(self.bounds, NSCompositingOperationSourceOver);
+    }
+
+    // Green border
+    NSColor *green = [NSColor colorWithRed:0.1 green:0.85 blue:0.4 alpha:1.0];
+    NSBezierPath *border = [NSBezierPath bezierPathWithRoundedRect:NSInsetRect(self.bounds, 2, 2)
+                                                           xRadius:12 yRadius:12];
+    [border setLineWidth:4];
+    [green set];
+    [border stroke];
+}
+
+- (BOOL)saveFrameToPath:(NSString *)path {  // write the current frame to `path` as JPEG; YES on success
+    [_frameLock lock];
+    CGImageRef frame = _currentFrame;
+    if (frame) CGImageRetain(frame);
+    [_frameLock unlock];
+    // No frame has been published yet: nothing to save.
+    if (!frame) return NO;
+    // Create a single-image JPEG destination at the given file path.
+    NSURL *url = [NSURL fileURLWithPath:path];
+    CGImageDestinationRef dest = CGImageDestinationCreateWithURL(
+        (__bridge CFURLRef)url, (__bridge CFStringRef)UTTypeJPEG.identifier, 1, NULL);
+    if (!dest) { CGImageRelease(frame); return NO; }
+    // Encode at 0.92 lossy quality, then release the destination and our frame reference.
+    NSDictionary *opts = @{(__bridge id)kCGImageDestinationLossyCompressionQuality: @(0.92)};
+    CGImageDestinationAddImage(dest, frame, (__bridge CFDictionaryRef)opts);
+    BOOL ok = CGImageDestinationFinalize(dest);
+    CFRelease(dest);
+    CGImageRelease(frame);
+    return ok;
+}
+
+- (void)freeze {  // stop accepting new frames and show the red "FROZEN" badge
+    _frozen = YES;
+    dispatch_async(dispatch_get_main_queue(), ^{
+        NSTextField *badge = self.statusLabel;
+        badge.stringValue = @"  FROZEN  ";
+        badge.backgroundColor = [NSColor colorWithRed:1.0 green:0.3 blue:0.2 alpha:1.0];
+        [badge sizeToFit];
+        [self layout]; [self setNeedsDisplay:YES];
+    });
+}
+
+- (void)unfreeze {  // accept frames again and restore the green "RCLI Camera" badge
+    _frozen = NO;
+    dispatch_async(dispatch_get_main_queue(), ^{
+        NSTextField *badge = self.statusLabel;
+        badge.stringValue = @"  RCLI Camera  ";
+        badge.backgroundColor = [NSColor colorWithRed:0.1 green:0.85 blue:0.4 alpha:1.0];
+        [badge sizeToFit];
+        [self layout]; [self setNeedsDisplay:YES];
+    });
+}
+
+- (void)stopCamera {  // stop the capture session; callers in this file invoke it right before terminating
+    [_session stopRunning];
+}
+
+- (void)dealloc {  // manual reference counting: drop the retained frame, then chain to super
+    [_frameLock lock];
+    if (_currentFrame) CGImageRelease(_currentFrame);
+    _currentFrame = NULL;
+    [_frameLock unlock];
+    [super dealloc];  // NOTE(review): _session, _output, _frameLock are never released here — confirm intentional
+}
+
+@end
+
+// ── Camera window ─────────────────────────────────────────────────────
+
+@interface CameraWindow : NSWindow
+@end
+
+@implementation CameraWindow
+
+- (instancetype)initWithRect:(NSRect)rect {  // borderless, resizable floating window hosting the preview view
+    self = [super initWithContentRect:rect
+                            styleMask:NSWindowStyleMaskBorderless |
+                                      NSWindowStyleMaskResizable
+                              backing:NSBackingStoreBuffered
+                                defer:NO];
+    if (self) {
+        self.opaque = NO;  // transparent backing so the view's rounded corners show
+        self.backgroundColor = [NSColor clearColor];
+        self.level = NSFloatingWindowLevel;
+        self.hasShadow = YES;
+        self.movableByWindowBackground = YES;  // there is no title bar to drag
+        self.collectionBehavior = NSWindowCollectionBehaviorCanJoinAllSpaces |
+                                  NSWindowCollectionBehaviorStationary;
+        self.minSize = NSMakeSize(240, 180);
+        // Creating the preview view also starts the camera (its init calls -startCamera).
+        CameraPreviewView *preview = [[CameraPreviewView alloc] initWithFrame:rect];
+        self.contentView = preview;
+    }
+    return self;
+}
+
+- (BOOL)canBecomeKeyWindow  { return YES; }  // presumably needed because borderless windows refuse key status by default — confirm
+- (BOOL)canBecomeMainWindow { return NO; }   // this window never becomes the main window
+
+@end
+
+// ── Stdin reader ──────────────────────────────────────────────────────
+
+@interface StdinReader : NSObject
+@property (nonatomic, strong) CameraWindow *window;
+- (void)startReading;
+- (void)handleCommand:(NSString *)cmd;
+@end
+
+@implementation StdinReader
+
+// Background stdin loop: run each command on the main thread; on EOF stop the camera and quit.
+- (void)startReading {
+    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+        char buf[1024];
+        // Drain a pool per line: this block never returns while stdin stays open.
+        while (fgets(buf, sizeof(buf), stdin)) @autoreleasepool {
+            NSString *cmd = [[NSString stringWithUTF8String:buf]
+                stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
+            if (cmd.length == 0) continue;
+            [self performSelectorOnMainThread:@selector(handleCommand:) withObject:cmd waitUntilDone:YES];
+        }
+        dispatch_async(dispatch_get_main_queue(), ^{
+            [(CameraPreviewView *)self.window.contentView stopCamera];
+            [NSApp terminate:nil];
+        });
+    });
+}
+
+// Execute one protocol command (called on the main thread). Every command except
+// "quit" writes exactly one reply line to stdout: "ok" or "error".
+- (void)handleCommand:(NSString *)cmd {
+    CameraPreviewView *preview = (CameraPreviewView *)self.window.contentView;
+    BOOL ok = YES;
+
+    if ([cmd hasPrefix:@"capture "]) {
+        [preview freeze];
+        [NSThread sleepForTimeInterval:0.05];  // presumably lets an in-flight frame land first — confirm
+        ok = [preview saveFrameToPath:[cmd substringFromIndex:8]];
+    } else if ([cmd hasPrefix:@"snap "]) {
+        ok = [preview saveFrameToPath:[cmd substringFromIndex:5]];
+    } else if ([cmd isEqualToString:@"freeze"]) {
+        [preview freeze];
+    } else if ([cmd isEqualToString:@"unfreeze"]) {
+        [preview unfreeze];
+    } else if ([cmd isEqualToString:@"quit"]) {
+        [preview stopCamera];
+        [NSApp terminate:nil];
+        return;
+    } else {
+        // Unknown or malformed input (e.g. "capture" with no path): the parent blocks
+        // reading a reply after every command, so answer instead of staying silent.
+        ok = NO;
+    }
+    printf("%s\n", ok ? "ok" : "error");
+    fflush(stdout);
+}
+
+@end
+
+// ── Main ──────────────────────────────────────────────────────────────
+
+int main(int argc, const char *argv[]) {  // helper entry: window, stdin reader, "ready" handshake, run loop
+    @autoreleasepool {
+        NSApplication *app = [NSApplication sharedApplication];
+        [app setActivationPolicy:NSApplicationActivationPolicyAccessory];
+        // 480x360 window inset 24 pt from the right and 60 pt from the top of the main screen.
+        NSScreen *scr = [NSScreen mainScreen];
+        NSRect sf = scr.frame;  // NOTE(review): only sf.size is used; sf.origin is ignored — confirm
+        CGFloat w = 480, h = 360;
+        CGFloat x = sf.size.width - w - 24;
+        CGFloat y = sf.size.height - h - 60;
+
+        CameraWindow *win = [[CameraWindow alloc]
+            initWithRect:NSMakeRect(x, y, w, h)];
+        [win makeKeyAndOrderFront:nil];
+        [app activateIgnoringOtherApps:YES];
+        // Start consuming stdin commands before announcing readiness.
+        StdinReader *reader = [[StdinReader alloc] init];
+        reader.window = win;
+        [reader startReading];
+        // Handshake: the parent's camera_preview_start() blocks until it reads this line.
+        printf("ready\n");
+        fflush(stdout);
+
+        [app run];
+    }
+    return 0;
+}
diff --git a/src/audio/rcli_overlay.m b/src/audio/rcli_overlay.m
index 274a3fc..885cf19 100644
--- a/src/audio/rcli_overlay.m
+++ b/src/audio/rcli_overlay.m
@@ -9,10 +9,10 @@
 
 #import <AppKit/AppKit.h>
 
-static const CGFloat kBorder    = 6.0;
-static const CGFloat kRadius    = 12.0;
-static const CGFloat kHandle    = 18.0;   // corner handle size
-static const CGFloat kEdgeGrab  = 14.0;   // invisible edge grab zone
+static const CGFloat kBorder    = 8.0;
+static const CGFloat kRadius    = 14.0;
+static const CGFloat kHandle    = 28.0;   // corner handle size
+static const CGFloat kEdgeGrab  = 20.0;   // invisible edge grab zone
 
 // ── Custom view: bold border + corner handles + label pill ─────────────
 @interface OverlayView : NSView
@@ -25,23 +25,37 @@ - (void)drawRect:(NSRect)dirtyRect {
     NSRectFill(dirtyRect);
 
     NSRect inner = NSInsetRect(self.bounds, kBorder, kBorder);
-    NSColor *green = [NSColor colorWithRed:0.15 green:0.9 blue:0.45 alpha:0.92];
+    NSColor *green = [NSColor colorWithRed:0.1 green:0.85 blue:0.4 alpha:1.0];
+
+    // Outer glow — wide, soft, two layers for depth
+    NSBezierPath *glow2 = [NSBezierPath bezierPathWithRoundedRect:inner
+                                                           xRadius:kRadius yRadius:kRadius];
+    [glow2 setLineWidth:kBorder + 16];
+    [[green colorWithAlphaComponent:0.08] set];
+    [glow2 stroke];
 
-    // Outer glow
     NSBezierPath *glow = [NSBezierPath bezierPathWithRoundedRect:inner
                                                          xRadius:kRadius yRadius:kRadius];
-    [glow setLineWidth:kBorder + 6];
-    [[green colorWithAlphaComponent:0.12] set];
+    [glow setLineWidth:kBorder + 8];
+    [[green colorWithAlphaComponent:0.18] set];
     [glow stroke];
 
-    // Main border — solid, thick, rounded
+    // Main border — bold, solid, rounded
     NSBezierPath *border = [NSBezierPath bezierPathWithRoundedRect:inner
                                                            xRadius:kRadius yRadius:kRadius];
     [border setLineWidth:kBorder];
     [green set];
     [border stroke];
 
-    // Corner handles — filled rounded squares with white dot
+    // Inner highlight — thin white line for depth
+    NSRect innerHL = NSInsetRect(inner, 1.5, 1.5);
+    NSBezierPath *highlight = [NSBezierPath bezierPathWithRoundedRect:innerHL
+                                                              xRadius:kRadius - 1.5 yRadius:kRadius - 1.5];
+    [highlight setLineWidth:1.0];
+    [[NSColor colorWithWhite:1.0 alpha:0.15] set];
+    [highlight stroke];
+
+    // Corner handles — large rounded squares with shadow + white center dot
     CGFloat hs = kHandle;
     CGFloat off = kBorder / 2;
     NSRect corners[4] = {
@@ -51,20 +65,50 @@ - (void)drawRect:(NSRect)dirtyRect {
         NSMakeRect(NSMaxX(inner) + off - hs, NSMaxY(inner) + off - hs, hs, hs),
     };
     for (int i = 0; i < 4; i++) {
+        // Drop shadow
+        NSRect shadowRect = NSOffsetRect(corners[i], 0, -1);
+        NSBezierPath *shadow = [NSBezierPath bezierPathWithRoundedRect:shadowRect
+                                                               xRadius:6 yRadius:6];
+        [[NSColor colorWithWhite:0.0 alpha:0.25] set];
+        [shadow fill];
+
+        // Handle body
         NSBezierPath *h = [NSBezierPath bezierPathWithRoundedRect:corners[i]
-                                                          xRadius:4 yRadius:4];
+                                                          xRadius:6 yRadius:6];
         [green set];
         [h fill];
+
+        // White border on handle
+        [h setLineWidth:1.5];
+        [[NSColor colorWithWhite:1.0 alpha:0.4] set];
+        [h stroke];
+
         // White center dot
-        NSRect dot = NSInsetRect(corners[i], 5, 5);
-        [[NSColor colorWithWhite:1.0 alpha:0.85] set];
+        NSRect dot = NSInsetRect(corners[i], hs * 0.3, hs * 0.3);
+        [[NSColor colorWithWhite:1.0 alpha:0.9] set];
         [[NSBezierPath bezierPathWithOvalInRect:dot] fill];
     }
 
+    // Edge midpoint handles — small bars to hint at edge dragging
+    CGFloat eh = 5.0;   // half-thickness
+    CGFloat el = 32.0;  // bar length
+    NSRect edges[4] = {
+        NSMakeRect(NSMidX(inner) - el/2, NSMaxY(inner) - eh/2, el, eh),   // top
+        NSMakeRect(NSMidX(inner) - el/2, NSMinY(inner) - eh/2, el, eh),   // bottom
+        NSMakeRect(NSMinX(inner) - eh/2, NSMidY(inner) - el/2, eh, el),   // left
+        NSMakeRect(NSMaxX(inner) - eh/2, NSMidY(inner) - el/2, eh, el),   // right
+    };
+    for (int i = 0; i < 4; i++) {
+        NSBezierPath *ep = [NSBezierPath bezierPathWithRoundedRect:edges[i]
+                                                           xRadius:2.5 yRadius:2.5];
+        [[green colorWithAlphaComponent:0.7] set];
+        [ep fill];
+    }
+
     // Label pill — centered at top
     NSString *label = @"  RCLI Visual Mode  ";
     NSDictionary *attrs = @{
-        NSFontAttributeName: [NSFont systemFontOfSize:11 weight:NSFontWeightBold],
+        NSFontAttributeName: [NSFont systemFontOfSize:12 weight:NSFontWeightHeavy],
         NSForegroundColorAttributeName: [NSColor blackColor],
     };
     NSSize sz = [label sizeWithAttributes:attrs];
diff --git a/src/cli/main.cpp b/src/cli/main.cpp
index 58cd4e1..a5c773d 100644
--- a/src/cli/main.cpp
+++ b/src/cli/main.cpp
@@ -486,9 +486,8 @@ static int cmd_vlm(const Args& args) {
     fprintf(stderr, "%sInitializing VLM...%s\n", color::dim, color::reset);
     if (rcli_vlm_init(g_engine) != 0) {
         fprintf(stderr, "\n%s%s  VLM not available.%s\n\n", color::bold, color::red, color::reset);
-        fprintf(stderr, "  VLM requires the llama.cpp engine and a VLM model.\n");
-        fprintf(stderr, "  Switch engine:  %srcli engine llamacpp%s\n", color::bold, color::reset);
-        fprintf(stderr, "  Download model: %srcli models vlm%s\n\n", color::bold, color::reset);
+        fprintf(stderr, "  No VLM model found. Download one:\n");
+        fprintf(stderr, "  %srcli models vlm%s\n\n", color::bold, color::reset);
         rcli_destroy(g_engine);
         return 1;
     }
@@ -548,9 +547,8 @@ static int cmd_camera(const Args& args) {
 
     if (rcli_vlm_init(g_engine) != 0) {
         fprintf(stderr, "\n%s%s  VLM not available.%s\n\n", color::bold, color::red, color::reset);
-        fprintf(stderr, "  VLM requires the llama.cpp engine and a VLM model.\n");
-        fprintf(stderr, "  Switch engine:  %srcli engine llamacpp%s\n", color::bold, color::reset);
-        fprintf(stderr, "  Download model: %srcli models vlm%s\n\n", color::bold, color::reset);
+        fprintf(stderr, "  No VLM model found. Download one:\n");
+        fprintf(stderr, "  %srcli models vlm%s\n\n", color::bold, color::reset);
         rcli_destroy(g_engine);
         return 1;
     }
@@ -618,9 +616,8 @@ static int cmd_screen(const Args& args) {
 
     if (rcli_vlm_init(g_engine) != 0) {
         fprintf(stderr, "\n%s%s  VLM not available.%s\n\n", color::bold, color::red, color::reset);
-        fprintf(stderr, "  VLM requires the llama.cpp engine and a VLM model.\n");
-        fprintf(stderr, "  Switch engine:  %srcli engine llamacpp%s\n", color::bold, color::reset);
-        fprintf(stderr, "  Download model: %srcli models vlm%s\n\n", color::bold, color::reset);
+        fprintf(stderr, "  No VLM model found. Download one:\n");
+        fprintf(stderr, "  %srcli models vlm%s\n\n", color::bold, color::reset);
         rcli_destroy(g_engine);
         return 1;
     }
diff --git a/src/cli/tui_app.h b/src/cli/tui_app.h
index 7b01d1e..1f1f651 100644
--- a/src/cli/tui_app.h
+++ b/src/cli/tui_app.h
@@ -14,6 +14,7 @@
 #include "engines/metalrt_loader.h"
 #include "engines/vlm_engine.h"
 #include "audio/camera_capture.h"
+#include "audio/camera_preview.h"
 #include "audio/screen_capture.h"
 #include "models/vlm_model_registry.h"
 #include "core/log.h"
@@ -439,9 +440,43 @@ class TuiApp {
                 if (c == "r" || c == "R") { enter_rag_mode(); return true; }
                 if (c == "d" || c == "D") { close_all_panels(); enter_cleanup_mode(); return true; }
                 if (c == "p" || c == "P") { enter_personality_mode(); return true; }
-                // V key: capture photo from camera and analyze with VLM
+                // V key: toggle camera preview mode (live feed + auto VLM analysis)
                 if (c == "v" || c == "V") {
-                    run_camera_vlm("Describe what you see in this photo in detail.");
+                    if (camera_preview_active()) {
+                        add_system_message("Closing camera...");
+                        screen_->Post(Event::Custom);
+                        stop_camera_auto_analysis();
+                        std::thread([this]() {
+                            camera_preview_stop();
+                            rcli_vlm_exit(engine_);
+                            add_system_message("Camera OFF");
+                            screen_->Post(Event::Custom);
+                        }).detach();
+                    } else {
+                        add_system_message("Opening camera, loading VLM...");
+                        screen_->Post(Event::Custom);
+                        std::thread([this]() {
+                            if (rcli_vlm_init(engine_) != 0) {
+                                add_system_message("VLM not available. No VLM model found; download one via [M] \xe2\x86\x92 VLM Models (or run: rcli models vlm)");
+                                screen_->Post(Event::Custom);
+                                return;
+                            }
+                            if (camera_preview_start() != 0) {
+                                rcli_vlm_exit(engine_);  // do not keep the model loaded when the preview never opened
+                                add_system_message("Camera preview failed. Check camera permissions in System Settings > Privacy & Security > Camera.");
+                                screen_->Post(Event::Custom);
+                                return;
+                            }
+                            const char* vbe = rcli_vlm_backend_name(engine_);
+                            const char* vmodel = rcli_vlm_model_name(engine_);
+                            std::string msg = "Camera LIVE";
+                            if (vbe && vbe[0])  // vmodel is guarded as well: appending a null char* is undefined
+                                msg += std::string(" \xe2\x80\x94 ") + (vmodel ? vmodel : "") + " via " + vbe;
+                            add_system_message(msg + ". Auto-analyzing every ~8s. Speak to ask a specific question");
+                            screen_->Post(Event::Custom);
+                            start_camera_auto_analysis();
+                        }).detach();
+                    }
                     return true;
                 }
                 // S key: toggle visual mode (VLM only on llama.cpp engine)
@@ -581,6 +616,12 @@ class TuiApp {
             std::string user_text = transcript;
             add_user_message(user_text);
 
+            // Camera preview: route voice to camera VLM analysis
+            if (camera_preview_active()) {
+                run_camera_preview_vlm(user_text);
+                return;
+            }
+
             // Visual mode: route voice to VLM screen analysis instead of LLM
             if (screen_capture_overlay_active()) {
                 run_screen_vlm(user_text);
@@ -1117,9 +1158,16 @@ class TuiApp {
         else
             right.push_back(text("[A] actions  ") | dim);
         right.push_back(text("[C] convo  ") | dim);
-        right.push_back(text("[V] camera  ") | dim);
+        if (camera_preview_active()) {
+            if (cam_auto_busy_.load())
+                right.push_back(text("[V] camera \xf0\x9f\x94\xb4  ") | ftxui::color(ftxui::Color::RedLight));
+            else
+                right.push_back(text("[V] camera LIVE  ") | ftxui::color(ftxui::Color::Green));
+        } else {
+            right.push_back(text("[V] camera  ") | dim);
+        }
         if (screen_capture_overlay_active())
-            right.push_back(text("[S] visual ●  ") | ftxui::color(ftxui::Color::Green));
+            right.push_back(text("[S] visual \xe2\x97\x8f  ") | ftxui::color(ftxui::Color::Green));
         else
             right.push_back(text("[S] visual  ") | dim);
         right.push_back(text("[R] RAG  ") | dim);
@@ -2234,6 +2282,118 @@ class TuiApp {
     // process_input
     // ====================================================================
 
+    void start_camera_auto_analysis() {
+        // Launches the background worker that snaps the preview, runs the VLM on it and speaks the result.
+        // Join any earlier worker first: assigning to a still-joinable std::thread calls std::terminate.
+        stop_camera_auto_analysis();
+        cam_auto_running_.store(true);
+        cam_auto_busy_.store(false);
+        cam_auto_thread_ = std::thread([this]() {
+            // Small initial delay to let the camera warm up
+            for (int i = 0; i < 4 && cam_auto_running_.load(); i++)
+                std::this_thread::sleep_for(std::chrono::milliseconds(500));
+
+            while (cam_auto_running_.load()) {
+                if (!camera_preview_active()) break;
+                // Skip while voice/text analysis is in progress (re-check in 500ms); exchange() claims the busy flag atomically
+                if (voice_state_.load() != VoiceState::IDLE || cam_auto_busy_.exchange(true)) {
+                    std::this_thread::sleep_for(std::chrono::milliseconds(500));
+                    continue;
+                }
+
+                std::string photo_path = "/tmp/rcli_cam_auto_" +
+                    std::to_string(std::chrono::system_clock::now().time_since_epoch().count()) + ".jpg";
+                if (camera_preview_snap(photo_path.c_str()) != 0) {
+                    cam_auto_busy_.store(false);
+                    std::this_thread::sleep_for(std::chrono::milliseconds(500));  // back off instead of spinning on a failing camera
+                    continue;
+                }
+                voice_state_ = VoiceState::THINKING;
+                screen_->Post(Event::Custom);
+
+                std::string accumulated;
+                auto stream_cb = [](const char* event, const char* data, void* ud) {
+                    auto* accum = static_cast<std::string*>(ud);
+                    if (std::strcmp(event, "token") == 0)
+                        accum->append(data);
+                };
+                int vlm_rc = rcli_vlm_analyze_stream(engine_, photo_path.c_str(),
+                    "Briefly describe what you see. Focus on what's new or interesting. Be concise (1-2 sentences).",
+                    stream_cb, &accumulated);
+                ::remove(photo_path.c_str());  // the snapshot is single-use; do not let files pile up in /tmp
+                if (vlm_rc == 0 && !accumulated.empty()) {
+                    add_response(accumulated, "VLM \xf0\x9f\x93\xb7");
+                    voice_state_ = VoiceState::SPEAKING;
+                    screen_->Post(Event::Custom);
+                    rcli_speak_streaming(engine_, accumulated.c_str(), nullptr, nullptr);
+                }
+                voice_state_ = VoiceState::IDLE;
+                cam_auto_busy_.store(false);
+                screen_->Post(Event::Custom);
+
+                // Brief cooldown after analysis before next cycle
+                for (int i = 0; i < 4 && cam_auto_running_.load(); i++)
+                    std::this_thread::sleep_for(std::chrono::milliseconds(500));
+            }
+        });
+    }
+
+    void stop_camera_auto_analysis() {  // Signals the auto-analysis worker to stop and waits for it to exit.
+        cam_auto_running_.store(false);  // the worker polls this flag in its loops
+        if (cam_auto_thread_.joinable())
+            cam_auto_thread_.join();  // blocks the caller until the worker function returns
+    }
+
+    // User-initiated camera analysis (voice/text) — marks auto-analysis busy, then runs the targeted query on a detached thread
+    void run_camera_preview_vlm(const std::string& prompt) {
+        cam_auto_busy_.store(true);
+        add_system_message("Analyzing camera feed...");
+        voice_state_ = VoiceState::THINKING;
+        std::string prompt_copy = prompt;
+        std::thread([this, prompt_copy]() {
+            std::string photo_path = "/tmp/rcli_cam_" +
+                std::to_string(std::chrono::system_clock::now().time_since_epoch().count()) + ".jpg";
+
+            if (camera_preview_snap(photo_path.c_str()) != 0) {
+                add_response("(Camera capture failed.)", "");
+                voice_state_ = VoiceState::IDLE;
+                cam_auto_busy_.store(false);
+                screen_->Post(Event::Custom);
+                return;
+            }
+            // The callback appends only "token" events to the string passed as user data.
+            std::string accumulated;
+            auto stream_cb = [](const char* event, const char* data, void* ud) {
+                auto* accum = static_cast<std::string*>(ud);
+                if (std::strcmp(event, "token") == 0) {
+                    accum->append(data);
+                }
+            };
+            int vlm_rc = rcli_vlm_analyze_stream(engine_, photo_path.c_str(),
+                                                  prompt_copy.c_str(), stream_cb, &accumulated);
+            ::remove(photo_path.c_str());  // the snapshot is single-use; do not let files pile up in /tmp
+            if (vlm_rc == 0 && !accumulated.empty()) {
+                add_response(accumulated, "VLM");
+                voice_state_ = VoiceState::SPEAKING;
+                screen_->Post(Event::Custom);
+                rcli_speak_streaming(engine_, accumulated.c_str(), nullptr, nullptr);
+                RCLIVlmStats stats;
+                if (rcli_vlm_get_stats(engine_, &stats) == 0) {
+                    char buf[128];
+                    snprintf(buf, sizeof(buf), "\xe2\x9a\xa1 %.1f tok/s  |  %d tokens  |  %.1fs total",
+                             stats.gen_tok_per_sec, stats.generated_tokens, stats.total_time_sec);
+                    add_system_message(buf);
+                }
+            } else {
+                add_response("(VLM analysis failed.)", "");
+            }
+
+            voice_state_ = VoiceState::IDLE;
+            cam_auto_busy_.store(false);
+            screen_->Post(Event::Custom);
+        }).detach();
+    }
+
     void run_camera_vlm(const std::string& prompt) {
         add_system_message("Capturing photo from camera...");
         voice_state_ = VoiceState::THINKING;
@@ -2383,6 +2543,9 @@ class TuiApp {
             add_system_message("  P      Personality");
             add_system_message("  R      RAG panel");
             add_system_message("  D      Delete / cleanup models");
+            add_system_message("--- Vision ---");
+            add_system_message("  V      Camera preview (toggle live feed + VLM)");
+            add_system_message("  S      Visual mode (screen overlay + VLM)");
             add_system_message("--- Toggles ---");
             add_system_message("  T      Tool call trace (show tool calls & results)");
 
@@ -2421,7 +2584,10 @@ class TuiApp {
         }
 
         if (cmd == "camera" || cmd == "photo" || cmd == "webcam") {
-            run_camera_vlm("Describe what you see in this photo in detail.");
+            if (camera_preview_active())
+                run_camera_preview_vlm("Describe what you see in this photo in detail.");
+            else
+                run_camera_vlm("Describe what you see in this photo in detail.");
             return;
         }
 
@@ -2607,6 +2773,12 @@ class TuiApp {
             }
         }
 
+        // Camera preview active: route typed questions to camera VLM
+        if (camera_preview_active()) {
+            run_camera_preview_vlm(input);
+            return;
+        }
+
         // Run LLM (or RAG+LLM) in background thread to keep UI responsive
         voice_state_ = VoiceState::THINKING;
         std::string input_copy = input;
@@ -2865,6 +3037,12 @@ class TuiApp {
     ftxui::Color personality_msg_color_;
 
 
+    // Camera auto-analysis state
+    std::atomic<bool> cam_auto_running_{false};
+    std::atomic<bool> cam_auto_busy_{false};
+    std::thread cam_auto_thread_;
+    std::string cam_last_snap_path_;
+
     // RAG panel state
     struct RagOption { std::string name, action; };
     bool rag_mode_ = false;
diff --git a/src/engines/metalrt_vlm_engine.cpp b/src/engines/metalrt_vlm_engine.cpp
new file mode 100644
index 0000000..b1e30ec
--- /dev/null
+++ b/src/engines/metalrt_vlm_engine.cpp
@@ -0,0 +1,256 @@
+#include "engines/metalrt_vlm_engine.h"
+#include "core/log.h"
+#include <chrono>
+#include <mutex>
+
+namespace rastack {
+
+bool MetalRTVlmEngine::init(const MetalRTVlmConfig& config) {
+    // Loads the MetalRT dylib if needed, creates a vision instance and loads config.model_dir into it.
+    auto& loader = MetalRTLoader::instance();
+    if (!loader.is_loaded() && !loader.load()) {
+        LOG_ERROR("MetalRT-VLM", "dylib not loaded");
+        return false;
+    }
+    if (!loader.has_vision()) {
+        LOG_WARN("MetalRT-VLM", "Vision symbols not available in dylib — "
+                 "create=%p analyze=%p",
+                 (void*)loader.vision_create, (void*)loader.vision_analyze);
+        return false;
+    }
+    // Re-init: release any previous instance first so its handle is not leaked and initialized_ cannot go stale.
+    shutdown();
+    LOG_DEBUG("MetalRT-VLM", "Creating VLM instance via Metal GPU...");
+    auto t_start = std::chrono::high_resolution_clock::now();
+    handle_ = loader.vision_create();
+    if (!handle_) {
+        LOG_ERROR("MetalRT-VLM", "Failed to create VLM instance");
+        return false;
+    }
+
+    LOG_DEBUG("MetalRT-VLM", "Loading model from %s ...", config.model_dir.c_str());
+    if (!loader.vision_load(handle_, config.model_dir.c_str())) {
+        LOG_ERROR("MetalRT-VLM", "Failed to load model from %s", config.model_dir.c_str());
+        loader.vision_destroy(handle_);
+        handle_ = nullptr;
+        return false;
+    }
+
+    config_ = config;
+
+    auto t_end = std::chrono::high_resolution_clock::now();
+    double init_ms = std::chrono::duration<double, std::milli>(t_end - t_start).count();
+
+    initialized_ = true;
+    // Use the null-safe accessors: the raw runtime calls may return a null pointer, which %s must not receive.
+    const std::string mname = model_name();
+    const std::string dname = device_name();
+
+    LOG_DEBUG("MetalRT-VLM", "=== MetalRT VLM GPU VERIFICATION ===");
+    LOG_DEBUG("MetalRT-VLM", "  Engine:    VLM via libmetalrt.dylib (Metal GPU)");
+    LOG_DEBUG("MetalRT-VLM", "  Model dir: %s", config.model_dir.c_str());
+    LOG_DEBUG("MetalRT-VLM", "  Model:     %s", mname.c_str());
+    LOG_DEBUG("MetalRT-VLM", "  Device:    %s", dname.c_str());
+    LOG_DEBUG("MetalRT-VLM", "  Init time: %.1f ms", init_ms);
+    return true;
+}
+
+void MetalRTVlmEngine::shutdown() {
+    // Destroys the runtime instance (when one exists) and returns the engine to its default state.
+    if (handle_ != nullptr) {
+        if (auto destroy_fn = MetalRTLoader::instance().vision_destroy) {
+            destroy_fn(handle_);
+        }
+        handle_ = nullptr;
+    }
+    stats_ = {};
+    initialized_ = false;
+}
+
+void MetalRTVlmEngine::reset() {
+    // Forwards to the runtime's vision_reset under the GPU mutex; no-op when uninitialised or unsupported.
+    if (!(initialized_ && handle_)) return;
+    auto& rt = MetalRTLoader::instance();
+    if (!rt.vision_reset) return;
+    std::lock_guard<std::mutex> guard(rt.gpu_mutex());
+    rt.vision_reset(handle_);
+}
+
+std::string MetalRTVlmEngine::analyze_image(const std::string& image_path,
+                                              const std::string& prompt) {
+    if (!initialized_ || !handle_) return "";
+    // Runs one blocking vision_analyze call for image_path/prompt and returns the text it reports ("" if none).
+    auto& loader = MetalRTLoader::instance();
+
+    LOG_DEBUG("MetalRT-VLM", "analyze_image() → Metal GPU | image=%s prompt=%zu chars",
+              image_path.c_str(), prompt.size());
+    // Sampling options come from the config stored by init(); think is always disabled.
+    MetalRTLoader::MetalRTVisionOptions opts = {};
+    opts.max_tokens = config_.max_tokens;
+    opts.top_k = config_.top_k;
+    opts.temperature = config_.temperature;
+    opts.think = false;
+    // Wall time is measured around the call; the loader's GPU mutex is held only for the call itself.
+    auto wall_start = std::chrono::high_resolution_clock::now();
+    MetalRTLoader::MetalRTVisionResult result;
+    {
+        std::lock_guard<std::mutex> gpu_lock(loader.gpu_mutex());
+        result = loader.vision_analyze(handle_, image_path.c_str(), prompt.c_str(), &opts);
+    }
+    auto wall_end = std::chrono::high_resolution_clock::now();
+    double wall_ms = std::chrono::duration<double, std::milli>(wall_end - wall_start).count();
+
+    // Store stats: copy the timing and token counters reported by the runtime into stats_
+    stats_.vision_encode_ms = result.vision_encode_ms;
+    stats_.prefill_ms = result.prefill_ms;
+    stats_.decode_ms = result.decode_ms;
+    stats_.tps = result.tps;
+    stats_.prompt_tokens = result.prompt_tokens;
+    stats_.generated_tokens = result.generated_tokens;
+    // Prefer result.response; use result.text only when response is null or empty.
+    std::string text;
+    if (result.response && result.response[0]) {
+        text = result.response;
+    } else if (result.text && result.text[0]) {
+        text = result.text;
+    }
+
+    LOG_DEBUG("MetalRT-VLM", "=== VLM ANALYSIS [Metal GPU] ===");
+    LOG_DEBUG("MetalRT-VLM", "  Vision encode: %.1f ms", result.vision_encode_ms);
+    LOG_DEBUG("MetalRT-VLM", "  Prefill:       %.1f ms (%d tokens)", result.prefill_ms, result.prompt_tokens);
+    LOG_DEBUG("MetalRT-VLM", "  Decode:        %.1f ms (%d tokens)", result.decode_ms, result.generated_tokens);
+    LOG_DEBUG("MetalRT-VLM", "  TPS:           %.1f tok/s", result.tps);
+    LOG_DEBUG("MetalRT-VLM", "  Wall time:     %.1f ms", wall_ms);
+    // text holds its own copy by now, so the result can be passed to vision_free_result (when the loader has one).
+    if (loader.vision_free_result)
+        loader.vision_free_result(result);
+
+    return text;
+}
+
+std::string MetalRTVlmEngine::analyze_image_stream(const std::string& image_path,
+                                                     const std::string& prompt,
+                                                     TokenCallback on_token) {
+    // Streaming variant of analyze_image(): on_token (may be empty) receives each piece; returns the text from the result.
+    if (!initialized_ || !handle_) return "";
+    auto& loader = MetalRTLoader::instance();
+    if (!loader.vision_analyze_stream) {
+        // No streaming entry point: run the blocking path and deliver the whole answer as one token.
+        std::string full = analyze_image(image_path, prompt);
+        if (on_token && !full.empty()) {
+            TokenOutput tok;
+            tok.text = full.c_str();
+            on_token(tok);
+        }
+        return full;
+    }
+
+    LOG_DEBUG("MetalRT-VLM", "analyze_image_stream() → Metal GPU | image=%s", image_path.c_str());
+    MetalRTLoader::MetalRTVisionOptions opts = {};
+    opts.max_tokens = config_.max_tokens;
+    opts.top_k = config_.top_k;
+    opts.temperature = config_.temperature;
+    opts.think = false;
+
+    // Bridge TokenCallback to MetalRTStreamCb
+    struct StreamCtx {
+        TokenCallback cb;
+    };
+    StreamCtx ctx{on_token};
+    MetalRTStreamCb stream_cb = nullptr;
+    if (on_token) {
+        stream_cb = [](const char* piece, void* user_data) -> bool {
+            auto* sctx = static_cast<StreamCtx*>(user_data);
+            if (sctx->cb && piece) {  // a null piece is skipped rather than forwarded
+                TokenOutput tok;
+                tok.text = piece;
+                sctx->cb(tok);
+            }
+            return true;  // continue generation
+        };
+    }
+
+    auto wall_start = std::chrono::high_resolution_clock::now();
+    MetalRTLoader::MetalRTVisionResult result;
+    {
+        std::lock_guard<std::mutex> gpu_lock(loader.gpu_mutex());
+        result = loader.vision_analyze_stream(handle_, image_path.c_str(), prompt.c_str(),
+                                               stream_cb, &ctx, &opts);
+    }
+    auto wall_end = std::chrono::high_resolution_clock::now();
+    double wall_ms = std::chrono::duration<double, std::milli>(wall_end - wall_start).count();
+    stats_.vision_encode_ms = result.vision_encode_ms;
+    stats_.prefill_ms = result.prefill_ms;
+    stats_.decode_ms = result.decode_ms;
+    stats_.tps = result.tps;
+    stats_.prompt_tokens = result.prompt_tokens;
+    stats_.generated_tokens = result.generated_tokens;
+
+    std::string text;
+    if (result.response && result.response[0]) {
+        text = result.response;
+    } else if (result.text && result.text[0]) {
+        text = result.text;
+    }
+    LOG_DEBUG("MetalRT-VLM", "  Stream complete: %.1f ms, %d tokens, %.1f tok/s",
+              wall_ms, result.generated_tokens, result.tps);
+    if (loader.vision_free_result)
+        loader.vision_free_result(result);
+    return text;
+}
+
+std::string MetalRTVlmEngine::generate(const std::string& prompt) {
+    if (!initialized_ || !handle_) return "";
+    // Prompt-only call into the runtime's vision_generate; returns "" when that symbol is missing.
+    auto& loader = MetalRTLoader::instance();
+    if (!loader.vision_generate) return "";
+    // Sampling options come from the config stored by init(); think is always disabled.
+    MetalRTLoader::MetalRTVisionOptions opts = {};
+    opts.max_tokens = config_.max_tokens;
+    opts.top_k = config_.top_k;
+    opts.temperature = config_.temperature;
+    opts.think = false;
+
+    MetalRTLoader::MetalRTVisionResult result;
+    {
+        std::lock_guard<std::mutex> gpu_lock(loader.gpu_mutex());  // held only for the runtime call
+        result = loader.vision_generate(handle_, prompt.c_str(), &opts);
+    }
+    // Copy the timing and token counters reported by the runtime into stats_.
+    stats_.vision_encode_ms = result.vision_encode_ms;
+    stats_.prefill_ms = result.prefill_ms;
+    stats_.decode_ms = result.decode_ms;
+    stats_.tps = result.tps;
+    stats_.prompt_tokens = result.prompt_tokens;
+    stats_.generated_tokens = result.generated_tokens;
+    // Prefer result.response; use result.text only when response is null or empty.
+    std::string text;
+    if (result.response && result.response[0]) {
+        text = result.response;
+    } else if (result.text && result.text[0]) {
+        text = result.text;
+    }
+
+    if (loader.vision_free_result)
+        loader.vision_free_result(result);  // text already holds its own copy
+
+    return text;
+}
+
+std::string MetalRTVlmEngine::model_name() const {
+    // Model name as reported by the runtime; "" when uninitialised, unsupported, or reported as null.
+    if (!(initialized_ && handle_)) return "";
+    auto& rt = MetalRTLoader::instance();
+    const char* raw = rt.vision_model_name ? rt.vision_model_name(handle_) : nullptr;
+    return raw ? raw : "";
+}
+
+std::string MetalRTVlmEngine::device_name() const {
+    // Device name as reported by the runtime; "" when uninitialised, unsupported, or reported as null.
+    if (!(initialized_ && handle_)) return "";
+    auto& rt = MetalRTLoader::instance();
+    const char* raw = rt.vision_device_name ? rt.vision_device_name(handle_) : nullptr;
+    return raw ? raw : "";
+}
+
+} // namespace rastack
\ No newline at end of file
diff --git a/src/engines/metalrt_vlm_engine.h b/src/engines/metalrt_vlm_engine.h
new file mode 100644
index 0000000..1dee0cb
--- /dev/null
+++ b/src/engines/metalrt_vlm_engine.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include "engines/metalrt_loader.h"
+#include "core/types.h"
+#include <string>
+#include <functional>
+#include <atomic>
+
+namespace rastack {
+
+struct MetalRTVlmConfig {  // Settings handed to MetalRTVlmEngine::init()
+    std::string model_dir;      // directory path passed to the runtime's model-load call
+    int max_tokens = 512;       // copied into the per-call vision options
+    int top_k = 40;             // copied into the per-call vision options
+    float temperature = 0.7f;   // copied into the per-call vision options
+};
+
+struct MetalRTVlmStats {  // Counters copied from the runtime's result after each analyze/generate call
+    double vision_encode_ms = 0;  // as reported by the runtime result
+    double prefill_ms = 0;        // as reported by the runtime result
+    double decode_ms = 0;         // as reported by the runtime result
+    double tps = 0;               // logged by the engine as tok/s
+    int prompt_tokens = 0;        // logged by the engine alongside the prefill time
+    int generated_tokens = 0;     // logged by the engine alongside the decode time
+};
+
+class MetalRTVlmEngine {  // VLM engine backed by the vision entry points exposed through MetalRTLoader
+public:
+    MetalRTVlmEngine() = default;
+    ~MetalRTVlmEngine() { shutdown(); }
+    // Non-copyable: the engine holds the runtime handle that shutdown() destroys.
+    MetalRTVlmEngine(const MetalRTVlmEngine&) = delete;
+    MetalRTVlmEngine& operator=(const MetalRTVlmEngine&) = delete;
+
+    bool init(const MetalRTVlmConfig& config);  // false when the dylib, vision symbols, instance or model load fails
+    void shutdown();                            // destroys the handle and clears stats
+    void reset();                               // forwards to the runtime's vision_reset when available
+
+    // Analyze an image with a text prompt (blocking)
+    std::string analyze_image(const std::string& image_path,
+                              const std::string& prompt);
+
+    // Analyze with streaming token callback
+    std::string analyze_image_stream(const std::string& image_path,
+                                     const std::string& prompt,
+                                     TokenCallback on_token);
+
+    // Text-only generation (follow-up without new image)
+    std::string generate(const std::string& prompt);
+    // Names reported by the runtime; empty string when the engine is uninitialised or the symbol is missing.
+    std::string model_name() const;
+    std::string device_name() const;
+
+    bool is_initialized() const { return initialized_; }
+    const MetalRTVlmStats& last_stats() const { return stats_; }  // overwritten by each analyze/generate call
+
+private:
+    void* handle_ = nullptr;      // opaque runtime instance from vision_create; null when not created
+    MetalRTVlmConfig config_;     // copy of the config passed to the last successful init()
+    MetalRTVlmStats stats_;
+    bool initialized_ = false;
+};
+
+} // namespace rastack
\ No newline at end of file