From 02faedd8a083506e7ca5c0a17611969050b725f2 Mon Sep 17 00:00:00 2001 From: AmanSwar Date: Mon, 16 Mar 2026 01:08:32 +0530 Subject: [PATCH 1/3] feat: add MetalRT VLM backend for vision-language models When running on MetalRT engine, VLM commands (vlm, camera, screen) now use MetalRT's native vision pipeline instead of requiring llama.cpp. Falls back to llama.cpp gracefully if MetalRT VLM model not available. --- CMakeLists.txt | 1 + src/api/rcli_api.cpp | 201 +++++++++++++++++----- src/cli/main.cpp | 15 +- src/engines/metalrt_vlm_engine.cpp | 256 +++++++++++++++++++++++++++++ src/engines/metalrt_vlm_engine.h | 64 ++++++++ 5 files changed, 483 insertions(+), 54 deletions(-) create mode 100644 src/engines/metalrt_vlm_engine.cpp create mode 100644 src/engines/metalrt_vlm_engine.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 2719674..c356701 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,6 +103,7 @@ add_library(rcli STATIC src/engines/metalrt_engine.cpp src/engines/metalrt_stt_engine.cpp src/engines/metalrt_tts_engine.cpp + src/engines/metalrt_vlm_engine.cpp src/engines/vlm_engine.cpp src/audio/audio_io.cpp src/audio/mic_permission.mm diff --git a/src/api/rcli_api.cpp b/src/api/rcli_api.cpp index f292c78..aaf2d1d 100644 --- a/src/api/rcli_api.cpp +++ b/src/api/rcli_api.cpp @@ -41,6 +41,7 @@ extern char** environ; #include "actions/action_registry.h" #include "actions/macos_actions.h" #include "engines/vlm_engine.h" +#include "engines/metalrt_vlm_engine.h" #include "models/vlm_model_registry.h" using namespace rastack; @@ -117,10 +118,12 @@ struct RCLIEngine { int ctx_main_prompt_tokens = 0; // VLM (Vision Language Model) subsystem - VlmEngine vlm_engine; + VlmEngine vlm_engine; // llama.cpp backend + MetalRTVlmEngine metalrt_vlm_engine; // MetalRT backend bool vlm_initialized = false; + bool vlm_use_metalrt = false; // which backend is active std::string last_vlm_response; - std::string vlm_backend_name; // "llama.cpp (Metal GPU)" or "MetalRT" + 
std::string vlm_backend_name; // "llama.cpp (Metal GPU)" or "MetalRT (Metal GPU)" std::string vlm_model_name; // e.g. "Qwen3 VL 2B" std::mutex mutex; @@ -1065,8 +1068,7 @@ static std::string handle_screen_intent(RCLIEngine* engine, const std::string& u if (!engine->vlm_initialized) { if (vlm_init_locked(engine) != 0) { return "I can see you're asking about your screen, but VLM isn't available. " - "It requires the llama.cpp engine and a VLM model. " - "Switch with: rcli engine llamacpp, then download a model: rcli models vlm"; + "Download a VLM model with: rcli models vlm"; } } @@ -1076,7 +1078,12 @@ static std::string handle_screen_intent(RCLIEngine* engine, const std::string& u vlm_prompt = "Describe what you see on this screen in detail."; } - std::string result = engine->vlm_engine.analyze_image(path, vlm_prompt, nullptr); + std::string result; + if (engine->vlm_use_metalrt) { + result = engine->metalrt_vlm_engine.analyze_image(path, vlm_prompt); + } else { + result = engine->vlm_engine.analyze_image(path, vlm_prompt, nullptr); + } if (result.empty()) { return "I captured your screen but the analysis failed. Please try again."; @@ -2983,8 +2990,47 @@ static bool safe_download(const std::string& url, const std::string& dest) { return WIFEXITED(status) && WEXITSTATUS(status) == 0; } +// Find a MetalRT VLM model directory (MLX-format weights). +// Searches HuggingFace cache for known models. 
+static std::string find_metalrt_vlm_model_dir() { + const char* home = getenv("HOME"); + if (!home) return ""; + + static const char* hf_repos[] = { + "models--mlx-community--Qwen3-VL-2B-Instruct-4bit", + "models--mlx-community--LFM2.5-VL-1.6B-MLX-6bit", + }; + + struct stat st; + std::string hf_base = std::string(home) + "/.cache/huggingface/hub"; + + for (const char* repo : hf_repos) { + std::string snapshots_dir = hf_base + "/" + repo + "/snapshots"; + if (stat(snapshots_dir.c_str(), &st) != 0) continue; + + FILE* p = popen(("ls -1t '" + snapshots_dir + "' 2>/dev/null | head -1").c_str(), "r"); + if (!p) continue; + char buf[256]; + if (!fgets(buf, sizeof(buf), p)) { pclose(p); continue; } + pclose(p); + + std::string snap(buf); + while (!snap.empty() && (snap.back() == '\n' || snap.back() == '\r')) + snap.pop_back(); + if (snap.empty()) continue; + + std::string model_dir = snapshots_dir + "/" + snap; + std::string safetensors = model_dir + "/model.safetensors"; + if (stat(safetensors.c_str(), &st) == 0) { + LOG_DEBUG("VLM", "Found MetalRT VLM model at %s", model_dir.c_str()); + return model_dir; + } + } + + return ""; +} + // Internal init (caller must hold engine->mutex) -// VLM is only available on the llama.cpp engine. MetalRT VLM support coming soon. static int vlm_init_locked(RCLIEngine* engine) { if (engine->vlm_initialized) return 0; @@ -2995,13 +3041,34 @@ static int vlm_init_locked(RCLIEngine* engine) { engine->models_dir = "./models"; } - // VLM requires the llama.cpp engine + // --- Try MetalRT VLM backend first (when on MetalRT engine) --- if (engine->initialized && engine->pipeline.using_metalrt()) { - LOG_ERROR("VLM", "VLM is currently available with the llama.cpp engine. 
Switch with: rcli engine llamacpp"); - return -1; + auto& loader = MetalRTLoader::instance(); + if (loader.is_loaded() && loader.has_vision()) { + std::string model_dir = find_metalrt_vlm_model_dir(); + if (!model_dir.empty()) { + MetalRTVlmConfig mrt_config; + mrt_config.model_dir = model_dir; + if (engine->metalrt_vlm_engine.init(mrt_config)) { + engine->vlm_initialized = true; + engine->vlm_use_metalrt = true; + engine->vlm_backend_name = "MetalRT (Metal GPU)"; + engine->vlm_model_name = engine->metalrt_vlm_engine.model_name(); + if (engine->vlm_model_name.empty()) + engine->vlm_model_name = "Qwen3 VL 2B"; + LOG_INFO("VLM", "VLM engine ready — %s via MetalRT (Metal GPU)", + engine->vlm_model_name.c_str()); + return 0; + } + LOG_WARN("VLM", "MetalRT VLM init failed, falling back to llama.cpp"); + } else { + LOG_WARN("VLM", "No MetalRT VLM model found in HF cache, falling back to llama.cpp"); + } + } + // Fall through to llama.cpp instead of hard-failing } - // Check if any VLM model is installed (on-demand, no auto-download) + // --- llama.cpp VLM backend --- auto vlm_models = rcli::all_vlm_models(); rcli::VlmModelDef model_def; bool found = false; @@ -3019,7 +3086,6 @@ static int vlm_init_locked(RCLIEngine* engine) { return -1; } - // Initialize VLM engine with the installed model VlmConfig config; config.model_path = engine->models_dir + "/" + model_def.model_filename; config.mmproj_path = engine->models_dir + "/" + model_def.mmproj_filename; @@ -3036,6 +3102,7 @@ static int vlm_init_locked(RCLIEngine* engine) { } engine->vlm_initialized = true; + engine->vlm_use_metalrt = false; engine->vlm_backend_name = "llama.cpp (Metal GPU)"; engine->vlm_model_name = model_def.name; LOG_INFO("VLM", "VLM engine ready — %s via llama.cpp (Metal GPU)", model_def.name.c_str()); @@ -3056,7 +3123,7 @@ const char* rcli_vlm_analyze(RCLIHandle handle, const char* image_path, const ch if (!engine->vlm_initialized) { if (vlm_init_locked(engine) != 0) { - engine->last_vlm_response = 
"VLM not available. Requires llama.cpp engine (rcli engine llamacpp) and a VLM model (rcli models vlm)."; + engine->last_vlm_response = "VLM not available. Download a VLM model with: rcli models vlm"; return engine->last_vlm_response.c_str(); } } @@ -3065,16 +3132,17 @@ const char* rcli_vlm_analyze(RCLIHandle handle, const char* image_path, const ch ? std::string(prompt) : "Describe this image in detail."; - { - std::string result = engine->vlm_engine.analyze_image( + std::string result; + if (engine->vlm_use_metalrt) { + result = engine->metalrt_vlm_engine.analyze_image( + std::string(image_path), text_prompt); + } else { + result = engine->vlm_engine.analyze_image( std::string(image_path), text_prompt, nullptr); - - if (result.empty()) { - engine->last_vlm_response = "Error: Failed to analyze image."; - } else { - engine->last_vlm_response = result; - } } + + engine->last_vlm_response = result.empty() + ? "Error: Failed to analyze image." : result; return engine->last_vlm_response.c_str(); } @@ -3101,12 +3169,21 @@ int rcli_vlm_get_stats(RCLIHandle handle, RCLIVlmStats* out_stats) { auto* engine = static_cast(handle); if (!engine->vlm_initialized) return -1; - auto& s = engine->vlm_engine.last_stats(); - out_stats->gen_tok_per_sec = s.gen_tps(); - out_stats->generated_tokens = static_cast(s.generated_tokens); - out_stats->total_time_sec = (s.image_encode_us + s.generation_us) / 1e6; - out_stats->image_encode_ms = s.image_encode_us / 1000.0; - out_stats->first_token_ms = s.first_token_us / 1000.0; + if (engine->vlm_use_metalrt) { + auto& s = engine->metalrt_vlm_engine.last_stats(); + out_stats->gen_tok_per_sec = s.tps; + out_stats->generated_tokens = s.generated_tokens; + out_stats->total_time_sec = (s.vision_encode_ms + s.prefill_ms + s.decode_ms) / 1000.0; + out_stats->image_encode_ms = s.vision_encode_ms; + out_stats->first_token_ms = s.prefill_ms; + } else { + auto& s = engine->vlm_engine.last_stats(); + out_stats->gen_tok_per_sec = s.gen_tps(); + 
out_stats->generated_tokens = static_cast(s.generated_tokens); + out_stats->total_time_sec = (s.image_encode_us + s.generation_us) / 1e6; + out_stats->image_encode_ms = s.image_encode_us / 1000.0; + out_stats->first_token_ms = s.first_token_us / 1000.0; + } return 0; } @@ -3128,11 +3205,16 @@ int rcli_vlm_exit(RCLIHandle handle) { auto* engine = static_cast(handle); std::lock_guard lock(engine->mutex); - if (engine->vlm_engine.is_initialized()) { - engine->vlm_engine.shutdown(); + if (engine->vlm_use_metalrt) { + if (engine->metalrt_vlm_engine.is_initialized()) + engine->metalrt_vlm_engine.shutdown(); + } else { + if (engine->vlm_engine.is_initialized()) + engine->vlm_engine.shutdown(); } engine->vlm_initialized = false; + engine->vlm_use_metalrt = false; engine->vlm_backend_name.clear(); engine->vlm_model_name.clear(); LOG_INFO("VLM", "VLM unloaded"); @@ -3157,29 +3239,58 @@ int rcli_vlm_analyze_stream(RCLIHandle handle, const char* image_path, std::string text_prompt = (prompt && prompt[0]) ? 
std::string(prompt) : "Describe this image in detail."; - // llama.cpp VLM streaming path - rastack::TokenCallback token_cb = nullptr; - if (callback) { - token_cb = [callback, user_data](const rastack::TokenOutput& tok) { - if (!tok.text.empty()) { - callback("token", tok.text.c_str(), user_data); - } - }; - } + std::string result; + + if (engine->vlm_use_metalrt) { + // MetalRT VLM streaming path + rastack::TokenCallback token_cb = nullptr; + if (callback) { + token_cb = [callback, user_data](const rastack::TokenOutput& tok) { + if (!tok.text.empty()) { + callback("token", tok.text.c_str(), user_data); + } + }; + } - std::string result = engine->vlm_engine.analyze_image( - std::string(image_path), text_prompt, token_cb); + result = engine->metalrt_vlm_engine.analyze_image_stream( + std::string(image_path), text_prompt, token_cb); + + if (callback) { + auto& s = engine->metalrt_vlm_engine.last_stats(); + char stats_buf[256]; + snprintf(stats_buf, sizeof(stats_buf), + "{\"tps\":%.1f,\"tokens\":%d,\"vision_encode_ms\":%.1f}", + s.tps, s.generated_tokens, s.vision_encode_ms); + callback("stats", stats_buf, user_data); + } + } else { + // llama.cpp VLM streaming path + rastack::TokenCallback token_cb = nullptr; + if (callback) { + token_cb = [callback, user_data](const rastack::TokenOutput& tok) { + if (!tok.text.empty()) { + callback("token", tok.text.c_str(), user_data); + } + }; + } + + result = engine->vlm_engine.analyze_image( + std::string(image_path), text_prompt, token_cb); + + if (callback) { + auto& s = engine->vlm_engine.last_stats(); + char stats_buf[256]; + snprintf(stats_buf, sizeof(stats_buf), + "{\"tps\":%.1f,\"tokens\":%lld,\"vision_encode_ms\":%.1f}", + s.gen_tps(), s.generated_tokens, s.image_encode_us / 1000.0); + callback("stats", stats_buf, user_data); + } + } engine->last_vlm_response = result.empty() ? "Error: Failed to analyze image." 
: result; if (callback) { callback("response", engine->last_vlm_response.c_str(), user_data); - auto& s = engine->vlm_engine.last_stats(); - char stats_buf[256]; - snprintf(stats_buf, sizeof(stats_buf), - "{\"tps\":%.1f,\"tokens\":%lld,\"vision_encode_ms\":%.1f}", - s.gen_tps(), s.generated_tokens, s.image_encode_us / 1000.0); - callback("stats", stats_buf, user_data); } return engine->last_vlm_response.find("Error:") == 0 ? -1 : 0; diff --git a/src/cli/main.cpp b/src/cli/main.cpp index 58cd4e1..a5c773d 100644 --- a/src/cli/main.cpp +++ b/src/cli/main.cpp @@ -486,9 +486,8 @@ static int cmd_vlm(const Args& args) { fprintf(stderr, "%sInitializing VLM...%s\n", color::dim, color::reset); if (rcli_vlm_init(g_engine) != 0) { fprintf(stderr, "\n%s%s VLM not available.%s\n\n", color::bold, color::red, color::reset); - fprintf(stderr, " VLM requires the llama.cpp engine and a VLM model.\n"); - fprintf(stderr, " Switch engine: %srcli engine llamacpp%s\n", color::bold, color::reset); - fprintf(stderr, " Download model: %srcli models vlm%s\n\n", color::bold, color::reset); + fprintf(stderr, " No VLM model found. Download one:\n"); + fprintf(stderr, " %srcli models vlm%s\n\n", color::bold, color::reset); rcli_destroy(g_engine); return 1; } @@ -548,9 +547,8 @@ static int cmd_camera(const Args& args) { if (rcli_vlm_init(g_engine) != 0) { fprintf(stderr, "\n%s%s VLM not available.%s\n\n", color::bold, color::red, color::reset); - fprintf(stderr, " VLM requires the llama.cpp engine and a VLM model.\n"); - fprintf(stderr, " Switch engine: %srcli engine llamacpp%s\n", color::bold, color::reset); - fprintf(stderr, " Download model: %srcli models vlm%s\n\n", color::bold, color::reset); + fprintf(stderr, " No VLM model found. 
Download one:\n"); + fprintf(stderr, " %srcli models vlm%s\n\n", color::bold, color::reset); rcli_destroy(g_engine); return 1; } @@ -618,9 +616,8 @@ static int cmd_screen(const Args& args) { if (rcli_vlm_init(g_engine) != 0) { fprintf(stderr, "\n%s%s VLM not available.%s\n\n", color::bold, color::red, color::reset); - fprintf(stderr, " VLM requires the llama.cpp engine and a VLM model.\n"); - fprintf(stderr, " Switch engine: %srcli engine llamacpp%s\n", color::bold, color::reset); - fprintf(stderr, " Download model: %srcli models vlm%s\n\n", color::bold, color::reset); + fprintf(stderr, " No VLM model found. Download one:\n"); + fprintf(stderr, " %srcli models vlm%s\n\n", color::bold, color::reset); rcli_destroy(g_engine); return 1; } diff --git a/src/engines/metalrt_vlm_engine.cpp b/src/engines/metalrt_vlm_engine.cpp new file mode 100644 index 0000000..b1e30ec --- /dev/null +++ b/src/engines/metalrt_vlm_engine.cpp @@ -0,0 +1,256 @@ +#include "engines/metalrt_vlm_engine.h" +#include "core/log.h" +#include +#include + +namespace rastack { + +bool MetalRTVlmEngine::init(const MetalRTVlmConfig& config) { + auto& loader = MetalRTLoader::instance(); + if (!loader.is_loaded() && !loader.load()) { + LOG_ERROR("MetalRT-VLM", "dylib not loaded"); + return false; + } + + if (!loader.has_vision()) { + LOG_WARN("MetalRT-VLM", "Vision symbols not available in dylib — " + "create=%p analyze=%p", + (void*)loader.vision_create, (void*)loader.vision_analyze); + return false; + } + + LOG_DEBUG("MetalRT-VLM", "Creating VLM instance via Metal GPU..."); + auto t_start = std::chrono::high_resolution_clock::now(); + + handle_ = loader.vision_create(); + if (!handle_) { + LOG_ERROR("MetalRT-VLM", "Failed to create VLM instance"); + return false; + } + + LOG_DEBUG("MetalRT-VLM", "Loading model from %s ...", config.model_dir.c_str()); + if (!loader.vision_load(handle_, config.model_dir.c_str())) { + LOG_ERROR("MetalRT-VLM", "Failed to load model from %s", config.model_dir.c_str()); + 
loader.vision_destroy(handle_); + handle_ = nullptr; + return false; + } + + config_ = config; + + auto t_end = std::chrono::high_resolution_clock::now(); + double init_ms = std::chrono::duration(t_end - t_start).count(); + + initialized_ = true; + + const char* mname = loader.vision_model_name ? loader.vision_model_name(handle_) : ""; + const char* dname = loader.vision_device_name ? loader.vision_device_name(handle_) : ""; + + LOG_DEBUG("MetalRT-VLM", "=== MetalRT VLM GPU VERIFICATION ==="); + LOG_DEBUG("MetalRT-VLM", " Engine: VLM via libmetalrt.dylib (Metal GPU)"); + LOG_DEBUG("MetalRT-VLM", " Model dir: %s", config.model_dir.c_str()); + LOG_DEBUG("MetalRT-VLM", " Model: %s", mname); + LOG_DEBUG("MetalRT-VLM", " Device: %s", dname); + LOG_DEBUG("MetalRT-VLM", " Init time: %.1f ms", init_ms); + return true; +} + +void MetalRTVlmEngine::shutdown() { + if (handle_) { + auto& loader = MetalRTLoader::instance(); + if (loader.vision_destroy) { + loader.vision_destroy(handle_); + } + handle_ = nullptr; + } + initialized_ = false; + stats_ = {}; +} + +void MetalRTVlmEngine::reset() { + if (!initialized_ || !handle_) return; + auto& loader = MetalRTLoader::instance(); + if (loader.vision_reset) { + std::lock_guard gpu_lock(loader.gpu_mutex()); + loader.vision_reset(handle_); + } +} + +std::string MetalRTVlmEngine::analyze_image(const std::string& image_path, + const std::string& prompt) { + if (!initialized_ || !handle_) return ""; + + auto& loader = MetalRTLoader::instance(); + + LOG_DEBUG("MetalRT-VLM", "analyze_image() → Metal GPU | image=%s prompt=%zu chars", + image_path.c_str(), prompt.size()); + + MetalRTLoader::MetalRTVisionOptions opts = {}; + opts.max_tokens = config_.max_tokens; + opts.top_k = config_.top_k; + opts.temperature = config_.temperature; + opts.think = false; + + auto wall_start = std::chrono::high_resolution_clock::now(); + MetalRTLoader::MetalRTVisionResult result; + { + std::lock_guard gpu_lock(loader.gpu_mutex()); + result = 
loader.vision_analyze(handle_, image_path.c_str(), prompt.c_str(), &opts); + } + auto wall_end = std::chrono::high_resolution_clock::now(); + double wall_ms = std::chrono::duration(wall_end - wall_start).count(); + + // Store stats + stats_.vision_encode_ms = result.vision_encode_ms; + stats_.prefill_ms = result.prefill_ms; + stats_.decode_ms = result.decode_ms; + stats_.tps = result.tps; + stats_.prompt_tokens = result.prompt_tokens; + stats_.generated_tokens = result.generated_tokens; + + std::string text; + if (result.response && result.response[0]) { + text = result.response; + } else if (result.text && result.text[0]) { + text = result.text; + } + + LOG_DEBUG("MetalRT-VLM", "=== VLM ANALYSIS [Metal GPU] ==="); + LOG_DEBUG("MetalRT-VLM", " Vision encode: %.1f ms", result.vision_encode_ms); + LOG_DEBUG("MetalRT-VLM", " Prefill: %.1f ms (%d tokens)", result.prefill_ms, result.prompt_tokens); + LOG_DEBUG("MetalRT-VLM", " Decode: %.1f ms (%d tokens)", result.decode_ms, result.generated_tokens); + LOG_DEBUG("MetalRT-VLM", " TPS: %.1f tok/s", result.tps); + LOG_DEBUG("MetalRT-VLM", " Wall time: %.1f ms", wall_ms); + + if (loader.vision_free_result) + loader.vision_free_result(result); + + return text; +} + +std::string MetalRTVlmEngine::analyze_image_stream(const std::string& image_path, + const std::string& prompt, + TokenCallback on_token) { + if (!initialized_ || !handle_) return ""; + + auto& loader = MetalRTLoader::instance(); + if (!loader.vision_analyze_stream) { + // Fall back to non-streaming + return analyze_image(image_path, prompt); + } + + LOG_DEBUG("MetalRT-VLM", "analyze_image_stream() → Metal GPU | image=%s", image_path.c_str()); + + MetalRTLoader::MetalRTVisionOptions opts = {}; + opts.max_tokens = config_.max_tokens; + opts.top_k = config_.top_k; + opts.temperature = config_.temperature; + opts.think = false; + + // Bridge TokenCallback to MetalRTStreamCb + struct StreamCtx { + TokenCallback cb; + }; + StreamCtx ctx{on_token}; + + MetalRTStreamCb 
stream_cb = nullptr; + if (on_token) { + stream_cb = [](const char* piece, void* user_data) -> bool { + auto* sctx = static_cast(user_data); + if (sctx->cb) { + TokenOutput tok; + tok.text = piece; + sctx->cb(tok); + } + return true; // continue generation + }; + } + + auto wall_start = std::chrono::high_resolution_clock::now(); + MetalRTLoader::MetalRTVisionResult result; + { + std::lock_guard gpu_lock(loader.gpu_mutex()); + result = loader.vision_analyze_stream(handle_, image_path.c_str(), prompt.c_str(), + stream_cb, &ctx, &opts); + } + auto wall_end = std::chrono::high_resolution_clock::now(); + double wall_ms = std::chrono::duration(wall_end - wall_start).count(); + + stats_.vision_encode_ms = result.vision_encode_ms; + stats_.prefill_ms = result.prefill_ms; + stats_.decode_ms = result.decode_ms; + stats_.tps = result.tps; + stats_.prompt_tokens = result.prompt_tokens; + stats_.generated_tokens = result.generated_tokens; + + std::string text; + if (result.response && result.response[0]) { + text = result.response; + } else if (result.text && result.text[0]) { + text = result.text; + } + + LOG_DEBUG("MetalRT-VLM", " Stream complete: %.1f ms, %d tokens, %.1f tok/s", + wall_ms, result.generated_tokens, result.tps); + + if (loader.vision_free_result) + loader.vision_free_result(result); + + return text; +} + +std::string MetalRTVlmEngine::generate(const std::string& prompt) { + if (!initialized_ || !handle_) return ""; + + auto& loader = MetalRTLoader::instance(); + if (!loader.vision_generate) return ""; + + MetalRTLoader::MetalRTVisionOptions opts = {}; + opts.max_tokens = config_.max_tokens; + opts.top_k = config_.top_k; + opts.temperature = config_.temperature; + opts.think = false; + + MetalRTLoader::MetalRTVisionResult result; + { + std::lock_guard gpu_lock(loader.gpu_mutex()); + result = loader.vision_generate(handle_, prompt.c_str(), &opts); + } + + stats_.vision_encode_ms = result.vision_encode_ms; + stats_.prefill_ms = result.prefill_ms; + 
stats_.decode_ms = result.decode_ms; + stats_.tps = result.tps; + stats_.prompt_tokens = result.prompt_tokens; + stats_.generated_tokens = result.generated_tokens; + + std::string text; + if (result.response && result.response[0]) { + text = result.response; + } else if (result.text && result.text[0]) { + text = result.text; + } + + if (loader.vision_free_result) + loader.vision_free_result(result); + + return text; +} + +std::string MetalRTVlmEngine::model_name() const { + if (!initialized_ || !handle_) return ""; + auto& loader = MetalRTLoader::instance(); + if (!loader.vision_model_name) return ""; + const char* name = loader.vision_model_name(handle_); + return name ? name : ""; +} + +std::string MetalRTVlmEngine::device_name() const { + if (!initialized_ || !handle_) return ""; + auto& loader = MetalRTLoader::instance(); + if (!loader.vision_device_name) return ""; + const char* name = loader.vision_device_name(handle_); + return name ? name : ""; +} + +} // namespace rastack \ No newline at end of file diff --git a/src/engines/metalrt_vlm_engine.h b/src/engines/metalrt_vlm_engine.h new file mode 100644 index 0000000..1dee0cb --- /dev/null +++ b/src/engines/metalrt_vlm_engine.h @@ -0,0 +1,64 @@ +#pragma once + +#include "engines/metalrt_loader.h" +#include "core/types.h" +#include +#include +#include + +namespace rastack { + +struct MetalRTVlmConfig { + std::string model_dir; + int max_tokens = 512; + int top_k = 40; + float temperature = 0.7f; +}; + +struct MetalRTVlmStats { + double vision_encode_ms = 0; + double prefill_ms = 0; + double decode_ms = 0; + double tps = 0; + int prompt_tokens = 0; + int generated_tokens = 0; +}; + +class MetalRTVlmEngine { +public: + MetalRTVlmEngine() = default; + ~MetalRTVlmEngine() { shutdown(); } + + MetalRTVlmEngine(const MetalRTVlmEngine&) = delete; + MetalRTVlmEngine& operator=(const MetalRTVlmEngine&) = delete; + + bool init(const MetalRTVlmConfig& config); + void shutdown(); + void reset(); + + // Analyze an image with 
a text prompt (blocking) + std::string analyze_image(const std::string& image_path, + const std::string& prompt); + + // Analyze with streaming token callback + std::string analyze_image_stream(const std::string& image_path, + const std::string& prompt, + TokenCallback on_token); + + // Text-only generation (follow-up without new image) + std::string generate(const std::string& prompt); + + std::string model_name() const; + std::string device_name() const; + + bool is_initialized() const { return initialized_; } + const MetalRTVlmStats& last_stats() const { return stats_; } + +private: + void* handle_ = nullptr; + MetalRTVlmConfig config_; + MetalRTVlmStats stats_; + bool initialized_ = false; +}; + +} // namespace rastack \ No newline at end of file From a9d3f5f7de40778aeb14e678c138bc73338e75f1 Mon Sep 17 00:00:00 2001 From: AmanSwar Date: Mon, 16 Mar 2026 01:08:46 +0530 Subject: [PATCH 2/3] ui: make screen capture overlay bolder and easier to interact with Thicker border (8px), larger corner handles (28px), wider edge grab zones (20px), added edge midpoint handles, double-layer outer glow, and heavier label font for better visibility and usability. 
--- src/audio/rcli_overlay.m | 72 ++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 14 deletions(-) diff --git a/src/audio/rcli_overlay.m b/src/audio/rcli_overlay.m index 274a3fc..885cf19 100644 --- a/src/audio/rcli_overlay.m +++ b/src/audio/rcli_overlay.m @@ -9,10 +9,10 @@ #import -static const CGFloat kBorder = 6.0; -static const CGFloat kRadius = 12.0; -static const CGFloat kHandle = 18.0; // corner handle size -static const CGFloat kEdgeGrab = 14.0; // invisible edge grab zone +static const CGFloat kBorder = 8.0; +static const CGFloat kRadius = 14.0; +static const CGFloat kHandle = 28.0; // corner handle size +static const CGFloat kEdgeGrab = 20.0; // invisible edge grab zone // ── Custom view: bold border + corner handles + label pill ───────────── @interface OverlayView : NSView @@ -25,23 +25,37 @@ - (void)drawRect:(NSRect)dirtyRect { NSRectFill(dirtyRect); NSRect inner = NSInsetRect(self.bounds, kBorder, kBorder); - NSColor *green = [NSColor colorWithRed:0.15 green:0.9 blue:0.45 alpha:0.92]; + NSColor *green = [NSColor colorWithRed:0.1 green:0.85 blue:0.4 alpha:1.0]; + + // Outer glow — wide, soft, two layers for depth + NSBezierPath *glow2 = [NSBezierPath bezierPathWithRoundedRect:inner + xRadius:kRadius yRadius:kRadius]; + [glow2 setLineWidth:kBorder + 16]; + [[green colorWithAlphaComponent:0.08] set]; + [glow2 stroke]; - // Outer glow NSBezierPath *glow = [NSBezierPath bezierPathWithRoundedRect:inner xRadius:kRadius yRadius:kRadius]; - [glow setLineWidth:kBorder + 6]; - [[green colorWithAlphaComponent:0.12] set]; + [glow setLineWidth:kBorder + 8]; + [[green colorWithAlphaComponent:0.18] set]; [glow stroke]; - // Main border — solid, thick, rounded + // Main border — bold, solid, rounded NSBezierPath *border = [NSBezierPath bezierPathWithRoundedRect:inner xRadius:kRadius yRadius:kRadius]; [border setLineWidth:kBorder]; [green set]; [border stroke]; - // Corner handles — filled rounded squares with white dot + // Inner highlight — 
thin white line for depth + NSRect innerHL = NSInsetRect(inner, 1.5, 1.5); + NSBezierPath *highlight = [NSBezierPath bezierPathWithRoundedRect:innerHL + xRadius:kRadius - 1.5 yRadius:kRadius - 1.5]; + [highlight setLineWidth:1.0]; + [[NSColor colorWithWhite:1.0 alpha:0.15] set]; + [highlight stroke]; + + // Corner handles — large rounded squares with shadow + white center dot CGFloat hs = kHandle; CGFloat off = kBorder / 2; NSRect corners[4] = { @@ -51,20 +65,50 @@ - (void)drawRect:(NSRect)dirtyRect { NSMakeRect(NSMaxX(inner) + off - hs, NSMaxY(inner) + off - hs, hs, hs), }; for (int i = 0; i < 4; i++) { + // Drop shadow + NSRect shadowRect = NSOffsetRect(corners[i], 0, -1); + NSBezierPath *shadow = [NSBezierPath bezierPathWithRoundedRect:shadowRect + xRadius:6 yRadius:6]; + [[NSColor colorWithWhite:0.0 alpha:0.25] set]; + [shadow fill]; + + // Handle body NSBezierPath *h = [NSBezierPath bezierPathWithRoundedRect:corners[i] - xRadius:4 yRadius:4]; + xRadius:6 yRadius:6]; [green set]; [h fill]; + + // White border on handle + [h setLineWidth:1.5]; + [[NSColor colorWithWhite:1.0 alpha:0.4] set]; + [h stroke]; + // White center dot - NSRect dot = NSInsetRect(corners[i], 5, 5); - [[NSColor colorWithWhite:1.0 alpha:0.85] set]; + NSRect dot = NSInsetRect(corners[i], hs * 0.3, hs * 0.3); + [[NSColor colorWithWhite:1.0 alpha:0.9] set]; [[NSBezierPath bezierPathWithOvalInRect:dot] fill]; } + // Edge midpoint handles — small bars to hint at edge dragging + CGFloat eh = 5.0; // bar thickness + CGFloat el = 32.0; // bar length + NSRect edges[4] = { + NSMakeRect(NSMidX(inner) - el/2, NSMaxY(inner) - eh/2, el, eh), // top + NSMakeRect(NSMidX(inner) - el/2, NSMinY(inner) - eh/2, el, eh), // bottom + NSMakeRect(NSMinX(inner) - eh/2, NSMidY(inner) - el/2, eh, el), // left + NSMakeRect(NSMaxX(inner) - eh/2, NSMidY(inner) - el/2, eh, el), // right + }; + for (int i = 0; i < 4; i++) { + NSBezierPath *ep = [NSBezierPath bezierPathWithRoundedRect:edges[i] + xRadius:2.5 yRadius:2.5]; +
[[green colorWithAlphaComponent:0.7] set]; + [ep fill]; + } + // Label pill — centered at top NSString *label = @" RCLI Visual Mode "; NSDictionary *attrs = @{ - NSFontAttributeName: [NSFont systemFontOfSize:11 weight:NSFontWeightBold], + NSFontAttributeName: [NSFont systemFontOfSize:12 weight:NSFontWeightHeavy], NSForegroundColorAttributeName: [NSColor blackColor], }; NSSize sz = [label sizeWithAttributes:attrs]; From 66a38b7fd20ccf9683bac7ef01c7ea24359c1f05 Mon Sep 17 00:00:00 2001 From: Shubham Malhotra Date: Sun, 15 Mar 2026 19:29:56 -0700 Subject: [PATCH 3/3] improving the v mode --- CMakeLists.txt | 29 ++- Formula/rcli.rb | 1 + install.sh | 1 + scripts/package.sh | 7 + src/audio/camera_preview.h | 36 ++++ src/audio/camera_preview.mm | 124 ++++++++++++ src/audio/rcli_camera_preview.m | 337 ++++++++++++++++++++++++++++++++ src/cli/tui_app.h | 188 +++++++++++++++++- 8 files changed, 717 insertions(+), 6 deletions(-) create mode 100644 src/audio/camera_preview.h create mode 100644 src/audio/camera_preview.mm create mode 100644 src/audio/rcli_camera_preview.m diff --git a/CMakeLists.txt b/CMakeLists.txt index c356701..2b0c50d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,6 +108,7 @@ add_library(rcli STATIC src/audio/audio_io.cpp src/audio/mic_permission.mm src/audio/camera_capture.mm + src/audio/camera_preview.mm src/audio/screen_capture.mm src/pipeline/orchestrator.cpp src/pipeline/sentence_detector.cpp @@ -141,7 +142,7 @@ add_library(rcli STATIC src/api/rcli_api.cpp ) -set_source_files_properties(src/audio/mic_permission.mm src/audio/camera_capture.mm src/audio/screen_capture.mm +set_source_files_properties(src/audio/mic_permission.mm src/audio/camera_capture.mm src/audio/camera_preview.mm src/audio/screen_capture.mm PROPERTIES LANGUAGE CXX) target_include_directories(rcli PUBLIC @@ -222,6 +223,32 @@ set_target_properties(rcli_overlay PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" ) +# 
============================================================================= +# rcli_camera_preview — standalone Cocoa helper for live camera preview window +# ============================================================================= +add_executable(rcli_camera_preview + src/audio/rcli_camera_preview.m +) + +set_source_files_properties(src/audio/rcli_camera_preview.m PROPERTIES LANGUAGE CXX) + +target_compile_options(rcli_camera_preview PRIVATE -x objective-c++) + +target_link_libraries(rcli_camera_preview PRIVATE + "-framework AppKit" + "-framework AVFoundation" + "-framework CoreMedia" + "-framework CoreVideo" + "-framework CoreImage" + "-framework ImageIO" + "-framework UniformTypeIdentifiers" +) + +set_target_properties(rcli_camera_preview PROPERTIES + OUTPUT_NAME "rcli_camera_preview" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" +) + # ============================================================================= # rcli_test — test executable # ============================================================================= diff --git a/Formula/rcli.rb b/Formula/rcli.rb index 7937734..aad98ef 100644 --- a/Formula/rcli.rb +++ b/Formula/rcli.rb @@ -12,6 +12,7 @@ class Rcli < Formula def install bin.install "bin/rcli" bin.install "bin/rcli_overlay" if File.exist? "bin/rcli_overlay" + bin.install "bin/rcli_camera_preview" if File.exist? 
"bin/rcli_camera_preview" lib.install Dir["lib/*.dylib"] end diff --git a/install.sh b/install.sh index bd5880b..12ca047 100755 --- a/install.sh +++ b/install.sh @@ -70,6 +70,7 @@ else mkdir -p "$CELLAR/bin" "$CELLAR/lib" 2>/dev/null || sudo mkdir -p "$CELLAR/bin" "$CELLAR/lib" cp "$WORKDIR"/rcli-*/bin/rcli "$CELLAR/bin/" 2>/dev/null || sudo cp "$WORKDIR"/rcli-*/bin/rcli "$CELLAR/bin/" cp "$WORKDIR"/rcli-*/bin/rcli_overlay "$CELLAR/bin/" 2>/dev/null || sudo cp "$WORKDIR"/rcli-*/bin/rcli_overlay "$CELLAR/bin/" 2>/dev/null || true + cp "$WORKDIR"/rcli-*/bin/rcli_camera_preview "$CELLAR/bin/" 2>/dev/null || sudo cp "$WORKDIR"/rcli-*/bin/rcli_camera_preview "$CELLAR/bin/" 2>/dev/null || true cp "$WORKDIR"/rcli-*/lib/*.dylib "$CELLAR/lib/" 2>/dev/null || sudo cp "$WORKDIR"/rcli-*/lib/*.dylib "$CELLAR/lib/" brew link --overwrite "$FORMULA" 2>/dev/null || sudo brew link --overwrite "$FORMULA" diff --git a/scripts/package.sh b/scripts/package.sh index e238737..1f29c9f 100755 --- a/scripts/package.sh +++ b/scripts/package.sh @@ -27,6 +27,10 @@ if [ -f "$BUILD_DIR/rcli_overlay" ]; then cp "$BUILD_DIR/rcli_overlay" "$DIST_DIR/bin/rcli_overlay" echo " + bin/rcli_overlay" fi +if [ -f "$BUILD_DIR/rcli_camera_preview" ]; then + cp "$BUILD_DIR/rcli_camera_preview" "$DIST_DIR/bin/rcli_camera_preview" + echo " + bin/rcli_camera_preview" +fi # --- Collect dylibs --- DYLIBS=( @@ -155,6 +159,9 @@ codesign --force --sign - "$BINARY" if [ -f "$DIST_DIR/bin/rcli_overlay" ]; then codesign --force --sign - "$DIST_DIR/bin/rcli_overlay" fi +if [ -f "$DIST_DIR/bin/rcli_camera_preview" ]; then + codesign --force --sign - "$DIST_DIR/bin/rcli_camera_preview" +fi for lib in "$DIST_DIR/lib/"*.dylib; do codesign --force --sign - "$lib" done diff --git a/src/audio/camera_preview.h b/src/audio/camera_preview.h new file mode 100644 index 0000000..fb822de --- /dev/null +++ b/src/audio/camera_preview.h @@ -0,0 +1,36 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +// Launch the camera 
preview window (floating PIP with live feed).
+// Returns 0 on success, -1 on failure.
+int camera_preview_start(void);
+
+// Stop the camera preview window and clean up.
+void camera_preview_stop(void);
+
+// Returns 1 if the camera preview is currently running.
+int camera_preview_active(void);
+
+// Freeze the live feed and capture the current frame to a JPEG file.
+// Returns 0 on success, -1 on failure.
+int camera_preview_capture(const char* output_path);
+
+// Capture the current frame to a JPEG file WITHOUT freezing the live feed.
+// The camera keeps streaming. Ideal for auto-analysis loops.
+// Returns 0 on success, -1 on failure.
+int camera_preview_snap(const char* output_path);
+
+// Freeze the live feed (without capturing). Shows "FROZEN" badge.
+// Returns 0 on success, -1 on failure.
+int camera_preview_freeze(void);
+
+// Resume the live camera feed after a freeze.
+// Returns 0 on success, -1 on failure.
+int camera_preview_unfreeze(void);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/audio/camera_preview.mm b/src/audio/camera_preview.mm
new file mode 100644
index 0000000..ff7cb95
--- /dev/null
+++ b/src/audio/camera_preview.mm
@@ -0,0 +1,171 @@
+#include "camera_preview.h"
+
+// NOTE(review): the header names were not legible in the patch under review;
+// this is the set the code below requires. Confirm against the original file.
+#include <atomic>
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+#include <mutex>
+#include <string>
+#include <fcntl.h>
+#include <mach-o/dyld.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+// State for the single rcli_camera_preview helper process.
+static pid_t g_cam_pid = 0;
+static FILE *g_cam_stdin = nullptr;   // write end: commands to the helper
+static FILE *g_cam_stdout = nullptr;  // read end: one-line replies from the helper
+static std::atomic<bool> g_cam_active{false};
+// Serializes command/reply exchanges so callers on different threads cannot
+// interleave their requests and read each other's replies.
+static std::mutex g_cam_io_mutex;
+
+// Returns the path of the helper binary: "rcli_camera_preview" next to the
+// running executable when that file is executable, otherwise the bare name
+// (which camera_preview_start() resolves through PATH).
+static std::string find_camera_preview_binary() {
+    char path[1024];
+    uint32_t size = sizeof(path);
+    if (_NSGetExecutablePath(path, &size) == 0) {
+        std::string dir(path);
+        auto slash = dir.rfind('/');
+        if (slash != std::string::npos) {
+            std::string candidate = dir.substr(0, slash + 1) + "rcli_camera_preview";
+            if (access(candidate.c_str(), X_OK) == 0) return candidate;
+        }
+    }
+    return "rcli_camera_preview";
+}
+
+// Sends one command line to the helper and returns its one-line reply without
+// the trailing newline. Returns "" when the pipes are not open, the write
+// fails, or the helper closes its stdout without replying.
+static std::string cam_cmd(const char* cmd) {
+    std::lock_guard<std::mutex> lock(g_cam_io_mutex);
+    if (!cmd || !g_cam_stdin || !g_cam_stdout) return "";
+    if (fprintf(g_cam_stdin, "%s\n", cmd) < 0 || fflush(g_cam_stdin) != 0) return "";
+    char buf[256] = {0};
+    if (fgets(buf, sizeof(buf), g_cam_stdout)) {
+        size_t len = strlen(buf);
+        if (len > 0 && buf[len-1] == '\n') buf[len-1] = '\0';
+        return std::string(buf);
+    }
+    return "";
+}
+
+// Spawns the helper with its stdin/stdout connected to pipes and waits for its
+// first output line. Returns 0 if the helper is (already) running, -1 on failure.
+int camera_preview_start(void) {
+    if (g_cam_pid > 0) return 0;
+
+    std::string binary = find_camera_preview_binary();
+
+    int pipe_in[2], pipe_out[2];
+    if (pipe(pipe_in) != 0) return -1;
+    if (pipe(pipe_out) != 0) {
+        // Do not leak the first pipe when the second cannot be created.
+        close(pipe_in[0]);
+        close(pipe_in[1]);
+        return -1;
+    }
+
+    pid_t pid = fork();
+    if (pid < 0) {
+        // fork failed: there is no child, so release all four descriptors.
+        close(pipe_in[0]);
+        close(pipe_in[1]);
+        close(pipe_out[0]);
+        close(pipe_out[1]);
+        return -1;
+    }
+    if (pid == 0) {
+        close(pipe_in[1]);
+        close(pipe_out[0]);
+        dup2(pipe_in[0], STDIN_FILENO);
+        dup2(pipe_out[1], STDOUT_FILENO);
+        close(pipe_in[0]);
+        close(pipe_out[1]);
+        int devnull = open("/dev/null", O_WRONLY);
+        if (devnull >= 0) { dup2(devnull, STDERR_FILENO); close(devnull); }
+        // execlp searches PATH when the name has no slash, so the bare-name
+        // fallback can actually be found; plain execl would not look there.
+        execlp(binary.c_str(), "rcli_camera_preview", static_cast<char*>(nullptr));
+        _exit(1);
+    }
+
+    close(pipe_in[0]);
+    close(pipe_out[1]);
+    // If the helper dies, a write must fail with EPIPE rather than raise SIGPIPE
+    // and terminate this process.
+    fcntl(pipe_in[1], F_SETNOSIGPIPE, 1);
+    g_cam_pid = pid;
+    g_cam_stdin = fdopen(pipe_in[1], "w");
+    g_cam_stdout = fdopen(pipe_out[0], "r");
+    if (!g_cam_stdin) close(pipe_in[1]);
+    if (!g_cam_stdout) close(pipe_out[0]);
+
+    // Any first line from the helper counts as the handshake; EOF here means it
+    // exited (for example because exec failed).
+    char buf[64] = {0};
+    if (g_cam_stdin && g_cam_stdout && fgets(buf, sizeof(buf), g_cam_stdout)) {
+        g_cam_active.store(true);
+        return 0;
+    }
+
+    camera_preview_stop();
+    return -1;
+}
+
+// Asks the helper to quit, closes both pipes, and reaps the child.
+void camera_preview_stop(void) {
+    if (g_cam_pid <= 0) return;
+
+    // Without a completed handshake there is nobody to answer "quit"; closing
+    // the helper's stdin below is enough to make it exit.
+    if (g_cam_active.load()) cam_cmd("quit");
+
+    {
+        std::lock_guard<std::mutex> lock(g_cam_io_mutex);
+        if (g_cam_stdin)  { fclose(g_cam_stdin);  g_cam_stdin = nullptr; }
+        if (g_cam_stdout) { fclose(g_cam_stdout); g_cam_stdout = nullptr; }
+    }
+    int status;
+    while (waitpid(g_cam_pid, &status, 0) < 0 && errno == EINTR) {}
+    g_cam_pid = 0;
+    g_cam_active.store(false);
+}
+
+// Returns 1 while the helper has completed its handshake and not been stopped.
+int camera_preview_active(void) {
+    return g_cam_active.load() ? 1 : 0;
+}
+
+// Sends "capture <path>"; returns 0 only when the helper answers "ok".
+int camera_preview_capture(const char* output_path) {
+    if (!g_cam_active.load() || !output_path) return -1;
+    std::string cmd = std::string("capture ") + output_path;
+    std::string resp = cam_cmd(cmd.c_str());
+    return (resp == "ok") ? 
0 : -1;
+}
+
+int camera_preview_snap(const char* output_path) {
+    if (!g_cam_active.load() || !output_path) return -1;  // a null path cannot form a command
+    std::string cmd = std::string("snap ") + output_path;
+    std::string resp = cam_cmd(cmd.c_str());
+    return (resp == "ok") ? 0 : -1;
+}
+
+int camera_preview_freeze(void) {
+    if (!g_cam_active.load()) return -1;
+    std::string resp = cam_cmd("freeze");
+    return (resp == "ok") ? 0 : -1;
+}
+
+int camera_preview_unfreeze(void) {
+    if (!g_cam_active.load()) return -1;
+    std::string resp = cam_cmd("unfreeze");
+    return (resp == "ok") ? 0 : -1;
+}
diff --git a/src/audio/rcli_camera_preview.m b/src/audio/rcli_camera_preview.m
new file mode 100644
index 0000000..c6ce993
--- /dev/null
+++ b/src/audio/rcli_camera_preview.m
@@ -0,0 +1,360 @@
+// rcli_camera_preview — standalone Cocoa app showing a live camera preview
+// in a floating PIP-style window. Communicates with parent RCLI via stdin/stdout.
+//
+// Commands (one per line on stdin):
+//   capture <path>  → freezes frame, saves JPEG to <path>, replies "ok\n"
+//   snap <path>     → saves JPEG to <path> WITHOUT freezing, replies "ok\n"
+//   freeze          → pauses the live feed on current frame, replies "ok\n"
+//   unfreeze        → resumes live camera feed, replies "ok\n"
+//   quit            → exits
+// A failed save or an unrecognized command replies "error\n".
+
+// NOTE(review): the framework header names were not legible in the patch under
+// review; these are the ones the code below uses. Confirm against the original.
+#import <AppKit/AppKit.h>
+#import <AVFoundation/AVFoundation.h>
+#import <CoreImage/CoreImage.h>
+#import <UniformTypeIdentifiers/UniformTypeIdentifiers.h>
+
+// ── Camera preview window ─────────────────────────────────────────────
+
+// View that runs the capture session and paints the most recent frame.
+@interface CameraPreviewView : NSView <AVCaptureVideoDataOutputSampleBufferDelegate> {
+    AVCaptureSession *_session;
+    AVCaptureVideoDataOutput *_output;
+    dispatch_queue_t _captureQueue;
+    CIContext *_ciContext;
+    CGImageRef _currentFrame;
+    BOOL _frozen;
+    NSString *_pendingCapturePath;
+    NSLock *_frameLock;
+}
+@property (nonatomic, strong) NSTextField *statusLabel;
+@end
+
+@implementation CameraPreviewView
+
+- (instancetype)initWithFrame:(NSRect)frame {
+    self = [super initWithFrame:frame];
+    if (self) {
+        _frameLock = [[NSLock alloc] init];
+        _ciContext = [CIContext contextWithOptions:nil];
+        _frozen = NO;
+        _currentFrame = NULL;
+        self.wantsLayer = YES;
+        self.layer.cornerRadius = 12;
+        self.layer.masksToBounds = YES;
+        self.layer.backgroundColor = [NSColor blackColor].CGColor;
+
+        _statusLabel = [[NSTextField alloc] initWithFrame:NSZeroRect];
+        _statusLabel.stringValue = @" RCLI Camera ";
+        _statusLabel.font = [NSFont systemFontOfSize:11 weight:NSFontWeightHeavy];
+        _statusLabel.textColor = [NSColor blackColor];
+        _statusLabel.backgroundColor = [NSColor colorWithRed:0.1 green:0.85 blue:0.4 alpha:1.0];
+        _statusLabel.bezeled = NO;
+        _statusLabel.editable = NO;
+        _statusLabel.selectable = NO;
+        _statusLabel.alignment = NSTextAlignmentCenter;
+        _statusLabel.wantsLayer = YES;
+        _statusLabel.layer.cornerRadius = 8;
+        _statusLabel.layer.masksToBounds = YES;
+        [_statusLabel sizeToFit];
+        [self addSubview:_statusLabel];
+
+        [self startCamera];
+    }
+    return self;
+}
+
+// Pins the status pill to the top centre. The size comes from the label's
+// cell, not its current frame, so repeated layout passes do not keep adding
+// the padding to an already padded frame.
+- (void)layout {
+    [super layout];
+    NSSize sz = [_statusLabel.cell cellSize];
+    CGFloat x = (self.bounds.size.width - sz.width) / 2;
+    CGFloat y = self.bounds.size.height - sz.height - 8;
+    _statusLabel.frame = NSMakeRect(x, y, sz.width + 8, sz.height + 4);
+}
+
+// Configures the default video device to deliver BGRA frames to this view.
+- (void)startCamera {
+    _session = [[AVCaptureSession alloc] init];
+    _session.sessionPreset = AVCaptureSessionPresetHigh;
+
+    AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+    if (!device) return;
+
+    NSError *err = nil;
+    AVCaptureDeviceInput *input = [AVCaptureDeviceInput deviceInputWithDevice:device error:&err];
+    if (!input) return;
+    if ([_session canAddInput:input]) [_session addInput:input];
+
+    _output = [[AVCaptureVideoDataOutput alloc] init];
+    _output.videoSettings = @{(id)kCVPixelBufferPixelFormatTypeKey: @(kCVPixelFormatType_32BGRA)};
+    _output.alwaysDiscardsLateVideoFrames = YES;
+    _captureQueue = dispatch_queue_create("camera.preview", DISPATCH_QUEUE_SERIAL);
+    [_output setSampleBufferDelegate:self queue:_captureQueue];
+    if ([_session canAddOutput:_output]) [_session addOutput:_output];
+
+    [_session startRunning];
+}
+
+// Capture-queue callback: converts each frame to a CGImage and publishes it.
+- (void)captureOutput:(AVCaptureOutput *)output
+didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer
+       fromConnection:(AVCaptureConnection *)connection {
+    if (_frozen) return;
+
+    // Drain per-frame temporaries promptly; a dispatch queue gives no guarantee
+    // about when its own autorelease pool is emptied.
+    @autoreleasepool {
+        CVImageBufferRef imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer);
+        if (!imageBuffer) return;
+
+        CIImage *ciImage = [CIImage imageWithCVImageBuffer:imageBuffer];
+        CGRect extent = ciImage.extent;
+        CGImageRef cgImage = [_ciContext createCGImage:ciImage fromRect:extent];
+        if (!cgImage) return;
+
+        [_frameLock lock];
+        if (_currentFrame) CGImageRelease(_currentFrame);
+        _currentFrame = cgImage;
+        [_frameLock unlock];
+    }
+
+    dispatch_async(dispatch_get_main_queue(), ^{
+        [self setNeedsDisplay:YES];
+    });
+}
+
+// Paints the latest frame scaled to fill the view, then the frozen tint and border.
+- (void)drawRect:(NSRect)dirtyRect {
+    [[NSColor blackColor] set];
+    NSRectFill(dirtyRect);
+
+    [_frameLock lock];
+    CGImageRef frame = _currentFrame;
+    if (frame) CGImageRetain(frame);
+    [_frameLock unlock];
+
+    if (frame) {
+        NSGraphicsContext *ctx = [NSGraphicsContext currentContext];
+        CGContextRef cgctx = (CGContextRef)[ctx CGContext];
+
+        CGFloat imgW = CGImageGetWidth(frame);
+        CGFloat imgH = CGImageGetHeight(frame);
+        CGFloat viewW = self.bounds.size.width;
+        CGFloat viewH = self.bounds.size.height;
+
+        CGFloat scale = fmax(viewW / imgW, viewH / imgH);
+        CGFloat drawW = imgW * scale;
+        CGFloat drawH = imgH * scale;
+        CGFloat drawX = (viewW - drawW) / 2;
+        CGFloat drawY = (viewH - drawH) / 2;
+
+        CGContextDrawImage(cgctx, CGRectMake(drawX, drawY, drawW, drawH), frame);
+        CGImageRelease(frame);
+    }
+
+    if (_frozen) {
+        [[NSColor colorWithRed:1.0 green:0.3 blue:0.2 alpha:0.08] set];
+        NSRectFillUsingOperation(self.bounds, NSCompositingOperationSourceOver);
+    }
+
+    // Green border
+    NSColor *green = [NSColor colorWithRed:0.1 green:0.85 blue:0.4 alpha:1.0];
+    NSBezierPath *border = [NSBezierPath bezierPathWithRoundedRect:NSInsetRect(self.bounds, 2, 2)
+                                                           xRadius:12 yRadius:12];
+    [border setLineWidth:4];
+    [green set];
+    [border stroke];
+}
+
+// Writes the most recent frame to path as a JPEG; NO if there is no frame yet.
+- (BOOL)saveFrameToPath:(NSString *)path {
+    [_frameLock lock];
+    CGImageRef frame = _currentFrame;
+    if (frame) CGImageRetain(frame);
+    [_frameLock unlock];
+
+    if (!frame) return NO;
+
+    NSURL *url = [NSURL fileURLWithPath:path];
+    CGImageDestinationRef dest = CGImageDestinationCreateWithURL(
+        (__bridge CFURLRef)url, (__bridge CFStringRef)UTTypeJPEG.identifier, 1, NULL);
+    if (!dest) { CGImageRelease(frame); return NO; }
+
+    NSDictionary *opts = @{(__bridge id)kCGImageDestinationLossyCompressionQuality: @(0.92)};
+    CGImageDestinationAddImage(dest, frame, (__bridge CFDictionaryRef)opts);
+    BOOL ok = CGImageDestinationFinalize(dest);
+    CFRelease(dest);
+    CGImageRelease(frame);
+    return ok;
+}
+
+- (void)freeze {
+    _frozen = YES;
+    dispatch_async(dispatch_get_main_queue(), ^{
+        self.statusLabel.stringValue = @" FROZEN ";
+        self.statusLabel.backgroundColor = [NSColor colorWithRed:1.0 green:0.3 blue:0.2 alpha:1.0];
+        [self.statusLabel sizeToFit];
+        [self layout];
+        [self setNeedsDisplay:YES];
+    });
+}
+
+- (void)unfreeze {
+    _frozen = NO;
+    dispatch_async(dispatch_get_main_queue(), ^{
+        self.statusLabel.stringValue = @" RCLI Camera ";
+        self.statusLabel.backgroundColor = [NSColor colorWithRed:0.1 green:0.85 blue:0.4 alpha:1.0];
+        [self.statusLabel sizeToFit];
+        [self layout];
+        [self setNeedsDisplay:YES];
+    });
+}
+
+- (void)stopCamera {
+    [_session stopRunning];
+}
+
+- (void)dealloc {
+    [_frameLock lock];
+    if (_currentFrame) CGImageRelease(_currentFrame);
+    _currentFrame = NULL;
+    [_frameLock unlock];
+    [super dealloc];
+}
+
+@end
+
+// ── Camera window ─────────────────────────────────────────────────────
+
+@interface CameraWindow : NSWindow
+@end
+
+@implementation CameraWindow
+
+- (instancetype)initWithRect:(NSRect)rect {
+    self = [super initWithContentRect:rect
+                            styleMask:NSWindowStyleMaskBorderless |
+                                      NSWindowStyleMaskResizable
+                              backing:NSBackingStoreBuffered
+                                defer:NO];
+    if (self) {
+        self.opaque = NO;
+        self.backgroundColor = [NSColor clearColor];
+        self.level = NSFloatingWindowLevel;
+        self.hasShadow = YES;
+        self.movableByWindowBackground = YES;
+        self.collectionBehavior = NSWindowCollectionBehaviorCanJoinAllSpaces |
+                                  NSWindowCollectionBehaviorStationary;
+        self.minSize = NSMakeSize(240, 180);
+
+        CameraPreviewView *preview = [[CameraPreviewView alloc] initWithFrame:rect];
+        self.contentView = preview;
+    }
+    return self;
+}
+
+- (BOOL)canBecomeKeyWindow { return YES; }
+- (BOOL)canBecomeMainWindow { return NO; }
+
+@end
+
+// ── Stdin reader ──────────────────────────────────────────────────────
+
+@interface StdinReader : NSObject
+@property (nonatomic, strong) CameraWindow *window;
+- (void)startReading;
+- (void)handleCommand:(NSString *)cmd;
+@end
+
+@implementation StdinReader
+
+// Reads commands on a background queue and runs each on the main thread;
+// EOF on stdin shuts the app down.
+- (void)startReading {
+    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+        char buf[1024];
+        while (fgets(buf, sizeof(buf), stdin)) {
+            NSString *cmd = [[NSString stringWithUTF8String:buf]
+                stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
+            if (cmd.length == 0) continue;
+            [self performSelectorOnMainThread:@selector(handleCommand:)
+                                   withObject:cmd
+                                waitUntilDone:YES];
+        }
+        dispatch_async(dispatch_get_main_queue(), ^{
+            [(CameraPreviewView *)self.window.contentView stopCamera];
+            [NSApp terminate:nil];
+        });
+    });
+}
+
+// Executes one command. Every command except "quit" writes exactly one reply
+// line, because the parent blocks reading a reply after each command it sends.
+- (void)handleCommand:(NSString *)cmd {
+    CameraPreviewView *preview = (CameraPreviewView *)self.window.contentView;
+
+    if ([cmd hasPrefix:@"capture "]) {
+        NSString *path = [cmd substringFromIndex:8];
+        [preview freeze];
+        [NSThread sleepForTimeInterval:0.05];
+        BOOL ok = [preview saveFrameToPath:path];
+        printf("%s\n", ok ? "ok" : "error");
+        fflush(stdout);
+    } else if ([cmd hasPrefix:@"snap "]) {
+        NSString *path = [cmd substringFromIndex:5];
+        BOOL ok = [preview saveFrameToPath:path];
+        printf("%s\n", ok ? "ok" : "error");
+        fflush(stdout);
+    } else if ([cmd isEqualToString:@"freeze"]) {
+        [preview freeze];
+        printf("ok\n");
+        fflush(stdout);
+    } else if ([cmd isEqualToString:@"unfreeze"]) {
+        [preview unfreeze];
+        printf("ok\n");
+        fflush(stdout);
+    } else if ([cmd isEqualToString:@"quit"]) {
+        [preview stopCamera];
+        [NSApp terminate:nil];
+    } else {
+        printf("error\n");
+        fflush(stdout);
+    }
+}
+
+@end
+
+// ── Main ──────────────────────────────────────────────────────────────
+
+int main(int argc, const char *argv[]) {
+    @autoreleasepool {
+        NSApplication *app = [NSApplication sharedApplication];
+        [app setActivationPolicy:NSApplicationActivationPolicyAccessory];
+
+        NSScreen *scr = [NSScreen mainScreen];
+        NSRect sf = scr.frame;
+        CGFloat w = 480, h = 360;
+        CGFloat x = sf.size.width - w - 24;
+        CGFloat y = sf.size.height - h - 60;
+
+        CameraWindow *win = [[CameraWindow alloc]
+            initWithRect:NSMakeRect(x, y, w, h)];
+        [win makeKeyAndOrderFront:nil];
+        [app activateIgnoringOtherApps:YES];
+
+        StdinReader *reader = [[StdinReader alloc] init];
+        reader.window = win;
+        [reader startReading];
+
+        printf("ready\n");
+        fflush(stdout);
+
+        [app run];
+    }
+    return 0;
+}
diff --git a/src/cli/tui_app.h b/src/cli/tui_app.h
index 7b01d1e..1f1f651 100644
--- a/src/cli/tui_app.h
+++ b/src/cli/tui_app.h
@@ -14,6 +14,7 @@
 #include "engines/metalrt_loader.h"
 #include "engines/vlm_engine.h"
 #include "audio/camera_capture.h"
+#include "audio/camera_preview.h"
 #include "audio/screen_capture.h"
 #include "models/vlm_model_registry.h"
 #include "core/log.h"
@@ -439,9 +440,43 @@ class TuiApp {
             if (c == "r" || c == "R") { enter_rag_mode(); return true; }
             if (c == "d" || c == "D") { close_all_panels(); enter_cleanup_mode(); return true; }
             if (c == "p" || c == "P") { enter_personality_mode(); return true; }
-            // V key: capture photo from camera and analyze with VLM
+            // V key: toggle camera preview mode (live feed + auto VLM analysis)
             if (c == "v" || c == "V") {
-                run_camera_vlm("Describe what you see in this photo in detail.");
+                if 
(camera_preview_active()) {
+                    add_system_message("Closing camera...");
+                    screen_->Post(Event::Custom);
+                    stop_camera_auto_analysis();
+                    std::thread([this]() {
+                        camera_preview_stop();
+                        rcli_vlm_exit(engine_);
+                        add_system_message("Camera OFF");
+                        screen_->Post(Event::Custom);
+                    }).detach();
+                } else {
+                    add_system_message("Opening camera, loading VLM...");
+                    screen_->Post(Event::Custom);
+                    std::thread([this]() {
+                        if (rcli_vlm_init(engine_) != 0) {
+                            add_system_message("VLM requires the llama.cpp engine. Switch with: rcli engine llamacpp, then download a model via [M] \xe2\x86\x92 VLM Models");
+                            screen_->Post(Event::Custom);
+                            return;
+                        }
+                        if (camera_preview_start() != 0) {
+                            add_system_message("Camera preview failed. Check camera permissions in System Settings > Privacy & Security > Camera.");
+                            screen_->Post(Event::Custom);
+                            return;
+                        }
+                        const char* vbe = rcli_vlm_backend_name(engine_);
+                        const char* vmodel = rcli_vlm_model_name(engine_);
+                        std::string msg = "Camera LIVE";
+                        if (vbe && vbe[0])
+                            msg += std::string(" \xe2\x80\x94 ") + (vmodel ? vmodel : "") + " via " + vbe;  // vmodel may be null too
+                        msg += ". Auto-analyzing every ~8s. Speak to ask a specific question";
+                        add_system_message(msg);
+                        screen_->Post(Event::Custom);
+                        start_camera_auto_analysis();
+                    }).detach();
+                }
                 return true;
             }
             // S key: toggle visual mode (VLM only on llama.cpp engine)
@@ -581,6 +616,12 @@ class TuiApp {
         std::string user_text = transcript;
         add_user_message(user_text);
 
+        // Camera preview: route voice to camera VLM analysis
+        if (camera_preview_active()) {
+            run_camera_preview_vlm(user_text);
+            return;
+        }
+
         // Visual mode: route voice to VLM screen analysis instead of LLM
         if (screen_capture_overlay_active()) {
             run_screen_vlm(user_text);
@@ -1117,9 +1158,16 @@ class TuiApp {
         else
             right.push_back(text("[A] actions ") | dim);
         right.push_back(text("[C] convo ") | dim);
-        right.push_back(text("[V] camera ") | dim);
+        if (camera_preview_active()) {
+            if (cam_auto_busy_.load())
+                right.push_back(text("[V] camera \xf0\x9f\x94\xb4 ") | ftxui::color(ftxui::Color::RedLight));
+            else
+                right.push_back(text("[V] camera LIVE ") | ftxui::color(ftxui::Color::Green));
+        } else {
+            right.push_back(text("[V] camera ") | dim);
+        }
         if (screen_capture_overlay_active())
-            right.push_back(text("[S] visual ● ") | ftxui::color(ftxui::Color::Green));
+            right.push_back(text("[S] visual \xe2\x97\x8f ") | ftxui::color(ftxui::Color::Green));
         else
             right.push_back(text("[S] visual ") | dim);
         right.push_back(text("[R] RAG ") | dim);
@@ -2234,6 +2282,131 @@ class TuiApp {
     // process_input
     // ====================================================================
 
+    // Starts the background loop that snaps a frame from the live preview,
+    // describes it with the VLM and speaks the result, until stopped.
+    void start_camera_auto_analysis() {
+        // Assigning to a still-joinable std::thread calls std::terminate, so
+        // join any previous worker before starting a new one.
+        stop_camera_auto_analysis();
+        cam_auto_running_.store(true);
+        cam_auto_busy_.store(false);
+        cam_auto_thread_ = std::thread([this]() {
+            // Small initial delay to let the camera warm up
+            for (int i = 0; i < 4 && cam_auto_running_.load(); i++)
+                std::this_thread::sleep_for(std::chrono::milliseconds(500));
+
+            while (cam_auto_running_.load()) {
+                if (!camera_preview_active()) break;
+                // Skip if voice/text analysis is in progress, check again in 500ms.
+                // exchange() claims the busy flag in one atomic step, so it cannot
+                // be taken by another thread between the test and the set.
+                if (voice_state_.load() != VoiceState::IDLE || cam_auto_busy_.exchange(true)) {
+                    std::this_thread::sleep_for(std::chrono::milliseconds(500));
+                    continue;
+                }
+
+                std::string photo_path = "/tmp/rcli_cam_auto_" +
+                    std::to_string(std::chrono::system_clock::now().time_since_epoch().count()) + ".jpg";
+
+                if (camera_preview_snap(photo_path.c_str()) != 0) {
+                    cam_auto_busy_.store(false);
+                    // Back off instead of retrying in a tight loop.
+                    std::this_thread::sleep_for(std::chrono::milliseconds(500));
+                    continue;
+                }
+
+                voice_state_ = VoiceState::THINKING;
+                screen_->Post(Event::Custom);
+
+                std::string accumulated;
+                auto stream_cb = [](const char* event, const char* data, void* ud) {
+                    auto* accum = static_cast<std::string*>(ud);
+                    if (std::strcmp(event, "token") == 0)
+                        accum->append(data);
+                };
+                int vlm_rc = rcli_vlm_analyze_stream(engine_, photo_path.c_str(),
+                    "Briefly describe what you see. Focus on what's new or interesting. Be concise (1-2 sentences).",
+                    stream_cb, &accumulated);
+                // The snapshot is only needed for this one analysis; do not let them pile up.
+                ::remove(photo_path.c_str());
+
+                if (vlm_rc == 0 && !accumulated.empty()) {
+                    add_response(accumulated, "VLM \xf0\x9f\x93\xb7");
+                    voice_state_ = VoiceState::SPEAKING;
+                    screen_->Post(Event::Custom);
+                    rcli_speak_streaming(engine_, accumulated.c_str(), nullptr, nullptr);
+                }
+
+                voice_state_ = VoiceState::IDLE;
+                cam_auto_busy_.store(false);
+                screen_->Post(Event::Custom);
+
+                // Brief cooldown after analysis before next cycle
+                for (int i = 0; i < 4 && cam_auto_running_.load(); i++)
+                    std::this_thread::sleep_for(std::chrono::milliseconds(500));
+            }
+        });
+    }
+
+    // Signals the auto-analysis worker to stop and waits for it to finish.
+    void stop_camera_auto_analysis() {
+        cam_auto_running_.store(false);
+        if (cam_auto_thread_.joinable())
+            cam_auto_thread_.join();
+    }
+
+    // User-initiated camera analysis (voice/text) — pauses auto, runs targeted query
+    void run_camera_preview_vlm(const std::string& prompt) {
+        cam_auto_busy_.store(true);
+        add_system_message("Analyzing camera feed...");
+        voice_state_ = VoiceState::THINKING;
+        std::string prompt_copy = prompt;
+        std::thread([this, prompt_copy]() {
+            std::string photo_path = "/tmp/rcli_cam_" +
+                std::to_string(std::chrono::system_clock::now().time_since_epoch().count()) + ".jpg";
+
+            if (camera_preview_snap(photo_path.c_str()) != 0) {
+                add_response("(Camera capture failed.)", "");
+                voice_state_ = VoiceState::IDLE;
+                cam_auto_busy_.store(false);
+                screen_->Post(Event::Custom);
+                return;
+            }
+
+            std::string accumulated;
+            auto stream_cb = [](const char* event, const char* data, void* ud) {
+                auto* accum = static_cast<std::string*>(ud);
+                if (std::strcmp(event, "token") == 0) {
+                    accum->append(data);
+                }
+            };
+            int vlm_rc = rcli_vlm_analyze_stream(engine_, photo_path.c_str(),
+                prompt_copy.c_str(), stream_cb, &accumulated);
+            // The snapshot is only needed for this one analysis.
+            ::remove(photo_path.c_str());
+
+            if (vlm_rc == 0 && !accumulated.empty()) {
+                add_response(accumulated, "VLM");
+                voice_state_ = VoiceState::SPEAKING;
+                screen_->Post(Event::Custom);
+                rcli_speak_streaming(engine_, accumulated.c_str(), nullptr, nullptr);
+                RCLIVlmStats stats;
+                if (rcli_vlm_get_stats(engine_, &stats) == 0) {
+                    char buf[128];
+                    snprintf(buf, sizeof(buf), "\xe2\x9a\xa1 %.1f tok/s | %d tokens | %.1fs total",
+                             stats.gen_tok_per_sec, stats.generated_tokens, stats.total_time_sec);
+                    add_system_message(buf);
+                }
+            } else {
+                add_response("(VLM analysis failed.)", "");
+            }
+
+            voice_state_ = VoiceState::IDLE;
+            cam_auto_busy_.store(false);
+            screen_->Post(Event::Custom);
+        }).detach();
+    }
+
     void run_camera_vlm(const std::string& prompt) {
         add_system_message("Capturing photo from camera...");
         voice_state_ = VoiceState::THINKING;
@@ -2383,6 +2556,9 @@ class TuiApp {
             add_system_message(" P Personality");
             add_system_message(" R RAG panel");
             add_system_message(" D Delete / cleanup models");
+            add_system_message("--- Vision ---");
+            add_system_message(" V Camera preview (toggle live feed + VLM)");
+            add_system_message(" S Visual mode (screen overlay + VLM)");
             add_system_message("--- Toggles ---");
             add_system_message(" T Tool call trace (show tool calls & results)");
 
@@ -2421,7 +2597,10 @@ class TuiApp {
         }
 
         if (cmd == "camera" || cmd == "photo" || cmd == "webcam") {
-            run_camera_vlm("Describe what you see in this photo in detail.");
+            if (camera_preview_active())
+                run_camera_preview_vlm("Describe what you see in this photo in detail.");
+            else
+                run_camera_vlm("Describe what you see in this photo in detail.");
             return;
         }
 
@@ -2607,6 +2786,12 @@ class TuiApp {
             }
         }
 
+        // Camera preview active: route typed questions to camera VLM
+        if (camera_preview_active()) {
+            run_camera_preview_vlm(input);
+            return;
+        }
+
         // Run LLM (or RAG+LLM) in background thread to keep UI responsive
         voice_state_ = VoiceState::THINKING;
         std::string input_copy = input;
@@ -2865,6 +3050,12 @@ class TuiApp {
 
     ftxui::Color personality_msg_color_;
 
+    // Camera auto-analysis state
+    std::atomic<bool> cam_auto_running_{false};
+    std::atomic<bool> cam_auto_busy_{false};
+    std::thread cam_auto_thread_;
+    std::string cam_last_snap_path_;
+
     // RAG panel state
     struct RagOption { std::string name, action; };
     bool rag_mode_ = false;