diff --git a/CMakeLists.txt b/CMakeLists.txt index 2719674..2b0c50d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,10 +103,12 @@ add_library(rcli STATIC src/engines/metalrt_engine.cpp src/engines/metalrt_stt_engine.cpp src/engines/metalrt_tts_engine.cpp + src/engines/metalrt_vlm_engine.cpp src/engines/vlm_engine.cpp src/audio/audio_io.cpp src/audio/mic_permission.mm src/audio/camera_capture.mm + src/audio/camera_preview.mm src/audio/screen_capture.mm src/pipeline/orchestrator.cpp src/pipeline/sentence_detector.cpp @@ -140,7 +142,7 @@ add_library(rcli STATIC src/api/rcli_api.cpp ) -set_source_files_properties(src/audio/mic_permission.mm src/audio/camera_capture.mm src/audio/screen_capture.mm +set_source_files_properties(src/audio/mic_permission.mm src/audio/camera_capture.mm src/audio/camera_preview.mm src/audio/screen_capture.mm PROPERTIES LANGUAGE CXX) target_include_directories(rcli PUBLIC @@ -221,6 +223,32 @@ set_target_properties(rcli_overlay PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" ) +# ============================================================================= +# rcli_camera_preview — standalone Cocoa helper for live camera preview window +# ============================================================================= +add_executable(rcli_camera_preview + src/audio/rcli_camera_preview.m +) + +set_source_files_properties(src/audio/rcli_camera_preview.m PROPERTIES LANGUAGE CXX) + +target_compile_options(rcli_camera_preview PRIVATE -x objective-c++) + +target_link_libraries(rcli_camera_preview PRIVATE + "-framework AppKit" + "-framework AVFoundation" + "-framework CoreMedia" + "-framework CoreVideo" + "-framework CoreImage" + "-framework ImageIO" + "-framework UniformTypeIdentifiers" +) + +set_target_properties(rcli_camera_preview PROPERTIES + OUTPUT_NAME "rcli_camera_preview" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" +) + # ============================================================================= # rcli_test — test 
executable # ============================================================================= diff --git a/Formula/rcli.rb b/Formula/rcli.rb index 7937734..aad98ef 100644 --- a/Formula/rcli.rb +++ b/Formula/rcli.rb @@ -12,6 +12,7 @@ class Rcli < Formula def install bin.install "bin/rcli" bin.install "bin/rcli_overlay" if File.exist? "bin/rcli_overlay" + bin.install "bin/rcli_camera_preview" if File.exist? "bin/rcli_camera_preview" lib.install Dir["lib/*.dylib"] end diff --git a/install.sh b/install.sh index bd5880b..12ca047 100755 --- a/install.sh +++ b/install.sh @@ -70,6 +70,7 @@ else mkdir -p "$CELLAR/bin" "$CELLAR/lib" 2>/dev/null || sudo mkdir -p "$CELLAR/bin" "$CELLAR/lib" cp "$WORKDIR"/rcli-*/bin/rcli "$CELLAR/bin/" 2>/dev/null || sudo cp "$WORKDIR"/rcli-*/bin/rcli "$CELLAR/bin/" cp "$WORKDIR"/rcli-*/bin/rcli_overlay "$CELLAR/bin/" 2>/dev/null || sudo cp "$WORKDIR"/rcli-*/bin/rcli_overlay "$CELLAR/bin/" 2>/dev/null || true + cp "$WORKDIR"/rcli-*/bin/rcli_camera_preview "$CELLAR/bin/" 2>/dev/null || sudo cp "$WORKDIR"/rcli-*/bin/rcli_camera_preview "$CELLAR/bin/" 2>/dev/null || true cp "$WORKDIR"/rcli-*/lib/*.dylib "$CELLAR/lib/" 2>/dev/null || sudo cp "$WORKDIR"/rcli-*/lib/*.dylib "$CELLAR/lib/" brew link --overwrite "$FORMULA" 2>/dev/null || sudo brew link --overwrite "$FORMULA" diff --git a/scripts/package.sh b/scripts/package.sh index e238737..1f29c9f 100755 --- a/scripts/package.sh +++ b/scripts/package.sh @@ -27,6 +27,10 @@ if [ -f "$BUILD_DIR/rcli_overlay" ]; then cp "$BUILD_DIR/rcli_overlay" "$DIST_DIR/bin/rcli_overlay" echo " + bin/rcli_overlay" fi +if [ -f "$BUILD_DIR/rcli_camera_preview" ]; then + cp "$BUILD_DIR/rcli_camera_preview" "$DIST_DIR/bin/rcli_camera_preview" + echo " + bin/rcli_camera_preview" +fi # --- Collect dylibs --- DYLIBS=( @@ -155,6 +159,9 @@ codesign --force --sign - "$BINARY" if [ -f "$DIST_DIR/bin/rcli_overlay" ]; then codesign --force --sign - "$DIST_DIR/bin/rcli_overlay" fi +if [ -f "$DIST_DIR/bin/rcli_camera_preview" ]; then + 
codesign --force --sign - "$DIST_DIR/bin/rcli_camera_preview" +fi for lib in "$DIST_DIR/lib/"*.dylib; do codesign --force --sign - "$lib" done diff --git a/src/api/rcli_api.cpp b/src/api/rcli_api.cpp index f292c78..aaf2d1d 100644 --- a/src/api/rcli_api.cpp +++ b/src/api/rcli_api.cpp @@ -41,6 +41,7 @@ extern char** environ; #include "actions/action_registry.h" #include "actions/macos_actions.h" #include "engines/vlm_engine.h" +#include "engines/metalrt_vlm_engine.h" #include "models/vlm_model_registry.h" using namespace rastack; @@ -117,10 +118,12 @@ struct RCLIEngine { int ctx_main_prompt_tokens = 0; // VLM (Vision Language Model) subsystem - VlmEngine vlm_engine; + VlmEngine vlm_engine; // llama.cpp backend + MetalRTVlmEngine metalrt_vlm_engine; // MetalRT backend bool vlm_initialized = false; + bool vlm_use_metalrt = false; // which backend is active std::string last_vlm_response; - std::string vlm_backend_name; // "llama.cpp (Metal GPU)" or "MetalRT" + std::string vlm_backend_name; // "llama.cpp (Metal GPU)" or "MetalRT (Metal GPU)" std::string vlm_model_name; // e.g. "Qwen3 VL 2B" std::mutex mutex; @@ -1065,8 +1068,7 @@ static std::string handle_screen_intent(RCLIEngine* engine, const std::string& u if (!engine->vlm_initialized) { if (vlm_init_locked(engine) != 0) { return "I can see you're asking about your screen, but VLM isn't available. " - "It requires the llama.cpp engine and a VLM model. 
" - "Switch with: rcli engine llamacpp, then download a model: rcli models vlm"; + "Download a VLM model with: rcli models vlm"; } } @@ -1076,7 +1078,12 @@ static std::string handle_screen_intent(RCLIEngine* engine, const std::string& u vlm_prompt = "Describe what you see on this screen in detail."; } - std::string result = engine->vlm_engine.analyze_image(path, vlm_prompt, nullptr); + std::string result; + if (engine->vlm_use_metalrt) { + result = engine->metalrt_vlm_engine.analyze_image(path, vlm_prompt); + } else { + result = engine->vlm_engine.analyze_image(path, vlm_prompt, nullptr); + } if (result.empty()) { return "I captured your screen but the analysis failed. Please try again."; @@ -2983,8 +2990,47 @@ static bool safe_download(const std::string& url, const std::string& dest) { return WIFEXITED(status) && WEXITSTATUS(status) == 0; } +// Find a MetalRT VLM model directory (MLX-format weights). +// Searches HuggingFace cache for known models. +static std::string find_metalrt_vlm_model_dir() { + const char* home = getenv("HOME"); + if (!home) return ""; + + static const char* hf_repos[] = { + "models--mlx-community--Qwen3-VL-2B-Instruct-4bit", + "models--mlx-community--LFM2.5-VL-1.6B-MLX-6bit", + }; + + struct stat st; + std::string hf_base = std::string(home) + "/.cache/huggingface/hub"; + + for (const char* repo : hf_repos) { + std::string snapshots_dir = hf_base + "/" + repo + "/snapshots"; + if (stat(snapshots_dir.c_str(), &st) != 0) continue; + + FILE* p = popen(("ls -1t '" + snapshots_dir + "' 2>/dev/null | head -1").c_str(), "r"); + if (!p) continue; + char buf[256]; + if (!fgets(buf, sizeof(buf), p)) { pclose(p); continue; } + pclose(p); + + std::string snap(buf); + while (!snap.empty() && (snap.back() == '\n' || snap.back() == '\r')) + snap.pop_back(); + if (snap.empty()) continue; + + std::string model_dir = snapshots_dir + "/" + snap; + std::string safetensors = model_dir + "/model.safetensors"; + if (stat(safetensors.c_str(), &st) == 0) { + 
LOG_DEBUG("VLM", "Found MetalRT VLM model at %s", model_dir.c_str()); + return model_dir; + } + } + + return ""; +} + // Internal init (caller must hold engine->mutex) -// VLM is only available on the llama.cpp engine. MetalRT VLM support coming soon. static int vlm_init_locked(RCLIEngine* engine) { if (engine->vlm_initialized) return 0; @@ -2995,13 +3041,34 @@ static int vlm_init_locked(RCLIEngine* engine) { engine->models_dir = "./models"; } - // VLM requires the llama.cpp engine + // --- Try MetalRT VLM backend first (when on MetalRT engine) --- if (engine->initialized && engine->pipeline.using_metalrt()) { - LOG_ERROR("VLM", "VLM is currently available with the llama.cpp engine. Switch with: rcli engine llamacpp"); - return -1; + auto& loader = MetalRTLoader::instance(); + if (loader.is_loaded() && loader.has_vision()) { + std::string model_dir = find_metalrt_vlm_model_dir(); + if (!model_dir.empty()) { + MetalRTVlmConfig mrt_config; + mrt_config.model_dir = model_dir; + if (engine->metalrt_vlm_engine.init(mrt_config)) { + engine->vlm_initialized = true; + engine->vlm_use_metalrt = true; + engine->vlm_backend_name = "MetalRT (Metal GPU)"; + engine->vlm_model_name = engine->metalrt_vlm_engine.model_name(); + if (engine->vlm_model_name.empty()) + engine->vlm_model_name = "Qwen3 VL 2B"; + LOG_INFO("VLM", "VLM engine ready — %s via MetalRT (Metal GPU)", + engine->vlm_model_name.c_str()); + return 0; + } + LOG_WARN("VLM", "MetalRT VLM init failed, falling back to llama.cpp"); + } else { + LOG_WARN("VLM", "No MetalRT VLM model found in HF cache, falling back to llama.cpp"); + } + } + // Fall through to llama.cpp instead of hard-failing } - // Check if any VLM model is installed (on-demand, no auto-download) + // --- llama.cpp VLM backend --- auto vlm_models = rcli::all_vlm_models(); rcli::VlmModelDef model_def; bool found = false; @@ -3019,7 +3086,6 @@ static int vlm_init_locked(RCLIEngine* engine) { return -1; } - // Initialize VLM engine with the installed model 
VlmConfig config; config.model_path = engine->models_dir + "/" + model_def.model_filename; config.mmproj_path = engine->models_dir + "/" + model_def.mmproj_filename; @@ -3036,6 +3102,7 @@ static int vlm_init_locked(RCLIEngine* engine) { } engine->vlm_initialized = true; + engine->vlm_use_metalrt = false; engine->vlm_backend_name = "llama.cpp (Metal GPU)"; engine->vlm_model_name = model_def.name; LOG_INFO("VLM", "VLM engine ready — %s via llama.cpp (Metal GPU)", model_def.name.c_str()); @@ -3056,7 +3123,7 @@ const char* rcli_vlm_analyze(RCLIHandle handle, const char* image_path, const ch if (!engine->vlm_initialized) { if (vlm_init_locked(engine) != 0) { - engine->last_vlm_response = "VLM not available. Requires llama.cpp engine (rcli engine llamacpp) and a VLM model (rcli models vlm)."; + engine->last_vlm_response = "VLM not available. Download a VLM model with: rcli models vlm"; return engine->last_vlm_response.c_str(); } } @@ -3065,16 +3132,17 @@ const char* rcli_vlm_analyze(RCLIHandle handle, const char* image_path, const ch ? std::string(prompt) : "Describe this image in detail."; - { - std::string result = engine->vlm_engine.analyze_image( + std::string result; + if (engine->vlm_use_metalrt) { + result = engine->metalrt_vlm_engine.analyze_image( + std::string(image_path), text_prompt); + } else { + result = engine->vlm_engine.analyze_image( std::string(image_path), text_prompt, nullptr); - - if (result.empty()) { - engine->last_vlm_response = "Error: Failed to analyze image."; - } else { - engine->last_vlm_response = result; - } } + + engine->last_vlm_response = result.empty() + ? "Error: Failed to analyze image." 
: result; return engine->last_vlm_response.c_str(); } @@ -3101,12 +3169,21 @@ int rcli_vlm_get_stats(RCLIHandle handle, RCLIVlmStats* out_stats) { auto* engine = static_cast(handle); if (!engine->vlm_initialized) return -1; - auto& s = engine->vlm_engine.last_stats(); - out_stats->gen_tok_per_sec = s.gen_tps(); - out_stats->generated_tokens = static_cast(s.generated_tokens); - out_stats->total_time_sec = (s.image_encode_us + s.generation_us) / 1e6; - out_stats->image_encode_ms = s.image_encode_us / 1000.0; - out_stats->first_token_ms = s.first_token_us / 1000.0; + if (engine->vlm_use_metalrt) { + auto& s = engine->metalrt_vlm_engine.last_stats(); + out_stats->gen_tok_per_sec = s.tps; + out_stats->generated_tokens = s.generated_tokens; + out_stats->total_time_sec = (s.vision_encode_ms + s.prefill_ms + s.decode_ms) / 1000.0; + out_stats->image_encode_ms = s.vision_encode_ms; + out_stats->first_token_ms = s.prefill_ms; + } else { + auto& s = engine->vlm_engine.last_stats(); + out_stats->gen_tok_per_sec = s.gen_tps(); + out_stats->generated_tokens = static_cast(s.generated_tokens); + out_stats->total_time_sec = (s.image_encode_us + s.generation_us) / 1e6; + out_stats->image_encode_ms = s.image_encode_us / 1000.0; + out_stats->first_token_ms = s.first_token_us / 1000.0; + } return 0; } @@ -3128,11 +3205,16 @@ int rcli_vlm_exit(RCLIHandle handle) { auto* engine = static_cast(handle); std::lock_guard lock(engine->mutex); - if (engine->vlm_engine.is_initialized()) { - engine->vlm_engine.shutdown(); + if (engine->vlm_use_metalrt) { + if (engine->metalrt_vlm_engine.is_initialized()) + engine->metalrt_vlm_engine.shutdown(); + } else { + if (engine->vlm_engine.is_initialized()) + engine->vlm_engine.shutdown(); } engine->vlm_initialized = false; + engine->vlm_use_metalrt = false; engine->vlm_backend_name.clear(); engine->vlm_model_name.clear(); LOG_INFO("VLM", "VLM unloaded"); @@ -3157,29 +3239,58 @@ int rcli_vlm_analyze_stream(RCLIHandle handle, const char* image_path, 
std::string text_prompt = (prompt && prompt[0]) ? std::string(prompt) : "Describe this image in detail."; - // llama.cpp VLM streaming path - rastack::TokenCallback token_cb = nullptr; - if (callback) { - token_cb = [callback, user_data](const rastack::TokenOutput& tok) { - if (!tok.text.empty()) { - callback("token", tok.text.c_str(), user_data); - } - }; - } + std::string result; + + if (engine->vlm_use_metalrt) { + // MetalRT VLM streaming path + rastack::TokenCallback token_cb = nullptr; + if (callback) { + token_cb = [callback, user_data](const rastack::TokenOutput& tok) { + if (!tok.text.empty()) { + callback("token", tok.text.c_str(), user_data); + } + }; + } - std::string result = engine->vlm_engine.analyze_image( - std::string(image_path), text_prompt, token_cb); + result = engine->metalrt_vlm_engine.analyze_image_stream( + std::string(image_path), text_prompt, token_cb); + + if (callback) { + auto& s = engine->metalrt_vlm_engine.last_stats(); + char stats_buf[256]; + snprintf(stats_buf, sizeof(stats_buf), + "{\"tps\":%.1f,\"tokens\":%d,\"vision_encode_ms\":%.1f}", + s.tps, s.generated_tokens, s.vision_encode_ms); + callback("stats", stats_buf, user_data); + } + } else { + // llama.cpp VLM streaming path + rastack::TokenCallback token_cb = nullptr; + if (callback) { + token_cb = [callback, user_data](const rastack::TokenOutput& tok) { + if (!tok.text.empty()) { + callback("token", tok.text.c_str(), user_data); + } + }; + } + + result = engine->vlm_engine.analyze_image( + std::string(image_path), text_prompt, token_cb); + + if (callback) { + auto& s = engine->vlm_engine.last_stats(); + char stats_buf[256]; + snprintf(stats_buf, sizeof(stats_buf), + "{\"tps\":%.1f,\"tokens\":%lld,\"vision_encode_ms\":%.1f}", + s.gen_tps(), s.generated_tokens, s.image_encode_us / 1000.0); + callback("stats", stats_buf, user_data); + } + } engine->last_vlm_response = result.empty() ? "Error: Failed to analyze image." 
: result; if (callback) { callback("response", engine->last_vlm_response.c_str(), user_data); - auto& s = engine->vlm_engine.last_stats(); - char stats_buf[256]; - snprintf(stats_buf, sizeof(stats_buf), - "{\"tps\":%.1f,\"tokens\":%lld,\"vision_encode_ms\":%.1f}", - s.gen_tps(), s.generated_tokens, s.image_encode_us / 1000.0); - callback("stats", stats_buf, user_data); } return engine->last_vlm_response.find("Error:") == 0 ? -1 : 0; diff --git a/src/audio/camera_preview.h b/src/audio/camera_preview.h new file mode 100644 index 0000000..fb822de --- /dev/null +++ b/src/audio/camera_preview.h @@ -0,0 +1,36 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +// Launch the camera preview window (floating PIP with live feed). +// Returns 0 on success, -1 on failure. +int camera_preview_start(void); + +// Stop the camera preview window and clean up. +void camera_preview_stop(void); + +// Returns 1 if the camera preview is currently running. +int camera_preview_active(void); + +// Freeze the live feed and capture the current frame to a JPEG file. +// Returns 0 on success, -1 on failure. +int camera_preview_capture(const char* output_path); + +// Capture the current frame to a JPEG file WITHOUT freezing the live feed. +// The camera keeps streaming. Ideal for auto-analysis loops. +// Returns 0 on success, -1 on failure. +int camera_preview_snap(const char* output_path); + +// Freeze the live feed (without capturing). Shows "FROZEN" badge. +// Returns 0 on success, -1 on failure. +int camera_preview_freeze(void); + +// Resume the live camera feed after a freeze. +// Returns 0 on success, -1 on failure. 
// camera_preview — parent-side controller for the `rcli_camera_preview`
// helper process (floating live-camera window).
//
// The helper is spawned with its stdin/stdout connected to pipes. Commands
// are single text lines written to its stdin; every command except "quit" is
// answered with one line on its stdout ("ok" on success).
//
// Threading: the process/pipe state below is not locked. Only
// camera_preview_active() may be called concurrently with the other
// functions; callers must serialise everything else.

#include <atomic>
#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string>

#include <fcntl.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#ifdef __APPLE__
#include <mach-o/dyld.h>
#endif

namespace {

pid_t g_cam_pid = 0;             // helper pid, 0 when no helper is running
FILE* g_cam_stdin = nullptr;     // write end: commands to the helper
FILE* g_cam_stdout = nullptr;    // read end: replies from the helper
std::atomic<bool> g_cam_active{false};  // true once the helper said it is ready

// Locate the helper next to the running executable; fall back to the bare
// name, which camera_preview_start() resolves through PATH.
std::string find_camera_preview_binary() {
    char path[4096];
    bool have_path = false;
#ifdef __APPLE__
    uint32_t size = sizeof(path);
    have_path = (_NSGetExecutablePath(path, &size) == 0);
#else
    // Non-Apple hosts (e.g. host-side unit tests of this translation unit).
    const ssize_t len = readlink("/proc/self/exe", path, sizeof(path) - 1);
    if (len > 0) {
        path[len] = '\0';
        have_path = true;
    }
#endif
    if (have_path) {
        const std::string exe(path);
        const auto slash = exe.rfind('/');
        if (slash != std::string::npos) {
            const std::string candidate = exe.substr(0, slash + 1) + "rcli_camera_preview";
            if (access(candidate.c_str(), X_OK) == 0) return candidate;
        }
    }
    return "rcli_camera_preview";
}

// Close both pipe streams (idempotent).
void close_streams() {
    if (g_cam_stdin)  { fclose(g_cam_stdin);  g_cam_stdin = nullptr; }
    if (g_cam_stdout) { fclose(g_cam_stdout); g_cam_stdout = nullptr; }
}

// Non-blocking liveness check. If the helper has already exited it is reaped
// and all state is reset. Returns true when no live helper remains.
bool reap_if_exited() {
    if (g_cam_pid <= 0) return true;
    int status = 0;
    if (waitpid(g_cam_pid, &status, WNOHANG) == 0) return false;  // still running
    g_cam_pid = 0;
    g_cam_active.store(false);
    close_streams();
    return true;
}

// Send one command line and return the helper's one-line reply ("" on any
// failure). The liveness check avoids writing into a pipe whose reader is
// gone, which would raise SIGPIPE in the host process.
std::string cam_cmd(const char* cmd) {
    if (!cmd || !g_cam_stdin || !g_cam_stdout) return "";
    if (reap_if_exited()) return "";
    if (fprintf(g_cam_stdin, "%s\n", cmd) < 0 || fflush(g_cam_stdin) != 0) return "";
    char buf[256] = {0};
    if (!fgets(buf, sizeof(buf), g_cam_stdout)) return "";
    buf[strcspn(buf, "\r\n")] = '\0';
    return std::string(buf);
}

// A path can be sent only if it is non-empty and fits on one protocol line;
// an embedded newline would be read by the helper as a second command.
bool is_sendable_path(const char* path) {
    return path && path[0] != '\0' && strpbrk(path, "\r\n") == nullptr;
}

// Send "<verb> <path>" and map the reply to 0 / -1.
int send_path_command(const char* verb, const char* output_path) {
    if (!g_cam_active.load() || !is_sendable_path(output_path)) return -1;
    const std::string cmd = std::string(verb) + " " + output_path;
    return cam_cmd(cmd.c_str()) == "ok" ? 0 : -1;
}

// Send an argument-less command and map the reply to 0 / -1.
int send_simple_command(const char* verb) {
    if (!g_cam_active.load()) return -1;
    return cam_cmd(verb) == "ok" ? 0 : -1;
}

void close_fd_pair(int fds[2]) {
    close(fds[0]);
    close(fds[1]);
}

}  // namespace

extern "C" {

// Stop the camera preview window and clean up. No-op when nothing is running.
void camera_preview_stop(void) {
    if (g_cam_pid <= 0) return;

    // Ask politely only if the helper completed its handshake; a helper that
    // never started must not be written to (its pipe has no reader).
    if (g_cam_active.load()) cam_cmd("quit");
    g_cam_active.store(false);

    // Closing stdin delivers EOF, which also makes the helper terminate.
    close_streams();

    if (g_cam_pid > 0) {  // cam_cmd may already have reaped it
        int status = 0;
        while (waitpid(g_cam_pid, &status, 0) < 0 && errno == EINTR) {
        }
        g_cam_pid = 0;
    }
}

// Launch the camera preview window (floating PIP with live feed).
// Returns 0 on success (or if already running), -1 on failure.
// Blocks until the helper prints its first line or exits.
int camera_preview_start(void) {
    if (g_cam_pid > 0 && !reap_if_exited()) return 0;

    const std::string binary = find_camera_preview_binary();

    int to_child[2] = {-1, -1};
    int from_child[2] = {-1, -1};
    if (pipe(to_child) != 0) return -1;
    if (pipe(from_child) != 0) {
        close_fd_pair(to_child);
        return -1;
    }

    const pid_t pid = fork();
    if (pid < 0) {
        close_fd_pair(to_child);
        close_fd_pair(from_child);
        return -1;
    }

    if (pid == 0) {
        // Child: wire the pipes to stdin/stdout, silence stderr, exec.
        close(to_child[1]);
        close(from_child[0]);
        dup2(to_child[0], STDIN_FILENO);
        dup2(from_child[1], STDOUT_FILENO);
        close(to_child[0]);
        close(from_child[1]);
        const int devnull = open("/dev/null", O_WRONLY);
        if (devnull >= 0) {
            dup2(devnull, STDERR_FILENO);
            close(devnull);
        }
        // execlp: a name without '/' (the fallback) is looked up in PATH.
        execlp(binary.c_str(), "rcli_camera_preview", static_cast<char*>(nullptr));
        _exit(127);
    }

    // Parent.
    close(to_child[0]);
    close(from_child[1]);
    // Keep our pipe ends out of any process spawned later; an inherited write
    // end would stop the helper from ever seeing EOF on its stdin.
    fcntl(to_child[1], F_SETFD, FD_CLOEXEC);
    fcntl(from_child[0], F_SETFD, FD_CLOEXEC);

    g_cam_pid = pid;
    g_cam_stdin = fdopen(to_child[1], "w");
    if (!g_cam_stdin) close(to_child[1]);
    g_cam_stdout = fdopen(from_child[0], "r");
    if (!g_cam_stdout) close(from_child[0]);

    // Handshake: the helper prints one line once its window is up.
    char buf[64] = {0};
    if (g_cam_stdin && g_cam_stdout && fgets(buf, sizeof(buf), g_cam_stdout)) {
        g_cam_active.store(true);
        return 0;
    }

    camera_preview_stop();
    return -1;
}

// Returns 1 if the camera preview is currently running.
int camera_preview_active(void) {
    return g_cam_active.load() ? 1 : 0;
}

// Freeze the live feed and capture the current frame to a JPEG file.
// Returns 0 on success, -1 on failure (not running, bad path, helper error).
int camera_preview_capture(const char* output_path) {
    return send_path_command("capture", output_path);
}

// Capture the current frame to a JPEG file WITHOUT freezing the live feed.
// Returns 0 on success, -1 on failure.
int camera_preview_snap(const char* output_path) {
    return send_path_command("snap", output_path);
}

// Freeze the live feed (without capturing).
// Returns 0 on success, -1 on failure.
int camera_preview_freeze(void) {
    return send_simple_command("freeze");
}

// Resume the live camera feed after a freeze.
// Returns 0 on success, -1 on failure.
int camera_preview_unfreeze(void) {
    return send_simple_command("unfreeze");
}

}  // extern "C"
//
// Commands (one per line on stdin):
//   capture <path>  → freezes frame, saves JPEG to <path>, replies "ok\n"
//   snap <path>     → saves JPEG to <path> WITHOUT freezing, replies "ok\n"
//   freeze          → pauses the live feed on current frame, replies "ok\n"
//   unfreeze        → resumes live camera feed, replies "ok\n"
//   quit            → exits (no reply)
//
// Protocol: "ready\n" is printed once at startup. After that every command
// except quit is answered with exactly one line — "ok" or "error" — so the
// parent never blocks on a reply that is not coming. EOF on stdin also exits.
//
// Memory management: this file uses manual retain/release (no ARC).

#import <AppKit/AppKit.h>
#import <AVFoundation/AVFoundation.h>
#import <CoreImage/CoreImage.h>
#import <ImageIO/ImageIO.h>
#import <UniformTypeIdentifiers/UniformTypeIdentifiers.h>

// Write one protocol reply line to the parent.
static void SendReply(BOOL ok) {
    fputs(ok ? "ok\n" : "error\n", stdout);
    fflush(stdout);
}

// ── Camera preview view ───────────────────────────────────────────────

@interface CameraPreviewView : NSView <AVCaptureVideoDataOutputSampleBufferDelegate> {
    AVCaptureSession *_session;
    AVCaptureVideoDataOutput *_output;
    dispatch_queue_t _captureQueue;
    CIContext *_ciContext;
    CGImageRef _currentFrame;  // latest frame; guarded by _frameLock
    BOOL _frozen;              // guarded by _frameLock
    NSLock *_frameLock;
}
@property (nonatomic, strong) NSTextField *statusLabel;
- (void)startCamera;
- (void)stopCamera;
- (void)freeze;
- (void)unfreeze;
- (BOOL)saveFrameToPath:(NSString *)path;
@end

@implementation CameraPreviewView

- (instancetype)initWithFrame:(NSRect)frame {
    self = [super initWithFrame:frame];
    if (self) {
        _frameLock = [[NSLock alloc] init];
        // +contextWithOptions: is autoreleased; retain it for the ivar.
        _ciContext = [[CIContext contextWithOptions:nil] retain];
        _frozen = NO;
        _currentFrame = NULL;
        self.wantsLayer = YES;
        self.layer.cornerRadius = 12;
        self.layer.masksToBounds = YES;
        self.layer.backgroundColor = [NSColor blackColor].CGColor;

        _statusLabel = [[NSTextField alloc] initWithFrame:NSZeroRect];
        _statusLabel.stringValue = @"  RCLI Camera  ";
        _statusLabel.font = [NSFont systemFontOfSize:11 weight:NSFontWeightHeavy];
        _statusLabel.textColor = [NSColor blackColor];
        _statusLabel.backgroundColor = [NSColor colorWithRed:0.1 green:0.85 blue:0.4 alpha:1.0];
        _statusLabel.bezeled = NO;
        _statusLabel.editable = NO;
        _statusLabel.selectable = NO;
        _statusLabel.alignment = NSTextAlignmentCenter;
        _statusLabel.wantsLayer = YES;
        _statusLabel.layer.cornerRadius = 8;
        _statusLabel.layer.masksToBounds = YES;
        [_statusLabel sizeToFit];
        [self addSubview:_statusLabel];

        [self startCamera];
    }
    return self;
}

// Centre the status badge at the top edge. The size is recomputed from the
// label's fitted size on every pass; deriving it from the current frame would
// add the padding again each time and make the badge grow without bound.
- (void)layout {
    [super layout];
    [_statusLabel sizeToFit];
    NSSize fit = _statusLabel.frame.size;
    CGFloat w = fit.width + 8;
    CGFloat h = fit.height + 4;
    CGFloat x = (self.bounds.size.width - w) / 2;
    CGFloat y = self.bounds.size.height - h - 8;
    _statusLabel.frame = NSMakeRect(x, y, w, h);
}

// Configure the default video device and start delivering BGRA frames to
// -captureOutput:didOutputSampleBuffer:fromConnection: on a serial queue.
// Returns silently (black window) when no camera/input is available.
- (void)startCamera {
    _session = [[AVCaptureSession alloc] init];
    _session.sessionPreset = AVCaptureSessionPresetHigh;

    AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
    if (!device) return;

    NSError *err = nil;
    AVCaptureDeviceInput *input = [AVCaptureDeviceInput deviceInputWithDevice:device error:&err];
    if (!input) return;
    if ([_session canAddInput:input]) [_session addInput:input];

    _output = [[AVCaptureVideoDataOutput alloc] init];
    _output.videoSettings = @{(id)kCVPixelBufferPixelFormatTypeKey: @(kCVPixelFormatType_32BGRA)};
    _output.alwaysDiscardsLateVideoFrames = YES;
    _captureQueue = dispatch_queue_create("camera.preview", DISPATCH_QUEUE_SERIAL);
    [_output setSampleBufferDelegate:self queue:_captureQueue];
    if ([_session canAddOutput:_output]) [_session addOutput:_output];

    [_session startRunning];
}

- (BOOL)isFrozen {
    [_frameLock lock];
    BOOL frozen = _frozen;
    [_frameLock unlock];
    return frozen;
}

// Capture-queue callback: convert the sample to a CGImage and publish it as
// the current frame unless the view is frozen.
- (void)captureOutput:(AVCaptureOutput *)output
didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer
       fromConnection:(AVCaptureConnection *)connection {
    if ([self isFrozen]) return;  // skip the conversion cost while frozen

    // Pool per frame: CIImage objects are autoreleased on this GCD thread.
    @autoreleasepool {
        CVImageBufferRef imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer);
        if (!imageBuffer) return;

        CIImage *ciImage = [CIImage imageWithCVImageBuffer:imageBuffer];
        CGImageRef cgImage = [_ciContext createCGImage:ciImage fromRect:ciImage.extent];
        if (!cgImage) return;

        // Re-check under the lock: a freeze issued while this frame was being
        // converted must win, so the frozen frame is never overwritten.
        CGImageRef discard = NULL;
        [_frameLock lock];
        BOOL accepted = !_frozen;
        if (accepted) {
            discard = _currentFrame;
            _currentFrame = cgImage;
        } else {
            discard = cgImage;
        }
        [_frameLock unlock];
        if (discard) CGImageRelease(discard);
        if (!accepted) return;

        dispatch_async(dispatch_get_main_queue(), ^{
            [self setNeedsDisplay:YES];
        });
    }
}

- (void)drawRect:(NSRect)dirtyRect {
    [[NSColor blackColor] set];
    NSRectFill(dirtyRect);

    [_frameLock lock];
    CGImageRef frame = _currentFrame;
    if (frame) CGImageRetain(frame);
    BOOL frozen = _frozen;
    [_frameLock unlock];

    if (frame) {
        NSGraphicsContext *ctx = [NSGraphicsContext currentContext];
        CGContextRef cgctx = (CGContextRef)[ctx CGContext];

        CGFloat imgW = CGImageGetWidth(frame);
        CGFloat imgH = CGImageGetHeight(frame);
        CGFloat viewW = self.bounds.size.width;
        CGFloat viewH = self.bounds.size.height;

        // Aspect-fill: scale so the image covers the view, centred.
        CGFloat scale = fmax(viewW / imgW, viewH / imgH);
        CGFloat drawW = imgW * scale;
        CGFloat drawH = imgH * scale;
        CGFloat drawX = (viewW - drawW) / 2;
        CGFloat drawY = (viewH - drawH) / 2;

        CGContextDrawImage(cgctx, CGRectMake(drawX, drawY, drawW, drawH), frame);
        CGImageRelease(frame);
    }

    if (frozen) {
        [[NSColor colorWithRed:1.0 green:0.3 blue:0.2 alpha:0.08] set];
        NSRectFillUsingOperation(self.bounds, NSCompositingOperationSourceOver);
    }

    // Green border
    NSColor *green = [NSColor colorWithRed:0.1 green:0.85 blue:0.4 alpha:1.0];
    NSBezierPath *border = [NSBezierPath bezierPathWithRoundedRect:NSInsetRect(self.bounds, 2, 2)
                                                           xRadius:12 yRadius:12];
    [border setLineWidth:4];
    [green set];
    [border stroke];
}

// Write the current frame to `path` as a JPEG (quality 0.92).
// Returns NO when there is no frame yet, the path is empty, or encoding fails.
- (BOOL)saveFrameToPath:(NSString *)path {
    if (path.length == 0) return NO;

    [_frameLock lock];
    CGImageRef frame = _currentFrame;
    if (frame) CGImageRetain(frame);
    [_frameLock unlock];

    if (!frame) return NO;

    NSURL *url = [NSURL fileURLWithPath:path];
    CGImageDestinationRef dest = CGImageDestinationCreateWithURL(
        (__bridge CFURLRef)url, (__bridge CFStringRef)UTTypeJPEG.identifier, 1, NULL);
    if (!dest) { CGImageRelease(frame); return NO; }

    NSDictionary *opts = @{(__bridge id)kCGImageDestinationLossyCompressionQuality: @(0.92)};
    CGImageDestinationAddImage(dest, frame, (__bridge CFDictionaryRef)opts);
    BOOL ok = CGImageDestinationFinalize(dest);
    CFRelease(dest);
    CGImageRelease(frame);
    return ok;
}

// Set the frozen flag (effective immediately for the capture queue) and
// update the badge on the main thread.
- (void)applyFrozen:(BOOL)frozen {
    [_frameLock lock];
    _frozen = frozen;
    [_frameLock unlock];

    dispatch_async(dispatch_get_main_queue(), ^{
        if (frozen) {
            self.statusLabel.stringValue = @"  FROZEN  ";
            self.statusLabel.backgroundColor = [NSColor colorWithRed:1.0 green:0.3 blue:0.2 alpha:1.0];
        } else {
            self.statusLabel.stringValue = @"  RCLI Camera  ";
            self.statusLabel.backgroundColor = [NSColor colorWithRed:0.1 green:0.85 blue:0.4 alpha:1.0];
        }
        // Request a layout pass instead of calling -layout directly.
        self.needsLayout = YES;
        [self setNeedsDisplay:YES];
    });
}

- (void)freeze {
    [self applyFrozen:YES];
}

- (void)unfreeze {
    [self applyFrozen:NO];
}

- (void)stopCamera {
    [_session stopRunning];
}

- (void)dealloc {
    [_output setSampleBufferDelegate:nil queue:NULL];
    [_session stopRunning];

    [_frameLock lock];
    if (_currentFrame) CGImageRelease(_currentFrame);
    _currentFrame = NULL;
    [_frameLock unlock];

    [_output release];
    [_session release];
    if (_captureQueue) dispatch_release(_captureQueue);
    [_ciContext release];
    [_statusLabel release];
    [_frameLock release];
    [super dealloc];
}

@end

// ── Camera window ─────────────────────────────────────────────────────

@interface CameraWindow : NSWindow
- (instancetype)initWithRect:(NSRect)rect;
@end

@implementation CameraWindow

- (instancetype)initWithRect:(NSRect)rect {
    self = [super initWithContentRect:rect
                            styleMask:NSWindowStyleMaskBorderless |
                                      NSWindowStyleMaskResizable
                              backing:NSBackingStoreBuffered
                                defer:NO];
    if (self) {
        self.opaque = NO;
        self.backgroundColor = [NSColor clearColor];
        self.level = NSFloatingWindowLevel;
        self.hasShadow = YES;
        self.movableByWindowBackground = YES;
        self.collectionBehavior = NSWindowCollectionBehaviorCanJoinAllSpaces |
                                  NSWindowCollectionBehaviorStationary;
        self.minSize = NSMakeSize(240, 180);

        CameraPreviewView *preview = [[CameraPreviewView alloc] initWithFrame:rect];
        self.contentView = preview;
        [preview release];  // the window now owns it
    }
    return self;
}

- (BOOL)canBecomeKeyWindow { return YES; }
- (BOOL)canBecomeMainWindow { return NO; }

@end

// ── Stdin reader ──────────────────────────────────────────────────────

@interface StdinReader : NSObject
@property (nonatomic, strong) CameraWindow *window;
- (void)startReading;
- (void)handleCommand:(NSString *)cmd;
@end

@implementation StdinReader

// Read commands on a background queue and run each one synchronously on the
// main thread. Terminates the app when stdin reaches EOF.
- (void)startReading {
    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
        char *line = NULL;
        size_t cap = 0;
        // getline: no fixed buffer, so a long path is never split in two.
        while (getline(&line, &cap, stdin) > 0) {
            @autoreleasepool {
                NSString *raw = [NSString stringWithUTF8String:line];
                if (!raw) {
                    // Not valid UTF-8: still answer, the parent is waiting.
                    SendReply(NO);
                    continue;
                }
                NSString *cmd = [raw stringByTrimmingCharactersInSet:
                                        [NSCharacterSet whitespaceAndNewlineCharacterSet]];
                if (cmd.length == 0) continue;
                [self performSelectorOnMainThread:@selector(handleCommand:)
                                       withObject:cmd
                                    waitUntilDone:YES];
            }
        }
        free(line);
        dispatch_async(dispatch_get_main_queue(), ^{
            [(CameraPreviewView *)self.window.contentView stopCamera];
            [NSApp terminate:nil];
        });
    });
}

// Execute one command (main thread) and send its reply.
- (void)handleCommand:(NSString *)cmd {
    CameraPreviewView *preview = (CameraPreviewView *)self.window.contentView;

    if ([cmd hasPrefix:@"capture "]) {
        NSString *path = [cmd substringFromIndex:8];
        // -freeze takes effect before it returns, so the frame saved below is
        // the frame that stays on screen.
        [preview freeze];
        SendReply([preview saveFrameToPath:path]);
    } else if ([cmd hasPrefix:@"snap "]) {
        NSString *path = [cmd substringFromIndex:5];
        SendReply([preview saveFrameToPath:path]);
    } else if ([cmd isEqualToString:@"freeze"]) {
        [preview freeze];
        SendReply(YES);
    } else if ([cmd isEqualToString:@"unfreeze"]) {
        [preview unfreeze];
        SendReply(YES);
    } else if ([cmd isEqualToString:@"quit"]) {
        [preview stopCamera];
        [NSApp terminate:nil];
    } else {
        SendReply(NO);  // unknown command
    }
}

@end

// ── Main ──────────────────────────────────────────────────────────────

int main(int argc, const char *argv[]) {
    (void)argc;
    (void)argv;
    @autoreleasepool {
        NSApplication *app = [NSApplication sharedApplication];
        [app setActivationPolicy:NSApplicationActivationPolicyAccessory];

        // Top-right corner of the main screen. Use the screen's origin as
        // well as its size: on multi-display setups it is not always (0,0).
        NSScreen *scr = [NSScreen mainScreen];
        if (!scr) scr = [[NSScreen screens] firstObject];
        NSRect sf = scr ? scr.frame : NSMakeRect(0, 0, 1280, 800);
        CGFloat w = 480, h = 360;
        CGFloat x = NSMaxX(sf) - w - 24;
        CGFloat y = NSMaxY(sf) - h - 60;

        CameraWindow *win = [[CameraWindow alloc]
            initWithRect:NSMakeRect(x, y, w, h)];
        [win makeKeyAndOrderFront:nil];
        [app activateIgnoringOtherApps:YES];

        StdinReader *reader = [[StdinReader alloc] init];
        reader.window = win;
        [reader startReading];

        // Handshake line the parent waits for before sending commands.
        printf("ready\n");
        fflush(stdout);

        [app run];
    }
    return 0;
}
= [NSBezierPath bezierPathWithRoundedRect:innerHL + xRadius:kRadius - 1.5 yRadius:kRadius - 1.5]; + [highlight setLineWidth:1.0]; + [[NSColor colorWithWhite:1.0 alpha:0.15] set]; + [highlight stroke]; + + // Corner handles — large rounded squares with shadow + white center dot CGFloat hs = kHandle; CGFloat off = kBorder / 2; NSRect corners[4] = { @@ -51,20 +65,50 @@ - (void)drawRect:(NSRect)dirtyRect { NSMakeRect(NSMaxX(inner) + off - hs, NSMaxY(inner) + off - hs, hs, hs), }; for (int i = 0; i < 4; i++) { + // Drop shadow + NSRect shadowRect = NSOffsetRect(corners[i], 0, -1); + NSBezierPath *shadow = [NSBezierPath bezierPathWithRoundedRect:shadowRect + xRadius:6 yRadius:6]; + [[NSColor colorWithWhite:0.0 alpha:0.25] set]; + [shadow fill]; + + // Handle body NSBezierPath *h = [NSBezierPath bezierPathWithRoundedRect:corners[i] - xRadius:4 yRadius:4]; + xRadius:6 yRadius:6]; [green set]; [h fill]; + + // White border on handle + [h setLineWidth:1.5]; + [[NSColor colorWithWhite:1.0 alpha:0.4] set]; + [h stroke]; + // White center dot - NSRect dot = NSInsetRect(corners[i], 5, 5); - [[NSColor colorWithWhite:1.0 alpha:0.85] set]; + NSRect dot = NSInsetRect(corners[i], hs * 0.3, hs * 0.3); + [[NSColor colorWithWhite:1.0 alpha:0.9] set]; [[NSBezierPath bezierPathWithOvalInRect:dot] fill]; } + // Edge midpoint handles — small bars to hint at edge dragging + CGFloat eh = 5.0; // half-thickness + CGFloat el = 32.0; // bar length + NSRect edges[4] = { + NSMakeRect(NSMidX(inner) - el/2, NSMaxY(inner) - eh/2, el, eh), // top + NSMakeRect(NSMidX(inner) - el/2, NSMinY(inner) - eh/2, el, eh), // bottom + NSMakeRect(NSMinX(inner) - eh/2, NSMidY(inner) - el/2, eh, el), // left + NSMakeRect(NSMaxX(inner) - eh/2, NSMidY(inner) - el/2, eh, el), // right + }; + for (int i = 0; i < 4; i++) { + NSBezierPath *ep = [NSBezierPath bezierPathWithRoundedRect:edges[i] + xRadius:2.5 yRadius:2.5]; + [[green colorWithAlphaComponent:0.7] set]; + [ep fill]; + } + // Label pill — centered at top 
NSString *label = @" RCLI Visual Mode "; NSDictionary *attrs = @{ - NSFontAttributeName: [NSFont systemFontOfSize:11 weight:NSFontWeightBold], + NSFontAttributeName: [NSFont systemFontOfSize:12 weight:NSFontWeightHeavy], NSForegroundColorAttributeName: [NSColor blackColor], }; NSSize sz = [label sizeWithAttributes:attrs]; diff --git a/src/cli/main.cpp b/src/cli/main.cpp index 58cd4e1..a5c773d 100644 --- a/src/cli/main.cpp +++ b/src/cli/main.cpp @@ -486,9 +486,8 @@ static int cmd_vlm(const Args& args) { fprintf(stderr, "%sInitializing VLM...%s\n", color::dim, color::reset); if (rcli_vlm_init(g_engine) != 0) { fprintf(stderr, "\n%s%s VLM not available.%s\n\n", color::bold, color::red, color::reset); - fprintf(stderr, " VLM requires the llama.cpp engine and a VLM model.\n"); - fprintf(stderr, " Switch engine: %srcli engine llamacpp%s\n", color::bold, color::reset); - fprintf(stderr, " Download model: %srcli models vlm%s\n\n", color::bold, color::reset); + fprintf(stderr, " No VLM model found. Download one:\n"); + fprintf(stderr, " %srcli models vlm%s\n\n", color::bold, color::reset); rcli_destroy(g_engine); return 1; } @@ -548,9 +547,8 @@ static int cmd_camera(const Args& args) { if (rcli_vlm_init(g_engine) != 0) { fprintf(stderr, "\n%s%s VLM not available.%s\n\n", color::bold, color::red, color::reset); - fprintf(stderr, " VLM requires the llama.cpp engine and a VLM model.\n"); - fprintf(stderr, " Switch engine: %srcli engine llamacpp%s\n", color::bold, color::reset); - fprintf(stderr, " Download model: %srcli models vlm%s\n\n", color::bold, color::reset); + fprintf(stderr, " No VLM model found. 
Download one:\n"); + fprintf(stderr, " %srcli models vlm%s\n\n", color::bold, color::reset); rcli_destroy(g_engine); return 1; } @@ -618,9 +616,8 @@ static int cmd_screen(const Args& args) { if (rcli_vlm_init(g_engine) != 0) { fprintf(stderr, "\n%s%s VLM not available.%s\n\n", color::bold, color::red, color::reset); - fprintf(stderr, " VLM requires the llama.cpp engine and a VLM model.\n"); - fprintf(stderr, " Switch engine: %srcli engine llamacpp%s\n", color::bold, color::reset); - fprintf(stderr, " Download model: %srcli models vlm%s\n\n", color::bold, color::reset); + fprintf(stderr, " No VLM model found. Download one:\n"); + fprintf(stderr, " %srcli models vlm%s\n\n", color::bold, color::reset); rcli_destroy(g_engine); return 1; }
diff --git a/src/cli/tui_app.h b/src/cli/tui_app.h
index 7b01d1e..1f1f651 100644
--- a/src/cli/tui_app.h
+++ b/src/cli/tui_app.h
@@ -14,6 +14,7 @@
 #include "engines/metalrt_loader.h"
 #include "engines/vlm_engine.h"
 #include "audio/camera_capture.h"
+#include "audio/camera_preview.h"
 #include "audio/screen_capture.h"
 #include "models/vlm_model_registry.h"
 #include "core/log.h"
@@ -439,9 +440,47 @@ class TuiApp {
             if (c == "r" || c == "R") { enter_rag_mode(); return true; }
             if (c == "d" || c == "D") { close_all_panels(); enter_cleanup_mode(); return true; }
             if (c == "p" || c == "P") { enter_personality_mode(); return true; }
-            // V key: capture photo from camera and analyze with VLM
+            // V key: toggle camera preview mode (live feed + auto VLM analysis)
             if (c == "v" || c == "V") {
-                run_camera_vlm("Describe what you see in this photo in detail.");
+                if (camera_preview_active()) {
+                    add_system_message("Closing camera...");
+                    screen_->Post(Event::Custom);
+                    stop_camera_auto_analysis();
+                    std::thread([this]() {
+                        camera_preview_stop();
+                        rcli_vlm_exit(engine_);
+                        add_system_message("Camera OFF");
+                        screen_->Post(Event::Custom);
+                    }).detach();
+                } else {
+                    add_system_message("Opening camera, loading VLM...");
+                    screen_->Post(Event::Custom);
+                    std::thread([this]() {
+                        if (rcli_vlm_init(engine_) != 0) {
+                            // Mirrors the CLI error text: VLM init is no longer tied to the llama.cpp engine.
+                            add_system_message("No VLM model found. Download one via [M] \xe2\x86\x92 VLM Models or run: rcli models vlm");
+                            screen_->Post(Event::Custom);
+                            return;
+                        }
+                        if (camera_preview_start() != 0) {
+                            // Camera mode never started, so release the VLM that was just loaded.
+                            rcli_vlm_exit(engine_);
+                            add_system_message("Camera preview failed. Check camera permissions in System Settings > Privacy & Security > Camera.");
+                            screen_->Post(Event::Custom);
+                            return;
+                        }
+                        const char* vbe = rcli_vlm_backend_name(engine_);
+                        const char* vmodel = rcli_vlm_model_name(engine_);
+                        std::string msg = "Camera LIVE";
+                        // Only vbe is checked below; never append a null vmodel to a std::string.
+                        if (vbe && vbe[0])
+                            msg += std::string(" \xe2\x80\x94 ") + (vmodel ? vmodel : "") + " via " + vbe;
+                        msg += ". Auto-analyzing every ~8s. Speak to ask a specific question";
+                        add_system_message(msg);
+                        screen_->Post(Event::Custom);
+                        start_camera_auto_analysis();
+                    }).detach();
+                }
                 return true;
             }
             // S key: toggle visual mode (VLM only on llama.cpp engine)
@@ -581,6 +620,12 @@ class TuiApp {
         std::string user_text = transcript;
         add_user_message(user_text);
 
+        // Camera preview: route voice to camera VLM analysis
+        if (camera_preview_active()) {
+            run_camera_preview_vlm(user_text);
+            return;
+        }
+
         // Visual mode: route voice to VLM screen analysis instead of LLM
         if (screen_capture_overlay_active()) {
             run_screen_vlm(user_text);
@@ -1117,9 +1162,16 @@ class TuiApp {
         else
             right.push_back(text("[A] actions ") | dim);
         right.push_back(text("[C] convo ") | dim);
-        right.push_back(text("[V] camera ") | dim);
+        if (camera_preview_active()) {
+            if (cam_auto_busy_.load())
+                right.push_back(text("[V] camera \xf0\x9f\x94\xb4 ") | ftxui::color(ftxui::Color::RedLight));
+            else
+                right.push_back(text("[V] camera LIVE ") | ftxui::color(ftxui::Color::Green));
+        } else {
+            right.push_back(text("[V] camera ") | dim);
+        }
         if (screen_capture_overlay_active())
-            right.push_back(text("[S] visual ● ") | ftxui::color(ftxui::Color::Green));
+            right.push_back(text("[S] visual \xe2\x97\x8f ") | ftxui::color(ftxui::Color::Green));
         else
             right.push_back(text("[S] visual ") | dim);
         right.push_back(text("[R] RAG ") | dim);
@@ -2234,6 +2286,128 @@ class TuiApp {
     // process_input
     // ====================================================================
 
+    // Start the background worker that periodically snapshots the live camera
+    // preview, asks the VLM for a short description, then shows and speaks it.
+    // Runs until stop_camera_auto_analysis() is called or the preview closes.
+    void start_camera_auto_analysis() {
+        // A worker that exited on its own (preview closed externally) is still
+        // joinable, and assigning to a joinable std::thread calls std::terminate.
+        stop_camera_auto_analysis();
+        cam_auto_running_.store(true);
+        cam_auto_busy_.store(false);
+        cam_auto_thread_ = std::thread([this]() {
+            // Small initial delay to let the camera warm up
+            for (int i = 0; i < 4 && cam_auto_running_.load(); i++)
+                std::this_thread::sleep_for(std::chrono::milliseconds(500));
+
+            while (cam_auto_running_.load()) {
+                if (!camera_preview_active()) break;
+                // Skip if voice/text analysis is in progress, check again in 500ms
+                if (cam_auto_busy_.load() || voice_state_.load() != VoiceState::IDLE) {
+                    std::this_thread::sleep_for(std::chrono::milliseconds(500));
+                    continue;
+                }
+
+                cam_auto_busy_.store(true);
+                std::string photo_path = "/tmp/rcli_cam_auto_" +
+                    std::to_string(std::chrono::system_clock::now().time_since_epoch().count()) + ".jpg";
+
+                if (camera_preview_snap(photo_path.c_str()) != 0) {
+                    cam_auto_busy_.store(false);
+                    // Back off before retrying so a failing camera cannot spin this loop.
+                    std::this_thread::sleep_for(std::chrono::milliseconds(500));
+                    continue;
+                }
+
+                voice_state_ = VoiceState::THINKING;
+                screen_->Post(Event::Custom);
+
+                std::string accumulated;
+                auto stream_cb = [](const char* event, const char* data, void* ud) {
+                    auto* accum = static_cast<std::string*>(ud);
+                    if (event && data && std::strcmp(event, "token") == 0)
+                        accum->append(data);
+                };
+                int vlm_rc = rcli_vlm_analyze_stream(engine_, photo_path.c_str(),
+                    "Briefly describe what you see. Focus on what's new or interesting. Be concise (1-2 sentences).",
+                    stream_cb, &accumulated);
+
+                if (vlm_rc == 0 && !accumulated.empty()) {
+                    add_response(accumulated, "VLM \xf0\x9f\x93\xb7");
+                    voice_state_ = VoiceState::SPEAKING;
+                    screen_->Post(Event::Custom);
+                    rcli_speak_streaming(engine_, accumulated.c_str(), nullptr, nullptr);
+                }
+
+                voice_state_ = VoiceState::IDLE;
+                cam_auto_busy_.store(false);
+                screen_->Post(Event::Custom);
+
+                // Brief cooldown after analysis before next cycle
+                for (int i = 0; i < 4 && cam_auto_running_.load(); i++)
+                    std::this_thread::sleep_for(std::chrono::milliseconds(500));
+            }
+        });
+    }
+
+    // Ask the auto-analysis worker to stop and wait for it to finish.
+    // Does nothing beyond clearing the flag when no worker is joinable.
+    void stop_camera_auto_analysis() {
+        cam_auto_running_.store(false);
+        if (cam_auto_thread_.joinable())
+            cam_auto_thread_.join();
+    }
+
+    // User-initiated camera analysis (voice/text) — pauses auto, runs targeted query
+    void run_camera_preview_vlm(const std::string& prompt) {
+        cam_auto_busy_.store(true);
+        add_system_message("Analyzing camera feed...");
+        voice_state_ = VoiceState::THINKING;
+        std::string prompt_copy = prompt;
+        std::thread([this, prompt_copy]() {
+            std::string photo_path = "/tmp/rcli_cam_" +
+                std::to_string(std::chrono::system_clock::now().time_since_epoch().count()) + ".jpg";
+
+            if (camera_preview_snap(photo_path.c_str()) != 0) {
+                add_response("(Camera capture failed.)", "");
+                voice_state_ = VoiceState::IDLE;
+                cam_auto_busy_.store(false);
+                screen_->Post(Event::Custom);
+                return;
+            }
+
+            std::string accumulated;
+            auto stream_cb = [](const char* event, const char* data, void* ud) {
+                auto* accum = static_cast<std::string*>(ud);
+                if (event && data && std::strcmp(event, "token") == 0) {
+                    accum->append(data);
+                }
+            };
+            int vlm_rc = rcli_vlm_analyze_stream(engine_, photo_path.c_str(),
+                prompt_copy.c_str(), stream_cb, &accumulated);
+
+            if (vlm_rc == 0 && !accumulated.empty()) {
+                add_response(accumulated, "VLM");
+                voice_state_ = VoiceState::SPEAKING;
+                screen_->Post(Event::Custom);
+                rcli_speak_streaming(engine_, accumulated.c_str(), nullptr, nullptr);
+                RCLIVlmStats stats;
+                if (rcli_vlm_get_stats(engine_, &stats) == 0) {
+                    char buf[128];
+                    snprintf(buf, sizeof(buf), "\xe2\x9a\xa1 %.1f tok/s | %d tokens | %.1fs total",
+                        stats.gen_tok_per_sec, stats.generated_tokens, stats.total_time_sec);
+                    add_system_message(buf);
+                }
+            } else {
+                add_response("(VLM analysis failed.)", "");
+            }
+
+            voice_state_ = VoiceState::IDLE;
+            cam_auto_busy_.store(false);
+            screen_->Post(Event::Custom);
+        }).detach();
+    }
+
     void run_camera_vlm(const std::string& prompt) {
         add_system_message("Capturing photo from camera...");
         voice_state_ = VoiceState::THINKING;
@@ -2383,6 +2557,9 @@ class TuiApp {
             add_system_message(" P Personality");
             add_system_message(" R RAG panel");
             add_system_message(" D Delete / cleanup models");
+            add_system_message("--- Vision ---");
+            add_system_message(" V Camera preview (toggle live feed + VLM)");
+            add_system_message(" S Visual mode (screen overlay + VLM)");
             add_system_message("--- Toggles ---");
             add_system_message(" T Tool call trace (show tool calls & results)");
 
@@ -2421,7 +2598,10 @@ class TuiApp {
         }
 
         if (cmd == "camera" || cmd == "photo" || cmd == "webcam") {
-            run_camera_vlm("Describe what you see in this photo in detail.");
+            if (camera_preview_active())
+                run_camera_preview_vlm("Describe what you see in this photo in detail.");
+            else
+                run_camera_vlm("Describe what you see in this photo in detail.");
             return;
         }
 
@@ -2607,6 +2787,12 @@ class TuiApp {
             }
         }
 
+        // Camera preview active: route typed questions to camera VLM
+        if (camera_preview_active()) {
+            run_camera_preview_vlm(input);
+            return;
+        }
+
         // Run LLM (or RAG+LLM) in background thread to keep UI responsive
         voice_state_ = VoiceState::THINKING;
         std::string input_copy = input;
@@ -2865,6 +3051,12 @@ class TuiApp {
 
     ftxui::Color personality_msg_color_;
 
+    // Camera auto-analysis state
+    std::atomic<bool> cam_auto_running_{false};
+    std::atomic<bool> cam_auto_busy_{false};
+    std::thread cam_auto_thread_;
+    std::string cam_last_snap_path_;
+
     // RAG panel state
     struct RagOption { std::string 
name, action; };
     bool rag_mode_ = false;
diff --git a/src/engines/metalrt_vlm_engine.cpp b/src/engines/metalrt_vlm_engine.cpp
new file mode 100644
index 0000000..b1e30ec
--- /dev/null
+++ b/src/engines/metalrt_vlm_engine.cpp
@@ -0,0 +1,267 @@
+#include "engines/metalrt_vlm_engine.h"
+#include "core/log.h"
+#include <chrono>
+#include <mutex>
+
+namespace rastack {
+
+namespace {
+
+using VisionOptions = MetalRTLoader::MetalRTVisionOptions;
+using VisionResult = MetalRTLoader::MetalRTVisionResult;
+using Clock = std::chrono::high_resolution_clock;
+
+// Milliseconds elapsed since `start`, as a fractional value for logging.
+double elapsed_ms(Clock::time_point start) {
+    return std::chrono::duration<double, std::milli>(Clock::now() - start).count();
+}
+
+// Per-call generation options derived from the engine configuration.
+VisionOptions make_options(const MetalRTVlmConfig& config) {
+    VisionOptions opts = {};
+    opts.max_tokens = config.max_tokens;
+    opts.top_k = config.top_k;
+    opts.temperature = config.temperature;
+    opts.think = false;
+    return opts;
+}
+
+// Copy the timing and token counters of a finished call into `stats`.
+void store_stats(MetalRTVlmStats& stats, const VisionResult& result) {
+    stats.vision_encode_ms = result.vision_encode_ms;
+    stats.prefill_ms = result.prefill_ms;
+    stats.decode_ms = result.decode_ms;
+    stats.tps = result.tps;
+    stats.prompt_tokens = result.prompt_tokens;
+    stats.generated_tokens = result.generated_tokens;
+}
+
+// Generated text of a result: `response` when non-empty, otherwise `text`.
+std::string result_text(const VisionResult& result) {
+    if (result.response && result.response[0]) return result.response;
+    if (result.text && result.text[0]) return result.text;
+    return "";
+}
+
+} // namespace
+
+// Load the MetalRT dylib if needed, create a VLM instance and load the model
+// from config.model_dir. Returns false (engine left uninitialized) on failure.
+bool MetalRTVlmEngine::init(const MetalRTVlmConfig& config) {
+    // A repeated init() must not leak the native instance of the previous one.
+    shutdown();
+
+    auto& loader = MetalRTLoader::instance();
+    if (!loader.is_loaded() && !loader.load()) {
+        LOG_ERROR("MetalRT-VLM", "dylib not loaded");
+        return false;
+    }
+
+    if (!loader.has_vision()) {
+        LOG_WARN("MetalRT-VLM", "Vision symbols not available in dylib — "
+                 "create=%p analyze=%p",
+                 (void*)loader.vision_create, (void*)loader.vision_analyze);
+        return false;
+    }
+
+    LOG_DEBUG("MetalRT-VLM", "Creating VLM instance via Metal GPU...");
+    const auto t_start = Clock::now();
+
+    handle_ = loader.vision_create();
+    if (!handle_) {
+        LOG_ERROR("MetalRT-VLM", "Failed to create VLM instance");
+        return false;
+    }
+
+    LOG_DEBUG("MetalRT-VLM", "Loading model from %s ...", config.model_dir.c_str());
+    if (!loader.vision_load(handle_, config.model_dir.c_str())) {
+        LOG_ERROR("MetalRT-VLM", "Failed to load model from %s", config.model_dir.c_str());
+        // shutdown() releases the half-built instance and null-checks vision_destroy.
+        shutdown();
+        return false;
+    }
+
+    config_ = config;
+    const double init_ms = elapsed_ms(t_start);
+    initialized_ = true;
+
+    // model_name()/device_name() turn missing symbols and null names into "".
+    const std::string mname = model_name();
+    const std::string dname = device_name();
+
+    LOG_DEBUG("MetalRT-VLM", "=== MetalRT VLM GPU VERIFICATION ===");
+    LOG_DEBUG("MetalRT-VLM", " Engine: VLM via libmetalrt.dylib (Metal GPU)");
+    LOG_DEBUG("MetalRT-VLM", " Model dir: %s", config.model_dir.c_str());
+    LOG_DEBUG("MetalRT-VLM", " Model: %s", mname.c_str());
+    LOG_DEBUG("MetalRT-VLM", " Device: %s", dname.c_str());
+    LOG_DEBUG("MetalRT-VLM", " Init time: %.1f ms", init_ms);
+    return true;
+}
+
+// Destroy the native instance (if any) and clear the cached statistics.
+void MetalRTVlmEngine::shutdown() {
+    if (handle_) {
+        auto& loader = MetalRTLoader::instance();
+        if (loader.vision_destroy) {
+            loader.vision_destroy(handle_);
+        }
+        handle_ = nullptr;
+    }
+    initialized_ = false;
+    stats_ = {};
+}
+
+// Forward to the optional vision_reset entry point while holding the GPU mutex.
+void MetalRTVlmEngine::reset() {
+    if (!initialized_ || !handle_) return;
+    auto& loader = MetalRTLoader::instance();
+    if (loader.vision_reset) {
+        std::lock_guard gpu_lock(loader.gpu_mutex());
+        loader.vision_reset(handle_);
+    }
+}
+
+// Blocking image + prompt analysis; returns "" when the engine is not initialized.
+std::string MetalRTVlmEngine::analyze_image(const std::string& image_path,
+                                            const std::string& prompt) {
+    if (!initialized_ || !handle_) return "";
+
+    auto& loader = MetalRTLoader::instance();
+
+    LOG_DEBUG("MetalRT-VLM", "analyze_image() → Metal GPU | image=%s prompt=%zu chars",
+              image_path.c_str(), prompt.size());
+
+    VisionOptions opts = make_options(config_);
+
+    const auto wall_start = Clock::now();
+    VisionResult result;
+    {
+        std::lock_guard gpu_lock(loader.gpu_mutex());
+        result = loader.vision_analyze(handle_, image_path.c_str(), prompt.c_str(), &opts);
+    }
+    const double wall_ms = elapsed_ms(wall_start);
+
+    store_stats(stats_, result);
+    std::string text = result_text(result);
+
+    LOG_DEBUG("MetalRT-VLM", "=== VLM ANALYSIS [Metal GPU] ===");
+    LOG_DEBUG("MetalRT-VLM", " Vision encode: %.1f ms", result.vision_encode_ms);
+    LOG_DEBUG("MetalRT-VLM", " Prefill: %.1f ms (%d tokens)", result.prefill_ms, result.prompt_tokens);
+    LOG_DEBUG("MetalRT-VLM", " Decode: %.1f ms (%d tokens)", result.decode_ms, result.generated_tokens);
+    LOG_DEBUG("MetalRT-VLM", " TPS: %.1f tok/s", result.tps);
+    LOG_DEBUG("MetalRT-VLM", " Wall time: %.1f ms", wall_ms);
+
+    // The text was copied out above, so the native result can be released.
+    if (loader.vision_free_result)
+        loader.vision_free_result(result);
+
+    return text;
+}
+
+// Like analyze_image(), additionally passing each generated piece to on_token.
+std::string MetalRTVlmEngine::analyze_image_stream(const std::string& image_path,
+                                                   const std::string& prompt,
+                                                   TokenCallback on_token) {
+    if (!initialized_ || !handle_) return "";
+
+    auto& loader = MetalRTLoader::instance();
+    if (!loader.vision_analyze_stream) {
+        // No streaming entry point: run the blocking call and deliver the whole
+        // answer as one token, so callers that only collect tokens still get text.
+        std::string text = analyze_image(image_path, prompt);
+        if (on_token && !text.empty()) {
+            TokenOutput tok;
+            tok.text = text.c_str();
+            on_token(tok);
+        }
+        return text;
+    }
+
+    LOG_DEBUG("MetalRT-VLM", "analyze_image_stream() → Metal GPU | image=%s", image_path.c_str());
+
+    VisionOptions opts = make_options(config_);
+
+    // Bridge TokenCallback to MetalRTStreamCb
+    struct StreamCtx {
+        TokenCallback cb;
+    };
+    StreamCtx ctx{on_token};
+
+    MetalRTStreamCb stream_cb = nullptr;
+    if (on_token) {
+        stream_cb = [](const char* piece, void* user_data) -> bool {
+            auto* sctx = static_cast<StreamCtx*>(user_data);
+            // Skip null pieces instead of assigning a null C string to the token.
+            if (piece && sctx->cb) {
+                TokenOutput tok;
+                tok.text = piece;
+                sctx->cb(tok);
+            }
+            return true; // continue generation
+        };
+    }
+
+    const auto wall_start = Clock::now();
+    VisionResult result;
+    {
+        std::lock_guard gpu_lock(loader.gpu_mutex());
+        result = loader.vision_analyze_stream(handle_, image_path.c_str(), prompt.c_str(),
+                                              stream_cb, &ctx, &opts);
+    }
+    const double wall_ms = elapsed_ms(wall_start);
+
+    store_stats(stats_, result);
+    std::string text = result_text(result);
+
+    LOG_DEBUG("MetalRT-VLM", " Stream complete: %.1f ms, %d tokens, %.1f tok/s",
+              wall_ms, result.generated_tokens, result.tps);
+
+    if (loader.vision_free_result)
+        loader.vision_free_result(result);
+
+    return text;
+}
+
+// Text-only generation; returns "" if vision_generate is unavailable.
+std::string MetalRTVlmEngine::generate(const std::string& prompt) {
+    if (!initialized_ || !handle_) return "";
+
+    auto& loader = MetalRTLoader::instance();
+    if (!loader.vision_generate) return "";
+
+    VisionOptions opts = make_options(config_);
+
+    VisionResult result;
+    {
+        std::lock_guard gpu_lock(loader.gpu_mutex());
+        result = loader.vision_generate(handle_, prompt.c_str(), &opts);
+    }
+
+    store_stats(stats_, result);
+    std::string text = result_text(result);
+
+    if (loader.vision_free_result)
+        loader.vision_free_result(result);
+
+    return text;
+}
+
+// Name reported by the loaded model, or "" when unavailable.
+std::string MetalRTVlmEngine::model_name() const {
+    if (!initialized_ || !handle_) return "";
+    auto& loader = MetalRTLoader::instance();
+    if (!loader.vision_model_name) return "";
+    const char* name = loader.vision_model_name(handle_);
+    return name ? name : "";
+}
+
+// Name of the device reported by the runtime, or "" when unavailable.
+std::string MetalRTVlmEngine::device_name() const {
+    if (!initialized_ || !handle_) return "";
+    auto& loader = MetalRTLoader::instance();
+    if (!loader.vision_device_name) return "";
+    const char* name = loader.vision_device_name(handle_);
+    return name ? name : "";
+}
+
+} // namespace rastack
\ No newline at end of file
diff --git a/src/engines/metalrt_vlm_engine.h b/src/engines/metalrt_vlm_engine.h new file mode 100644 index 0000000..1dee0cb --- /dev/null +++ b/src/engines/metalrt_vlm_engine.h @@ -0,0 +1,64 @@ +#pragma once + +#include "engines/metalrt_loader.h" +#include "core/types.h" +#include +#include +#include + +namespace rastack { + +struct MetalRTVlmConfig { + std::string model_dir; + int max_tokens = 512; + int top_k = 40; + float temperature = 0.7f; +}; + +struct MetalRTVlmStats { + double vision_encode_ms = 0; + double prefill_ms = 0; + double decode_ms = 0; + double tps = 0; + int prompt_tokens = 0; + int generated_tokens = 0; +}; + +class MetalRTVlmEngine { +public: + MetalRTVlmEngine() = default; + ~MetalRTVlmEngine() { shutdown(); } + + MetalRTVlmEngine(const MetalRTVlmEngine&) = delete; + MetalRTVlmEngine& operator=(const MetalRTVlmEngine&) = delete; + + bool init(const MetalRTVlmConfig& config); + void shutdown(); + void reset(); + + // Analyze an image with a text prompt (blocking) + std::string analyze_image(const std::string& image_path, + const std::string& prompt); + + // Analyze with streaming token callback + std::string analyze_image_stream(const std::string& image_path, + const std::string& prompt, + TokenCallback on_token); + + // Text-only generation (follow-up without new image) + std::string generate(const 
std::string& prompt); + + std::string model_name() const; + std::string device_name() const; + + bool is_initialized() const { return initialized_; } + const MetalRTVlmStats& last_stats() const { return stats_; } + +private: + void* handle_ = nullptr; + MetalRTVlmConfig config_; + MetalRTVlmStats stats_; + bool initialized_ = false; +}; + +} // namespace rastack \ No newline at end of file