From f6b0696f81ba52ad46779f3a9700ea655bd85720 Mon Sep 17 00:00:00 2001 From: James Casey Date: Sat, 28 Feb 2026 17:25:00 +0000 Subject: [PATCH 1/2] chore: increase workspace readiness timeout to 60s and add startup log MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Double READINESS_PROBE_MAX_RETRIES from 150 to 300 (30s → 60s budget) to handle loaded CI runners after workspace integration tests - Redirect BEAM node stderr to {workspace_dir}/startup.log instead of /dev/null, giving crash reports and OTP error_logger output on failure - Distinguish crash vs slow-start in the timeout error: check PID liveness via sysinfo and report actionable guidance + log file path in both cases Fixes intermittent MCP integration test failures in CI (run 22522874219) where the 30s readiness budget was exhausted after 12 BEAM nodes were started/killed by the workspace integration tests running immediately before. Co-Authored-By: Claude Sonnet 4.6 --- .../src/commands/workspace/process.rs | 73 +++++++++++++++++-- 1 file changed, 65 insertions(+), 8 deletions(-) diff --git a/crates/beamtalk-cli/src/commands/workspace/process.rs b/crates/beamtalk-cli/src/commands/workspace/process.rs index 3ae4330c6..54bb6e03f 100644 --- a/crates/beamtalk-cli/src/commands/workspace/process.rs +++ b/crates/beamtalk-cli/src/commands/workspace/process.rs @@ -68,8 +68,12 @@ const READINESS_PROBE_DELAY_MS: u64 = 200; /// /// When the port is not yet bound, `connect()` returns ECONNREFUSED immediately, /// so the real budget is RETRIES × `DELAY_MS` (not RETRIES × `connect_timeout`). -/// 150 × 200ms = 30s budget for the BEAM node to start listening. -const READINESS_PROBE_MAX_RETRIES: usize = 150; +/// 300 × 200ms = 60s budget for the BEAM node to start listening. +/// +/// 60s is needed because CI runners can be heavily loaded immediately after the +/// workspace integration tests run (which start/stop 12 BEAM nodes), slowing +/// workspace startup enough to exceed the previous 30s budget. +const READINESS_PROBE_MAX_RETRIES: usize = 300; /// TCP read timeout for readiness probe in milliseconds. const READINESS_READ_TIMEOUT_MS: u64 = 500; @@ -305,6 +309,10 @@ pub fn start_detached_node( // Write cookie to args file (BT-726: not visible in `ps aux`) let cookie_args_file = write_cookie_args_file(workspace_id, &cookie)?; + // Redirect BEAM node stderr to a log file for crash diagnostics. + // On startup failure the log will contain OTP crash reports or error_logger output. + let startup_log_path = workspace_dir(workspace_id)?.join("startup.log"); + // Start detached BEAM node let mut cmd = build_detached_node_command( &node_name, @@ -320,6 +328,17 @@ pub fn start_detached_node( ssl_dist_optfile, ); + // Override the default /dev/null stderr with a workspace log file. + // Errors opening the log file are non-fatal — we fall back to /dev/null. + if let Ok(log_file) = std::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(&startup_log_path) + { + cmd.stderr(Stdio::from(log_file)); + } + let child = cmd.spawn().map_err(|e| { miette!("Failed to start detached BEAM node: {e}\nIs Erlang/OTP installed?") })?; @@ -409,7 +428,13 @@ pub fn start_detached_node( }; // Wait for WebSocket health endpoint to be fully ready before returning. - wait_for_tcp_ready(node_info.connect_host(), actual_port, pid, &cookie)?; + wait_for_tcp_ready( + node_info.connect_host(), + actual_port, + pid, + &cookie, + &startup_log_path, + )?; // Save node info save_node_info(workspace_id, &node_info)?; @@ -441,7 +466,14 @@ fn path_to_erlang_arg(path: &Path) -> String { /// Poll until the WebSocket health endpoint responds on the given port. /// /// Uses short per-attempt timeouts to keep the worst-case total bounded. -fn wait_for_tcp_ready(host: &str, port: u16, pid: u32, cookie: &str) -> Result<()> { +/// `log_path` is included in timeout errors to help diagnose failures. +fn wait_for_tcp_ready( + host: &str, + port: u16, + pid: u32, + cookie: &str, + log_path: &std::path::Path, +) -> Result<()> { for _ in 0..READINESS_PROBE_MAX_RETRIES { if let Ok(mut client) = ProtocolClient::connect( host, @@ -456,10 +488,35 @@ fn wait_for_tcp_ready(host: &str, port: u16, pid: u32, cookie: &str) -> Result<( } std::thread::sleep(Duration::from_millis(READINESS_PROBE_DELAY_MS)); } - Err(miette!( - "BEAM node started (PID {pid}) but WebSocket health endpoint on port {port} \ - did not become ready. The workspace may have failed to initialize." - )) + + // Check whether the BEAM node is still alive to give a more actionable error. + // If the process is gone it crashed during startup; if it's still running + // it is either stuck initializing or the WebSocket stack failed to come up. + let node_alive = { + use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System}; + let mut system = System::new(); + system.refresh_processes_specifics( + ProcessesToUpdate::Some(&[Pid::from_u32(pid)]), + true, + ProcessRefreshKind::nothing(), + ); + system.process(Pid::from_u32(pid)).is_some() + }; + + let log_hint = log_path.display(); + if node_alive { + Err(miette!( + "BEAM node started (PID {pid}) but WebSocket health endpoint on port {port} \ + did not become ready within the timeout. The workspace may be initializing \ + slowly — try again, or check {log_hint} for startup logs." + )) + } else { + Err(miette!( + "BEAM node (PID {pid}) crashed during startup before WebSocket endpoint \ + on port {port} became ready. Check {log_hint} for crash details and ensure \ + Erlang/OTP is installed correctly." + )) + } } /// Write an Erlang args file containing the cookie for secure distribution. From 8e37dbe5b2959cfd9e1232781a431c47ad6621f2 Mon Sep 17 00:00:00 2001 From: James Casey Date: Sat, 28 Feb 2026 17:38:54 +0000 Subject: [PATCH 2/2] fix: only mention startup.log in errors when log file was opened Per CodeRabbit review: if the startup log file could not be opened we previously unconditionally told users to check it. Now we track whether the open succeeded (startup_log_enabled) and pass an Option<&Path> to wait_for_tcp_ready. Error messages only include the log-file hint when Some(path) is present; a warning is printed when the file cannot be opened so the fallback is visible. Co-Authored-By: Claude Sonnet 4.6 --- .../src/commands/workspace/process.rs | 34 +++++++++++++------ 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/crates/beamtalk-cli/src/commands/workspace/process.rs b/crates/beamtalk-cli/src/commands/workspace/process.rs index 54bb6e03f..1055b3c47 100644 --- a/crates/beamtalk-cli/src/commands/workspace/process.rs +++ b/crates/beamtalk-cli/src/commands/workspace/process.rs @@ -330,14 +330,24 @@ pub fn start_detached_node( // Override the default /dev/null stderr with a workspace log file. // Errors opening the log file are non-fatal — we fall back to /dev/null. - if let Ok(log_file) = std::fs::OpenOptions::new() + let startup_log_enabled = match std::fs::OpenOptions::new() .create(true) .write(true) .truncate(true) .open(&startup_log_path) { - cmd.stderr(Stdio::from(log_file)); - } + Ok(log_file) => { + cmd.stderr(Stdio::from(log_file)); + true + } + Err(e) => { + eprintln!( + "Warning: could not open startup log {}: {e}", + startup_log_path.display() + ); + false + } + }; let child = cmd.spawn().map_err(|e| { miette!("Failed to start detached BEAM node: {e}\nIs Erlang/OTP installed?") @@ -433,7 +443,7 @@ pub fn start_detached_node( actual_port, pid, &cookie, - &startup_log_path, + startup_log_enabled.then_some(startup_log_path.as_path()), )?; // Save node info @@ -466,13 +476,14 @@ fn path_to_erlang_arg(path: &Path) -> String { /// Poll until the WebSocket health endpoint responds on the given port. /// /// Uses short per-attempt timeouts to keep the worst-case total bounded. -/// `log_path` is included in timeout errors to help diagnose failures. +/// `log_path` is `Some(path)` only when the startup log file was successfully +/// opened; it is included in timeout error messages to guide diagnosis. fn wait_for_tcp_ready( host: &str, port: u16, pid: u32, cookie: &str, - log_path: &std::path::Path, + log_path: Option<&std::path::Path>, ) -> Result<()> { for _ in 0..READINESS_PROBE_MAX_RETRIES { if let Ok(mut client) = ProtocolClient::connect( @@ -503,18 +514,21 @@ fn wait_for_tcp_ready( system.process(Pid::from_u32(pid)).is_some() }; - let log_hint = log_path.display(); + // Only mention the log file if we actually opened it (log_path is Some). + let log_suffix = log_path + .map(|p| format!(" Check {} for startup logs.", p.display())) + .unwrap_or_default(); + if node_alive { Err(miette!( "BEAM node started (PID {pid}) but WebSocket health endpoint on port {port} \ did not become ready within the timeout. The workspace may be initializing \ - slowly — try again, or check {log_hint} for startup logs." + slowly — try again.{log_suffix}" )) } else { Err(miette!( "BEAM node (PID {pid}) crashed during startup before WebSocket endpoint \ - on port {port} became ready. Check {log_hint} for crash details and ensure \ - Erlang/OTP is installed correctly." + on port {port} became ready. Ensure Erlang/OTP is installed correctly.{log_suffix}" )) } }