From c6b312c6e5d10091d7fd85e0ae406041e56de4c2 Mon Sep 17 00:00:00 2001 From: James Casey Date: Sat, 28 Feb 2026 14:00:06 +0000 Subject: [PATCH] feat: workspace startup tombstone to detect and clean partial starts BT-969 - Write `starting` tombstone before spawning BEAM node in start_detached_node - Remove tombstone on successful startup completion - Detect tombstone on entry and clean all stale runtime files via remove_stale_runtime_files - Add `starting` to remove_stale_runtime_files (alongside node.info, port, pid) - Add integration test: simulate mid-startup crash, verify next start succeeds - Add unit tests for remove_stale_runtime_files covering tombstone and idempotency Co-Authored-By: Claude Sonnet 4.6 --- .../src/commands/workspace/mod.rs | 93 +++++++++++++++++++ .../src/commands/workspace/process.rs | 27 +++++- .../src/commands/workspace/storage.rs | 55 ++++++++++- 3 files changed, 168 insertions(+), 7 deletions(-) diff --git a/crates/beamtalk-cli/src/commands/workspace/mod.rs b/crates/beamtalk-cli/src/commands/workspace/mod.rs index 2ef54eb93..07654c7d8 100644 --- a/crates/beamtalk-cli/src/commands/workspace/mod.rs +++ b/crates/beamtalk-cli/src/commands/workspace/mod.rs @@ -1280,6 +1280,99 @@ mod tests { ); } + /// Regression test for BT-969: a `starting` tombstone left by a mid-startup + /// crash is detected on the next `start_detached_node` call, all stale runtime + /// files are cleaned up, and the node starts successfully. + /// + /// Scenario: + /// 1. A previous node started successfully (port/pid/node.info written). + /// 2. After that run, we force-kill the node without cleanup. + /// 3. We manually write a `starting` tombstone to simulate a partial startup + /// (i.e. as if a future startup crashed before completing). + /// 4. `start_detached_node` is called again — it must detect `starting`, + /// clean all stale runtime files, and start successfully. + #[test] + #[cfg(unix)] + #[ignore = "integration test — requires Erlang/OTP runtime"] + #[serial(workspace_integration)] + fn test_start_detached_node_cleans_up_tombstone_integration() { + let tw = TestWorkspace::new("integ_tombstone"); + let project_path = std::env::current_dir().unwrap(); + let _ = create_workspace(&project_path, Some(&tw.id)).unwrap(); + + let (runtime, workspace_dir_beam, jsx, compiler, stdlib, extra_paths) = + beam_dirs_for_tests(); + + // Step 1: Start a node to produce real runtime files, then kill it. + let first_info = start_detached_node( + &tw.id, + 0, + &runtime, + &workspace_dir_beam, + &jsx, + &compiler, + &stdlib, + &extra_paths, + false, + Some(60), + None, + None, + None, + ) + .expect("first start should succeed"); + kill_node_raw(&first_info); + wait_for_epmd_deregistration(&first_info.node_name, 5) + .expect("epmd should deregister node name within timeout"); + + // Step 2: Write a `starting` tombstone to simulate a partial startup that + // was interrupted before the node finished initialising. + let ws_dir = workspace_dir(&tw.id).unwrap(); + std::fs::write(ws_dir.join("starting"), b"") + .expect("should be able to write starting tombstone"); + + // Verify stale runtime files exist (port, pid written by first run). + assert!( + ws_dir.join("pid").exists(), + "stale pid file should exist after kill" + ); + + // Step 3: Start a new node. The tombstone must be detected, all stale + // files cleaned, and startup must succeed. + let (new_info, started, _) = get_or_start_workspace( + &project_path, + Some(&tw.id), + 0, + &runtime, + &workspace_dir_beam, + &jsx, + &compiler, + &stdlib, + &extra_paths, + false, + Some(60), + None, + None, + None, + ) + .expect("restart with tombstone present should succeed"); + let _guard = NodeGuard { pid: new_info.pid }; + + assert!(started, "should have started a new node"); + assert_ne!( + new_info.pid, first_info.pid, + "new node should have a different PID" + ); + assert!( + is_node_running(&new_info), + "new node should be running and reachable" + ); + // Tombstone must be gone after successful startup. + assert!( + !ws_dir.join("starting").exists(), + "starting tombstone should be removed after successful startup" + ); + } + #[test] #[cfg(unix)] #[ignore = "integration test — requires Erlang/OTP runtime"] diff --git a/crates/beamtalk-cli/src/commands/workspace/process.rs b/crates/beamtalk-cli/src/commands/workspace/process.rs index 6454b8848..3ae4330c6 100644 --- a/crates/beamtalk-cli/src/commands/workspace/process.rs +++ b/crates/beamtalk-cli/src/commands/workspace/process.rs @@ -28,7 +28,7 @@ use crate::commands::protocol::ProtocolClient; #[cfg(target_os = "linux")] use super::storage::read_proc_start_time; use super::storage::{ - NodeInfo, get_workspace_metadata, read_port_file, read_workspace_cookie, + NodeInfo, get_workspace_metadata, read_port_file, read_workspace_cookie, remove_file_if_exists, remove_stale_runtime_files, save_node_info, workspace_dir, }; @@ -245,11 +245,24 @@ pub fn start_detached_node( #[cfg(windows)] let project_path_str = project_path_str.replace('\\', "\\\\"); - // Remove stale runtime files (pid, port, node.info) from any previous run. - // Without this, the port/PID discovery loops may read stale values and connect - // to the wrong BEAM node, causing auth failures in wait_for_tcp_ready. + // If a `starting` tombstone is present, a previous startup was interrupted + // mid-flight (crash, OOM, SIGKILL, etc.). Clean up all runtime files — + // including the tombstone itself — before attempting a fresh start (BT-969). + // `remove_stale_runtime_files` also handles the BT-967 case (stale port/pid/ + // node.info from a previous aborted run without a tombstone). remove_stale_runtime_files(workspace_id)?; + // Write the tombstone before spawning the BEAM node. If startup is + // interrupted at any point after this, the next call will detect the file + // and trigger the cleanup above (BT-969). + let tombstone_path = workspace_dir(workspace_id)?.join("starting"); + std::fs::write(&tombstone_path, b"").map_err(|e| { + miette!( + "Failed to write startup tombstone {}: {e}", + tombstone_path.display() + ) + })?; + // Compute the PID file path so the BEAM node can write its own PID for reliable discovery. // This avoids flaky process-list scanning via sysinfo (which can miss newly-forked processes // on loaded CI runners). The workspace dir is guaranteed to exist before we start the node. @@ -401,6 +414,12 @@ pub fn start_detached_node( // Save node info save_node_info(workspace_id, &node_info)?; + // Remove the tombstone now that startup completed successfully (BT-969). + // On any failure path the tombstone is deliberately left in place so the + // next `start_detached_node` call can detect and clean up the partial state. + let tombstone_path = workspace_dir(workspace_id)?.join("starting"); + remove_file_if_exists(&tombstone_path)?; + Ok(node_info) } diff --git a/crates/beamtalk-cli/src/commands/workspace/storage.rs b/crates/beamtalk-cli/src/commands/workspace/storage.rs index 9a338055d..e79508a3c 100644 --- a/crates/beamtalk-cli/src/commands/workspace/storage.rs +++ b/crates/beamtalk-cli/src/commands/workspace/storage.rs @@ -277,7 +277,7 @@ pub(super) fn read_proc_start_time(pid: u32) -> Option { /// /// Used by stale-file cleanup to avoid TOCTOU races (`exists()` + `remove_file()` /// is racy if another process creates/removes the file between the two calls). -fn remove_file_if_exists(path: &std::path::Path) -> Result<()> { +pub(super) fn remove_file_if_exists(path: &std::path::Path) -> Result<()> { if let Err(err) = fs::remove_file(path) { if err.kind() != std::io::ErrorKind::NotFound { return Err(miette!( @@ -289,12 +289,14 @@ fn remove_file_if_exists(path: &std::path::Path) -> Result<()> { Ok(()) } -/// Remove stale runtime files (`node.info`, `port`, `pid`) from a workspace directory. +/// Remove stale runtime files (`node.info`, `port`, `pid`, `starting`) from a workspace directory. /// /// These files are written during workspace startup and must be cleaned before /// restarting a node — a stale `port` file can cause `start_detached_node` to /// connect to the wrong BEAM node, and a stale `pid` file can report the wrong -/// process. +/// process. The `starting` tombstone is written at the very beginning of +/// `start_detached_node` and removed on success; its presence here indicates a +/// partial startup that was interrupted (BT-969). /// /// Called by: /// - `cleanup_stale_node_info` — after detecting an orphaned workspace @@ -304,6 +306,7 @@ pub fn remove_stale_runtime_files(workspace_id: &str) -> Result<()> { remove_file_if_exists(&ws_dir.join("node.info"))?; remove_file_if_exists(&ws_dir.join("port"))?; remove_file_if_exists(&ws_dir.join("pid"))?; + remove_file_if_exists(&ws_dir.join("starting"))?; Ok(()) } @@ -334,3 +337,49 @@ pub(super) fn acquire_workspace_lock(workspace_id: &str) -> Result { lockfile.lock_exclusive().into_diagnostic()?; Ok(lockfile) } + +#[cfg(test)] +mod tests { + use super::*; + + /// Verify that `remove_stale_runtime_files` removes all known runtime files + /// including the `starting` tombstone introduced in BT-969. + #[test] + fn test_remove_stale_runtime_files_removes_starting_tombstone() { + let ws_id = format!("test_stale_tombstone_{}", std::process::id()); + let ws_dir = workspaces_base_dir().unwrap().join(&ws_id); + fs::create_dir_all(&ws_dir).unwrap(); + + // Write all runtime files including the tombstone. + for name in &["node.info", "port", "pid", "starting"] { + fs::write(ws_dir.join(name), b"dummy").unwrap(); + } + + remove_stale_runtime_files(&ws_id).expect("remove_stale_runtime_files should not fail"); + + for name in &["node.info", "port", "pid", "starting"] { + assert!( + !ws_dir.join(name).exists(), + "{name} should have been removed by remove_stale_runtime_files" + ); + } + + // Cleanup + let _ = fs::remove_dir_all(&ws_dir); + } + + /// Verify that `remove_stale_runtime_files` is idempotent — calling it when + /// the files are already absent (e.g. after a clean shutdown) must not error. + #[test] + fn test_remove_stale_runtime_files_idempotent_on_missing_files() { + let ws_id = format!("test_stale_idempotent_{}", std::process::id()); + let ws_dir = workspaces_base_dir().unwrap().join(&ws_id); + fs::create_dir_all(&ws_dir).unwrap(); + + // No files written — must succeed without error. + remove_stale_runtime_files(&ws_id) + .expect("remove_stale_runtime_files should be idempotent on missing files"); + + let _ = fs::remove_dir_all(&ws_dir); + } +}