Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions crates/beamtalk-cli/src/commands/workspace/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1280,6 +1280,99 @@ mod tests {
);
}

/// Regression test for BT-969: a `starting` tombstone left by a mid-startup
/// crash is detected on the next `start_detached_node` call, all stale runtime
/// files are cleaned up, and the node starts successfully.
///
/// Scenario:
/// 1. A previous node started successfully (port/pid/node.info written).
/// 2. After that run, we force-kill the node without cleanup.
/// 3. We manually write a `starting` tombstone to simulate a partial startup
/// (i.e. as if a future startup crashed before completing).
/// 4. `start_detached_node` is called again — it must detect `starting`,
/// clean all stale runtime files, and start successfully.
#[test]
#[cfg(unix)]
#[ignore = "integration test — requires Erlang/OTP runtime"]
#[serial(workspace_integration)]
fn test_start_detached_node_cleans_up_tombstone_integration() {
let tw = TestWorkspace::new("integ_tombstone");
let project_path = std::env::current_dir().unwrap();
let _ = create_workspace(&project_path, Some(&tw.id)).unwrap();

let (runtime, workspace_dir_beam, jsx, compiler, stdlib, extra_paths) =
beam_dirs_for_tests();

// Step 1: Start a node to produce real runtime files, then kill it.
let first_info = start_detached_node(
&tw.id,
0,
&runtime,
&workspace_dir_beam,
&jsx,
&compiler,
&stdlib,
&extra_paths,
false,
Some(60),
None,
None,
None,
)
.expect("first start should succeed");
kill_node_raw(&first_info);
wait_for_epmd_deregistration(&first_info.node_name, 5)
.expect("epmd should deregister node name within timeout");

// Step 2: Write a `starting` tombstone to simulate a partial startup that
// was interrupted before the node finished initialising.
let ws_dir = workspace_dir(&tw.id).unwrap();
std::fs::write(ws_dir.join("starting"), b"")
.expect("should be able to write starting tombstone");

// Verify stale runtime files exist (port, pid written by first run).
assert!(
ws_dir.join("pid").exists(),
"stale pid file should exist after kill"
);

// Step 3: Start a new node. The tombstone must be detected, all stale
// files cleaned, and startup must succeed.
let (new_info, started, _) = get_or_start_workspace(
&project_path,
Some(&tw.id),
0,
&runtime,
&workspace_dir_beam,
&jsx,
&compiler,
&stdlib,
&extra_paths,
false,
Some(60),
None,
None,
None,
)
.expect("restart with tombstone present should succeed");
let _guard = NodeGuard { pid: new_info.pid };

assert!(started, "should have started a new node");
assert_ne!(
new_info.pid, first_info.pid,
"new node should have a different PID"
);
assert!(
is_node_running(&new_info),
"new node should be running and reachable"
);
// Tombstone must be gone after successful startup.
assert!(
!ws_dir.join("starting").exists(),
"starting tombstone should be removed after successful startup"
);
}

#[test]
#[cfg(unix)]
#[ignore = "integration test — requires Erlang/OTP runtime"]
Expand Down
27 changes: 23 additions & 4 deletions crates/beamtalk-cli/src/commands/workspace/process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ use crate::commands::protocol::ProtocolClient;
#[cfg(target_os = "linux")]
use super::storage::read_proc_start_time;
use super::storage::{
NodeInfo, get_workspace_metadata, read_port_file, read_workspace_cookie,
NodeInfo, get_workspace_metadata, read_port_file, read_workspace_cookie, remove_file_if_exists,
remove_stale_runtime_files, save_node_info, workspace_dir,
};

Expand Down Expand Up @@ -245,11 +245,24 @@ pub fn start_detached_node(
#[cfg(windows)]
let project_path_str = project_path_str.replace('\\', "\\\\");

// Remove stale runtime files (pid, port, node.info) from any previous run.
// Without this, the port/PID discovery loops may read stale values and connect
// to the wrong BEAM node, causing auth failures in wait_for_tcp_ready.
// If a `starting` tombstone is present, a previous startup was interrupted
// mid-flight (crash, OOM, SIGKILL, etc.). Clean up all runtime files —
// including the tombstone itself — before attempting a fresh start (BT-969).
// `remove_stale_runtime_files` also handles the BT-967 case (stale port/pid/
// node.info from a previous aborted run without a tombstone).
remove_stale_runtime_files(workspace_id)?;

// Write the tombstone before spawning the BEAM node. If startup is
// interrupted at any point after this, the next call will detect the file
// and trigger the cleanup above (BT-969).
let tombstone_path = workspace_dir(workspace_id)?.join("starting");
std::fs::write(&tombstone_path, b"").map_err(|e| {
miette!(
"Failed to write startup tombstone {}: {e}",
tombstone_path.display()
)
})?;

// Compute the PID file path so the BEAM node can write its own PID for reliable discovery.
// This avoids flaky process-list scanning via sysinfo (which can miss newly-forked processes
// on loaded CI runners). The workspace dir is guaranteed to exist before we start the node.
Expand Down Expand Up @@ -401,6 +414,12 @@ pub fn start_detached_node(
// Save node info
save_node_info(workspace_id, &node_info)?;

// Remove the tombstone now that startup completed successfully (BT-969).
// On any failure path the tombstone is deliberately left in place so the
// next `start_detached_node` call can detect and clean up the partial state.
let tombstone_path = workspace_dir(workspace_id)?.join("starting");
remove_file_if_exists(&tombstone_path)?;

Ok(node_info)
}

Expand Down
55 changes: 52 additions & 3 deletions crates/beamtalk-cli/src/commands/workspace/storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ pub(super) fn read_proc_start_time(pid: u32) -> Option<u64> {
///
/// Used by stale-file cleanup to avoid TOCTOU races (`exists()` + `remove_file()`
/// is racy if another process creates/removes the file between the two calls).
fn remove_file_if_exists(path: &std::path::Path) -> Result<()> {
pub(super) fn remove_file_if_exists(path: &std::path::Path) -> Result<()> {
if let Err(err) = fs::remove_file(path) {
if err.kind() != std::io::ErrorKind::NotFound {
return Err(miette!(
Expand All @@ -289,12 +289,14 @@ fn remove_file_if_exists(path: &std::path::Path) -> Result<()> {
Ok(())
}

/// Remove stale runtime files (`node.info`, `port`, `pid`) from a workspace directory.
/// Remove stale runtime files (`node.info`, `port`, `pid`, `starting`) from a workspace directory.
///
/// These files are written during workspace startup and must be cleaned before
/// restarting a node — a stale `port` file can cause `start_detached_node` to
/// connect to the wrong BEAM node, and a stale `pid` file can report the wrong
/// process.
/// process. The `starting` tombstone is written at the very beginning of
/// `start_detached_node` and removed on success; its presence here indicates a
/// partial startup that was interrupted (BT-969).
///
/// Called by:
/// - `cleanup_stale_node_info` — after detecting an orphaned workspace
Expand All @@ -304,6 +306,7 @@ pub fn remove_stale_runtime_files(workspace_id: &str) -> Result<()> {
remove_file_if_exists(&ws_dir.join("node.info"))?;
remove_file_if_exists(&ws_dir.join("port"))?;
remove_file_if_exists(&ws_dir.join("pid"))?;
remove_file_if_exists(&ws_dir.join("starting"))?;
Ok(())
}

Expand Down Expand Up @@ -334,3 +337,49 @@ pub(super) fn acquire_workspace_lock(workspace_id: &str) -> Result<fs::File> {
lockfile.lock_exclusive().into_diagnostic()?;
Ok(lockfile)
}

#[cfg(test)]
mod tests {
use super::*;

/// Verify that `remove_stale_runtime_files` removes all known runtime files
/// including the `starting` tombstone introduced in BT-969.
#[test]
fn test_remove_stale_runtime_files_removes_starting_tombstone() {
let ws_id = format!("test_stale_tombstone_{}", std::process::id());
let ws_dir = workspaces_base_dir().unwrap().join(&ws_id);
fs::create_dir_all(&ws_dir).unwrap();

// Write all runtime files including the tombstone.
for name in &["node.info", "port", "pid", "starting"] {
fs::write(ws_dir.join(name), b"dummy").unwrap();
}

remove_stale_runtime_files(&ws_id).expect("remove_stale_runtime_files should not fail");

for name in &["node.info", "port", "pid", "starting"] {
assert!(
!ws_dir.join(name).exists(),
"{name} should have been removed by remove_stale_runtime_files"
);
}

// Cleanup
let _ = fs::remove_dir_all(&ws_dir);
}

/// Verify that `remove_stale_runtime_files` is idempotent — calling it when
/// the files are already absent (e.g. after a clean shutdown) must not error.
#[test]
fn test_remove_stale_runtime_files_idempotent_on_missing_files() {
let ws_id = format!("test_stale_idempotent_{}", std::process::id());
let ws_dir = workspaces_base_dir().unwrap().join(&ws_id);
fs::create_dir_all(&ws_dir).unwrap();

// No files written — must succeed without error.
remove_stale_runtime_files(&ws_id)
.expect("remove_stale_runtime_files should be idempotent on missing files");

let _ = fs::remove_dir_all(&ws_dir);
}
}