tests/integration_tests/ft.py (2 additions, 0 deletions)

```diff
@@ -58,7 +58,9 @@ def run_single_test(test_flavor: OverrideDefinitions, full_path: str, output_dir
     cmd = (
         f'TORCH_TRACE="{output_dir}/{test_name}/compile_trace" '
         f"CUDA_VISIBLE_DEVICES={ranks} "
+        "TRAIN_FILE=torchtitan.experiments.ft.train "
         f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} ./run_train.sh "
+        "--model.name=llama3_ft "
         "--fault_tolerance.enable "
         f"--fault_tolerance.replica_id={replica_id} --fault_tolerance.group_size={test_flavor.ngpu}"
     )
```
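The two added lines point the integration test at the experimental fault-tolerance trainer (`TRAIN_FILE=torchtitan.experiments.ft.train`) and select the FT model variant (`--model.name=llama3_ft`). As a sketch of what the f-string concatenation produces, with purely illustrative placeholder values (the output directory, test name, GPU ranks, config path, replica id, and group size below are assumptions, not taken from the PR):

```bash
# Hypothetical rendering of the assembled cmd string. All concrete values
# (paths, test name, GPU ranks, replica id) are illustrative placeholders.
TORCH_TRACE="/tmp/ft_out/ft_test/compile_trace" \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
TRAIN_FILE=torchtitan.experiments.ft.train \
CONFIG_FILE=./torchtitan/models/llama3/train_configs/debug_model.toml \
NGPU=4 ./run_train.sh \
  --model.name=llama3_ft \
  --fault_tolerance.enable \
  --fault_tolerance.replica_id=0 --fault_tolerance.group_size=4
```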
docs/torchft.md → torchtitan/experiments/ft/torchft.md (4 additions, 4 deletions)

````diff
@@ -36,12 +36,12 @@ RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join
 2. Launch the first TorchTitan instance:
 
 ```bash
-NGPU=4 CUDA_VISIBLE_DEVICES=0,1,2,3 CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ./run_train.sh --fault_tolerance.enable --fault_tolerance.replica_id=0 --fault_tolerance.group_size=2 --parallelism.data_parallel_shard_degree=4
+NGPU=4 CUDA_VISIBLE_DEVICES=0,1,2,3 TRAIN_FILE=torchtitan.experiments.ft.train CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ./run_train.sh --fault_tolerance.enable --fault_tolerance.replica_id=0 --fault_tolerance.group_size=2 --parallelism.data_parallel_shard_degree=4
 ```
 3. Launch the second TorchTitan instance:
 
 ```bash
-NGPU=4 CUDA_VISIBLE_DEVICES=4,5,6,7 CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ./run_train.sh --fault_tolerance.enable --fault_tolerance.replica_id=1 --fault_tolerance.group_size=2 --parallelism.data_parallel_shard_degree=4
+NGPU=4 CUDA_VISIBLE_DEVICES=4,5,6,7 TRAIN_FILE=torchtitan.experiments.ft.train CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ./run_train.sh --fault_tolerance.enable --fault_tolerance.replica_id=1 --fault_tolerance.group_size=2 --parallelism.data_parallel_shard_degree=4
 ```
 
 ### Explanation
@@ -68,12 +68,12 @@ The `--training.global_batch_size` parameter refers to global batch size that wi
 
 #### Replica Group 0
 ```bash
-CONFIG_FILE=./torchtitan/models/llama3_ft/train_configs/debug_model.toml CUDA_VISIBLE_DEVICES=0,1,2,3 NGPU=4 ./run_train.sh --parallelism.data_parallel_shard_degree=4 --fault_tolerance.enable --fault_tolerance.group_size=2 --fault_tolerance.replica_id=0
+TRAIN_FILE=torchtitan.experiments.ft.train CONFIG_FILE=./torchtitan/models/llama3_ft/train_configs/debug_model.toml CUDA_VISIBLE_DEVICES=0,1,2,3 NGPU=4 ./run_train.sh --parallelism.data_parallel_shard_degree=4 --fault_tolerance.enable --fault_tolerance.group_size=2 --fault_tolerance.replica_id=0
 ```
 
 #### Replica Group 1
 ```bash
-CONFIG_FILE=./torchtitan/models/llama3_ft/train_configs/debug_model.toml CUDA_VISIBLE_DEVICES=4,5,6,7 NGPU=4 ./run_train.sh --parallelism.data_parallel_shard_degree=4 --fault_tolerance.enable --fault_tolerance.group_size=2 --fault_tolerance.replica_id=1
+TRAIN_FILE=torchtitan.experiments.ft.train CONFIG_FILE=./torchtitan/models/llama3_ft/train_configs/debug_model.toml CUDA_VISIBLE_DEVICES=4,5,6,7 NGPU=4 ./run_train.sh --parallelism.data_parallel_shard_degree=4 --fault_tolerance.enable --fault_tolerance.group_size=2 --fault_tolerance.replica_id=1
 ```
 
 ## Fault Tolerance Configuration Options
````
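Every changed doc command differs from its predecessor only by the prepended `TRAIN_FILE=torchtitan.experiments.ft.train`, so each replica group now runs the experimental FT training entry point. As a convenience sketch (not part of the PR), the two debug-model replica groups above could be launched together from a single 8-GPU host, assuming the torchft lighthouse is already running:

```bash
# Sketch only: launch both replica groups shown in the diff in parallel.
# Assumes an 8-GPU host and a lighthouse already serving quorum.
CONFIG=./torchtitan/models/llama3_ft/train_configs/debug_model.toml
GPUS=("0,1,2,3" "4,5,6,7")
for REPLICA in 0 1; do
  TRAIN_FILE=torchtitan.experiments.ft.train \
  CONFIG_FILE=$CONFIG \
  CUDA_VISIBLE_DEVICES=${GPUS[$REPLICA]} NGPU=4 ./run_train.sh \
    --parallelism.data_parallel_shard_degree=4 \
    --fault_tolerance.enable \
    --fault_tolerance.group_size=2 \
    --fault_tolerance.replica_id=$REPLICA &
done
wait
```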