8 changes: 8 additions & 0 deletions tests/conftest.py
@@ -31,6 +31,7 @@ def pytest_configure(config):
def pytest_addoption(parser):
parser.addoption("--torch_ver", default=None, type=str)
parser.addoption("--cuda_ver", default=None, type=str)
parser.addoption("--enable-compile-mode", action="store_true", help="Run both compiled/non-compiled versions")
@stas00 (Collaborator), Nov 3, 2025:

Perhaps this would be less ambiguous about what it implies?

Suggested change
parser.addoption("--enable-compile-mode", action="store_true", help="Run both compiled/non-compiled versions")
parser.addoption("--enable-torch-compile", action="store_true", help="Run both compiled/non-compiled versions")

PR author (Contributor):

@stas00, regarding your questions:

  1. There are some basic torch.compile tests that are always run (e.g. deepspeed-fork/tests/unit/runtime/compile/util.py). This PR adds torch.compile coverage for many more scenarios using the same logic as the original tests.
  2. I'm sure there are ways to do this other than passing a parameter, but this is the standard way to parametrize pytest tests, and the pytest logs then clearly show which config is running. Choosing another approach may have some advantages, but could make it less clear which scenario is actually running (e.g. in your example this wouldn't explicitly be printed in the test name). Example:
    7.84s call unit/checkpoint/test_latest_checkpoint.py::TestLatestCheckpoint::test_existing_latest[False]

I'm not opposed to a different approach, but I'm not convinced it offers an actual advantage. I'll wait for feedback from @tohtana and @tjruwase.

Regarding the flag, I used the name suggested by @tohtana, and it makes some semantic sense: the flag enables compile mode in the tests; it doesn't enable torch.compile in general (which might make the other name more confusing). I don't mind changing it to whatever there is consensus on.
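
For reference, a minimal, self-contained sketch of the mechanism under discussion, mirroring the conftest.py hook added in this PR; the test_roundtrip function and the invocation path are illustrative, not part of the PR:

# conftest.py (sketch, mirroring this PR's hook)
def pytest_addoption(parser):
    parser.addoption("--enable-compile-mode", action="store_true",
                     help="Run both compiled/non-compiled versions")

def pytest_generate_tests(metafunc):
    # Any test that declares a `compile_mode` argument is parametrized here.
    if "compile_mode" in metafunc.fixturenames:
        enabled = metafunc.config.getoption("--enable-compile-mode")
        # Without the flag, only the non-compiled variant runs.
        metafunc.parametrize("compile_mode", [False, True] if enabled else [False])

# test_example.py (hypothetical consumer)
def test_roundtrip(compile_mode):
    # The parameter value shows up in the test ID: test_roundtrip[False],
    # and with the flag also test_roundtrip[True].
    assert compile_mode in (False, True)

# Invocation (assumed layout): pytest tests/unit --enable-compile-mode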

@stas00 (Collaborator):

Re: the flag: whatever the team feels is more intuitive works for me.

The rest I took out of this thread, as it doesn't belong in this particular discussion.



def validate_version(expected, found):
@@ -70,6 +71,13 @@ def pytest_runtest_call(item):
item.runtest = lambda: True # Dummy function so test is not run twice


def pytest_generate_tests(metafunc):
if "compile_mode" in metafunc.fixturenames:
compile_testing_enabled = metafunc.config.getoption("--enable-compile-mode")
params = [False, True] if compile_testing_enabled else [False]
metafunc.parametrize("compile_mode", params)


# We allow DistributedTest to reuse distributed environments. When the last
# test for a class is run, we want to make sure those distributed environments
# are destroyed.
5 changes: 4 additions & 1 deletion tests/unit/checkpoint/common.py
@@ -174,11 +174,14 @@ def checkpoint_correctness_verification(config_dict,
empty_tag=False,
seq_dataloader=False,
load_module_only=False,
dtype=None):
dtype=None,
compile_mode=False):
if dtype is None:
dtype = preferred_dtype()

ds_model = create_deepspeed_model(config_dict=config_dict, model=models[0], base_optimizer=base_optimizers[0])
if compile_mode:
ds_model.compile()

if seq_dataloader:
data_loader = sequence_dataloader(model=ds_model,
9 changes: 6 additions & 3 deletions tests/unit/checkpoint/test_latest_checkpoint.py
@@ -19,7 +19,7 @@
class TestLatestCheckpoint(DistributedTest):
world_size = 1

def test_existing_latest(self, tmpdir):
def test_existing_latest(self, tmpdir, compile_mode):
config_dict = {
"train_batch_size": 2,
"steps_per_print": 1,
@@ -39,9 +39,10 @@ def test_existing_latest(self, tmpdir):
load_optimizer_states=True,
load_lr_scheduler_states=False,
empty_tag=True,
dtype=torch.float)
dtype=torch.float,
compile_mode=compile_mode)

def test_missing_latest(self, tmpdir):
def test_missing_latest(self, tmpdir, compile_mode):
config_dict = {
"train_batch_size": 2,
"steps_per_print": 1,
@@ -55,5 +56,7 @@ def test_missing_latest(self, tmpdir):
hidden_dim = 10
model = SimpleModel(hidden_dim)
model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())
if compile_mode:
model.compile()
# should be no-op, since latest doesn't exist
model.load_checkpoint(tmpdir)
10 changes: 6 additions & 4 deletions tests/unit/checkpoint/test_lr_scheduler.py
@@ -20,7 +20,7 @@
class TestLRSchedulerCheckpoint(DistributedTest):
world_size = 2

def test_checkpoint_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload):
def test_checkpoint_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload, compile_mode):
if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
pytest.skip("cpu-adam is not compatible")
if get_accelerator().device_name() == 'cpu':
@@ -70,9 +70,10 @@ def test_checkpoint_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload):
hidden_dim,
tmpdir,
load_optimizer_states=False,
load_lr_scheduler_states=True)
load_lr_scheduler_states=True,
compile_mode=compile_mode)

def test_checkpoint_no_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload):
def test_checkpoint_no_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload, compile_mode):
if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
pytest.skip("cpu-adam is not compatible")
if get_accelerator().device_name() == 'cpu':
@@ -117,4 +118,5 @@ def test_checkpoint_no_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload):
hidden_dim,
tmpdir,
load_optimizer_states=False,
load_lr_scheduler_states=False)
load_lr_scheduler_states=False,
compile_mode=compile_mode)
5 changes: 3 additions & 2 deletions tests/unit/checkpoint/test_moe_checkpoint.py
@@ -39,7 +39,7 @@ def test_checkpoint_moe(self, tmpdir, ep_size):
dtype=torch.float16)

@pytest.mark.parametrize("ep_size, load_optim_states", [(4, True), (4, False), (2, True), (2, False)])
def test_checkpoint_moe_and_zero(self, tmpdir, ep_size, load_optim_states):
def test_checkpoint_moe_and_zero(self, tmpdir, ep_size, load_optim_states, compile_mode):
if not required_torch_version(min_version=1.8):
pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly")

@@ -80,4 +80,5 @@ def test_checkpoint_moe_and_zero(self, tmpdir, ep_size, load_optim_states):
empty_tag=True,
base_optimizers=optimizers,
seq_dataloader=True,
dtype=torch.float16)
dtype=torch.float16,
compile_mode=compile_mode)
21 changes: 13 additions & 8 deletions tests/unit/checkpoint/test_other_optimizer.py
@@ -18,7 +18,7 @@ class TestOtherOptimizerCheckpoint(DistributedTest):
world_size = 2

@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible")
def test_checkpoint_unfused_optimizer(self, tmpdir):
def test_checkpoint_unfused_optimizer(self, tmpdir, compile_mode):
#if not get_accelerator().is_fp16_supported():
# pytest.skip("fp16 is not supported")
config_dict = {
@@ -67,17 +67,19 @@ def test_checkpoint_unfused_optimizer(self, tmpdir):
hidden_dim=hidden_dim,
tmpdir=tmpdir,
load_optimizer_states=True,
dtype=dtype)
dtype=dtype,
compile_mode=compile_mode)

# Ignore optimizer states
checkpoint_correctness_verification(config_dict,
models=models,
hidden_dim=hidden_dim,
tmpdir=tmpdir,
load_optimizer_states=False,
dtype=dtype)
dtype=dtype,
compile_mode=compile_mode)

def test_checkpoint_fused_optimizer(self, tmpdir):
def test_checkpoint_fused_optimizer(self, tmpdir, compile_mode):
if get_accelerator().device_name() == "cpu":
pytest.skip("CPU accelerator does not support this test")
config_dict = {
@@ -108,17 +110,19 @@ def test_checkpoint_fused_optimizer(self, tmpdir):
hidden_dim=hidden_dim,
tmpdir=tmpdir,
load_optimizer_states=True,
dtype=dtype)
dtype=dtype,
compile_mode=compile_mode)

# Ignore optimizer states
checkpoint_correctness_verification(config_dict,
models=models,
hidden_dim=hidden_dim,
tmpdir=tmpdir,
load_optimizer_states=False,
dtype=dtype)
dtype=dtype,
compile_mode=compile_mode)

def test_checkpoint_fp32_optimizer(self, tmpdir):
def test_checkpoint_fp32_optimizer(self, tmpdir, compile_mode):
config_dict = {
"train_batch_size": 2,
"steps_per_print": 1,
@@ -143,4 +147,5 @@ def test_checkpoint_fp32_optimizer(self, tmpdir):
models=models,
hidden_dim=hidden_dim,
tmpdir=tmpdir,
dtype=torch.float32)
dtype=torch.float32,
compile_mode=compile_mode)
5 changes: 3 additions & 2 deletions tests/unit/checkpoint/test_pipeline.py
@@ -16,7 +16,7 @@ class TestPipelineCheckpoint(DistributedTest):
world_size = 4

@pytest.mark.parametrize("zero_stage", [0, 1])
def test_checkpoint_pipe_engine(self, zero_stage, tmpdir):
def test_checkpoint_pipe_engine(self, zero_stage, tmpdir, compile_mode):
skip_on_arch(min_arch=7)

config_dict = {
Expand Down Expand Up @@ -61,7 +61,8 @@ def test_checkpoint_pipe_engine(self, zero_stage, tmpdir):
load_optimizer_states=True,
load_lr_scheduler_states=True,
train_batch=True,
dtype=torch.float16 if zero_stage > 0 else torch.float32)
dtype=torch.float16 if zero_stage > 0 else torch.float32,
compile_mode=compile_mode)

@pytest.mark.parametrize(
"base_topo,test_topo",
5 changes: 4 additions & 1 deletion tests/unit/checkpoint/test_shared_weights.py
@@ -25,7 +25,7 @@ def __init__(self):
class TestCheckpointSharedWeights(DistributedTest):
world_size = 2

def test_checkpoint_shared_weights(self, tmp_path):
def test_checkpoint_shared_weights(self, tmp_path, compile_mode):
config = {
"train_micro_batch_size_per_gpu": 2,
"zero_allow_untested_optimizer": True,
@@ -41,6 +41,9 @@ def test_checkpoint_shared_weights(self, tmp_path):
model=model,
optimizer=optimizer,
)
if compile_mode:
deepspeed_engine.compile()

filename = tmp_path / "checkpoint.pt"
deepspeed_engine.save_checkpoint(filename, tag="checkpoint")

6 changes: 5 additions & 1 deletion tests/unit/checkpoint/test_sparse.py
@@ -25,7 +25,7 @@ class TestSparseCheckpoint(DistributedTest):
[True, True],
])
def test_non_strict_load_sparse(self, tmpdir, to_save_model_has_embedding, to_save_model_sparse,
destination_has_embedding, destination_sparse):
destination_has_embedding, destination_sparse, compile_mode):

class ModelNoEmbedding(torch.nn.Module):

@@ -66,6 +66,10 @@ def forward(self, x, offsets):
"sparse_gradients": destination_sparse
})

if compile_mode:
engine_to_save.compile()
engine_destination.compile()

save_folder = os.path.join(tmpdir, 'saved_checkpoint')
save_tag = '1'

8 changes: 6 additions & 2 deletions tests/unit/checkpoint/test_tag_validation.py
@@ -15,7 +15,7 @@ class TestCheckpointValidationTag(DistributedTest):
world_size = 2

@pytest.mark.parametrize('valid_mode', ["FAIL", "WARN", "IGNORE"])
def test_checkpoint_unique_tag(self, tmpdir, valid_mode):
def test_checkpoint_unique_tag(self, tmpdir, valid_mode, compile_mode):
config_dict = {
"train_batch_size": 2,
"steps_per_print": 1,
@@ -33,13 +33,15 @@ def test_checkpoint_unique_tag(self, tmpdir, valid_mode):
model = SimpleModel(hidden_dim)

model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())
if compile_mode:
model.compile()
if valid_mode == "FAIL":
with pytest.raises(AssertionError):
model.save_checkpoint(save_dir=tmpdir, tag=f"tag-{dist.get_rank()}")
else:
model.save_checkpoint(save_dir=tmpdir, tag=f"tag-{dist.get_rank()}")

def test_checkpoint_unknown_tag_validation(self, tmpdir):
def test_checkpoint_unknown_tag_validation(self, tmpdir, compile_mode):

config_dict = {
"train_batch_size": 2,
@@ -60,3 +62,5 @@ def test_checkpoint_unknown_tag_validation(self, tmpdir):

with pytest.raises(deepspeed.DeepSpeedConfigError):
model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())
if compile_mode:
model.compile()
20 changes: 13 additions & 7 deletions tests/unit/checkpoint/test_universal_checkpoint.py
@@ -213,7 +213,7 @@ def update_gathered_stage3_optimizer(optimizer_state, param_shapes, world_size):
@pytest.mark.parametrize("sub_group_size", [-1, 100])
class TestZeROUniversalCheckpointDP(DistributedTest):

def _run_test(self, tmpdir, dtype, ds_config, load_optim, use_torch_adam, world_size):
def _run_test(self, tmpdir, dtype, ds_config, load_optim, use_torch_adam, world_size, compile_mode):
if dtype == torch.bfloat16 and not bf16_required_version_check():
pytest.skip(
" DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly"
@@ -225,6 +225,9 @@ def _run_test(self, tmpdir, dtype, ds_config, load_optim, use_torch_adam, world_
ds_config["checkpoint"] = {"load_universal": True}
univ_model = SimpleModel(hidden_dim, nlayers=2)
univ_model = init_ds_engine(univ_model, ds_config, use_torch_adam)
if compile_mode:
univ_model.compile()

univ_model.load_checkpoint(tmpdir, tag=f"{CP_TAG}_universal", load_optimizer_states=load_optim)

model_state = univ_model.state_dict()
@@ -260,13 +263,16 @@ def _run_test(self, tmpdir, dtype, ds_config, load_optim, use_torch_adam, world_
univ_model.destroy()

@pytest.mark.world_size(2)
def test_dp_world_size_2to2(self, baseline_ws2, tmpdir, dtype, ds_config, load_optim, use_torch_adam):
self._run_test(tmpdir, dtype, ds_config, load_optim, use_torch_adam, 2)
def test_dp_world_size_2to2(self, baseline_ws2, tmpdir, dtype, ds_config, load_optim, use_torch_adam,
compile_mode):
self._run_test(tmpdir, dtype, ds_config, load_optim, use_torch_adam, 2, compile_mode)

@pytest.mark.world_size(2)
def test_dp_world_size_4to2(self, baseline_ws4, tmpdir, dtype, ds_config, load_optim, use_torch_adam):
self._run_test(tmpdir, dtype, ds_config, load_optim, use_torch_adam, 2)
def test_dp_world_size_4to2(self, baseline_ws4, tmpdir, dtype, ds_config, load_optim, use_torch_adam,
compile_mode):
self._run_test(tmpdir, dtype, ds_config, load_optim, use_torch_adam, 2, compile_mode)

@pytest.mark.world_size(4)
def test_dp_world_size_2to4(self, baseline_ws2, tmpdir, dtype, ds_config, load_optim, use_torch_adam):
self._run_test(tmpdir, dtype, ds_config, load_optim, use_torch_adam, 4)
def test_dp_world_size_2to4(self, baseline_ws2, tmpdir, dtype, ds_config, load_optim, use_torch_adam,
compile_mode):
self._run_test(tmpdir, dtype, ds_config, load_optim, use_torch_adam, 4, compile_mode)