 import torch.distributed as dist
 import torch.distributed.checkpoint as dcp
 import torch.nn as nn
-from torch.distributed.checkpoint.state_dict import (StateDictOptions,
-                                                     get_model_state_dict,
-                                                     get_optimizer_state_dict,
-                                                     set_model_state_dict,
-                                                     set_optimizer_state_dict)
+from torch.distributed.checkpoint.state_dict import (
+    StateDictOptions,
+    get_model_state_dict,
+    get_optimizer_state_dict,
+    set_model_state_dict,
+    set_optimizer_state_dict,
+)
 from torch.distributed.checkpoint.stateful import Stateful
 from torch.utils.data import DataLoader
@@ -50,9 +52,7 @@ def __init__(self, model: Union[nn.Module, List[nn.Module]]) -> None:
         self.model = [model] if isinstance(model, nn.Module) else model

     def state_dict(self) -> None:
-        return {
-            k: v for sd in map(get_model_state_dict, self.model) for k, v in sd.items()
-        }
+        return {k: v for sd in map(get_model_state_dict, self.model) for k, v in sd.items()}

     def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
         func = functools.partial(
@@ -70,8 +70,7 @@ def __init__(
         optim: Union[torch.optim.Optimizer, List[torch.optim.Optimizer]],
     ) -> None:
         self.model = [model] if isinstance(model, nn.Module) else model
-        self.optim = [optim] if isinstance(
-            optim, torch.optim.Optimizer) else optim
+        self.optim = [optim] if isinstance(optim, torch.optim.Optimizer) else optim

     def state_dict(self) -> None:
         func = functools.partial(
@@ -109,8 +108,7 @@ def checkpoint_mp(recv, send):
             state, checkpoint_id = obj
             dcp.save(state, checkpoint_id=checkpoint_id)
             logger.info(
-                "Finish saving the checkpoint in the background process in "
-                f"{time.monotonic() - begin:.2f} seconds."
+                "Finish saving the checkpoint in the background process in " f"{time.monotonic() - begin:.2f} seconds."
             )
     finally:
         logger.info("Destroying the process group.")
@@ -158,19 +156,11 @@ def __init__(

         TODO: This is currently unsolved and needs a fix.
         """
-        assert len(model_parts) == len(
-            optimizers
-        ), "Must pass one optimizer per model part"
-        assert len(model_parts) == len(
-            lr_schedulers
-        ), "Must pass one lr_scheduler per model part"
-
-        assert len(model_parts) == len(
-            optimizers
-        ), "Must pass one optimizer per model part"
-        assert len(model_parts) == len(
-            lr_schedulers
-        ), "Must pass one lr_scheduler per model part"
+        assert len(model_parts) == len(optimizers), "Must pass one optimizer per model part"
+        assert len(model_parts) == len(lr_schedulers), "Must pass one lr_scheduler per model part"
+
+        assert len(model_parts) == len(optimizers), "Must pass one optimizer per model part"
+        assert len(model_parts) == len(lr_schedulers), "Must pass one lr_scheduler per model part"

         self.states = states

@@ -190,11 +180,7 @@ def __init__(
             self.states[f"lr_scheduler_{idx}"] = lr_scheduler

         self.folder = os.path.join(ckpt_config.ckpt_dir)
-        self.interval_type = (
-            IntervalType.SECONDS
-            if ckpt_config.interval_type == "seconds"
-            else IntervalType.STEPS
-        )
+        self.interval_type = IntervalType.SECONDS if ckpt_config.interval_type == "seconds" else IntervalType.STEPS
         self.interval = ckpt_config.interval
         self.begin_time = 0
         self.time_sync_work = None
@@ -231,12 +217,9 @@ def __init__(
             self.staging_id = None
             self.staging_stream = torch.cuda.Stream()
         else:
-            raise ValueError(
-                f"Unkown checkpoint async_mode {ckpt_config.async_mode}")
+            raise ValueError(f"Unkown checkpoint async_mode {ckpt_config.async_mode}")

-        logger.info(
-            f"Checkpointing active. Checkpoints will be loaded from and saved to {self.folder}"
-        )
+        logger.info(f"Checkpointing active. Checkpoints will be loaded from and saved to {self.folder}")

     def __del__(self):
         if self.enable_checkpoint and self.mp and self.mp.is_alive():
@@ -268,16 +251,12 @@ def _save_last_step(self, curr_step: int) -> None:
             self.states.pop("freqs_cis")

             if self.export_dtype != torch.float32:
-                self.states = {
-                    k: v.to(self.export_dtype) for k, v in self.states.items()
-                }
+                self.states = {k: v.to(self.export_dtype) for k, v in self.states.items()}
             logger.info(
-                f"Saving a model weights only checkpoint in {self.export_dtype} "
-                f"at last step, step {curr_step}."
+                f"Saving a model weights only checkpoint in {self.export_dtype} " f"at last step, step {curr_step}."
             )
         else:
-            logger.info(
-                f"Saving a full checkpoint at last step, step {curr_step}.")
+            logger.info(f"Saving a full checkpoint at last step, step {curr_step}.")

         dcp.save(self.states, checkpoint_id=self._create_checkpoint_id(curr_step))
         self.reset()
@@ -287,18 +266,13 @@ def _should_save(self, curr_step: int, force: bool = False) -> bool:
             return False

         if not force:
-            if self.interval_type == IntervalType.STEPS and not (
-                curr_step % self.interval == 0
-            ):
+            if self.interval_type == IntervalType.STEPS and not (curr_step % self.interval == 0):
                 return False
             if self.interval_type == IntervalType.SECONDS:
-                time_sync_result = (time.monotonic() -
-                                    self.begin_time) >= self.interval
+                time_sync_result = (time.monotonic() - self.begin_time) >= self.interval
                 self.time_sync_result = torch.tensor(int(time_sync_result))
                 if self.time_sync_work is None:
-                    self.time_sync_work = dist.all_reduce(
-                        self.time_sync_result, group=self.pg, async_op=True
-                    )
+                    self.time_sync_work = dist.all_reduce(self.time_sync_result, group=self.pg, async_op=True)
                     return False
                 elif curr_step % 5 == 4:
                     self.time_sync_work.wait()
@@ -319,31 +293,25 @@ def _should_save(self, curr_step: int, force: bool = False) -> bool:

     def _async_wait(self) -> None:
         if self.async_mode == AsyncMode.ASYNC_WITH_PINNED_MEM:
-            logger.debug(
-                f"Waiting for the background process to finish, {time.monotonic()=}.:.2f"
-            )
+            logger.debug(f"Waiting for the background process to finish, {time.monotonic()=}.:.2f")
             if not self.mp.is_alive():
-                raise RuntimeError(
-                    "The checkpoint background process is dead.")
+                raise RuntimeError("The checkpoint background process is dead.")
             _ = self.mp_queue_recv.get()
         elif self.async_mode == AsyncMode.ASYNC:
             if self.async_future is not None:
                 self.async_future.result()

     def _async_with_pinned_memory(self, checkpoint_id: str) -> None:
         try:
-            from torch.distributed._state_dict_utils import (
-                _copy_state_dict, _create_cpu_state_dict)
+            from torch.distributed._state_dict_utils import _copy_state_dict, _create_cpu_state_dict
         except ImportError as e:
             raise ImportError(
                 "Please install the latest PyTorch nightly to use async checkpointing with pinned memory."
             ) from e
         state_dict = dcp.state_dict_saver._stateful_to_state_dict(self.states)
         if self.cpu_offload_state_dict is None:
             logger.debug(f"Preparing the CPU memory, {time.monotonic()=}.:.2f")
-            self.cpu_offload_state_dict = _create_cpu_state_dict(
-                state_dict, pin_memory=True
-            )
+            self.cpu_offload_state_dict = _create_cpu_state_dict(state_dict, pin_memory=True)

         logger.debug(f"Staging the state_dict, {time.monotonic()=}.:.2f")
         with torch.cuda.stream(self.staging_stream):
@@ -374,9 +342,7 @@ def save(self, curr_step: int, force: bool = False) -> None:
         elif self.async_mode == AsyncMode.ASYNC_WITH_PINNED_MEM:
             self._async_with_pinned_memory(checkpoint_id)
         elif self.async_mode == AsyncMode.ASYNC:
-            self.async_future = dcp.async_save(
-                self.states, checkpoint_id=checkpoint_id, process_group=self.pg
-            )
+            self.async_future = dcp.async_save(self.states, checkpoint_id=checkpoint_id, process_group=self.pg)
         else:
             dcp.save(self.states, checkpoint_id=checkpoint_id)
         self.reset()
@@ -388,16 +354,10 @@ def save(self, curr_step: int, force: bool = False) -> None:
         )

     def maybe_wait_for_staging(self) -> None:
-        if (
-            self.enable_checkpoint
-            and self.async_mode == AsyncMode.ASYNC_WITH_PINNED_MEM
-            and self.staging
-        ):
+        if self.enable_checkpoint and self.async_mode == AsyncMode.ASYNC_WITH_PINNED_MEM and self.staging:
             logger.debug(f"Waiting for staging, {time.monotonic()=:.2f}.")
             self.staging_stream.synchronize()
-            logger.debug(
-                f"Sending the state dict to the background process, {time.monotonic()=:.2f}."
-            )
+            logger.debug(f"Sending the state dict to the background process, {time.monotonic()=:.2f}.")
             self.mp_queue_send.put((self.staging_state_dict, self.staging_id))
             self.staging = False

@@ -413,8 +373,7 @@ def load(self, step: int = -1) -> bool:
             step_counts = []
             for filename in os.listdir(self.folder):
                 match = re.search(r"step-(\d+)", filename)
-                metadata_probe = os.path.join(
-                    self.folder, filename, ".metadata")
+                metadata_probe = os.path.join(self.folder, filename, ".metadata")
                 if match and os.path.isfile(metadata_probe):
                     step_counts.append(int(match.group(1)))
             if not step_counts:
@@ -429,9 +388,7 @@ def load(self, step: int = -1) -> bool:
             states,
             checkpoint_id=self._create_checkpoint_id(step),
         )
-        logger.info(
-            f"Finished loading the checkpoint in {time.monotonic() - begin:.2f} seconds."
-        )
+        logger.info(f"Finished loading the checkpoint in {time.monotonic() - begin:.2f} seconds.")
         return True

     def _purge_stale_checkpoints(self):
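
For reference only (not part of the diff above): a minimal sketch of how the Stateful wrappers touched in this file plug into torch.distributed.checkpoint (DCP). The names ToyWrapper and CHECKPOINT_ID, the single unsharded nn.Linear stand-in, and the single-process usage are assumptions for illustration; in a real multi-rank job the process group would already be initialized, as the CheckpointManager here expects.

# Illustrative sketch, not code from this PR.
import torch.distributed.checkpoint as dcp
import torch.nn as nn
from torch.distributed.checkpoint.state_dict import (
    get_model_state_dict,
    set_model_state_dict,
)
from torch.distributed.checkpoint.stateful import Stateful

CHECKPOINT_ID = "checkpoints/step-100"  # hypothetical checkpoint directory


class ToyWrapper(Stateful):
    """Same idea as the model wrapper in this file: make a module DCP-saveable."""

    def __init__(self, model: nn.Module) -> None:
        self.model = model

    def state_dict(self):
        # Called by dcp.save(): extract the model's state dict in DCP form.
        return get_model_state_dict(self.model)

    def load_state_dict(self, state_dict) -> None:
        # Called by dcp.load(): push the loaded values back into the live module.
        set_model_state_dict(self.model, state_dict)


model = nn.Linear(8, 8)  # stand-in for a real model part
states = {"model": ToyWrapper(model)}

dcp.save(states, checkpoint_id=CHECKPOINT_ID)  # writes .metadata plus shard files
dcp.load(states, checkpoint_id=CHECKPOINT_ID)  # restores the module in place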