
Commit 5df18c7

runame authored and facebook-github-bot committed
Add support for mypy type checking (#37)
Summary: Pull Request resolved: facebookresearch/optimizers#37
Reviewed By: anana10c
Differential Revision: D65546537
Pulled By: tsunghsienlee
fbshipit-source-id: 0eaf18d0c732101d5634c693e45fd5511e349fa5
1 parent 397ad17 commit 5df18c7

25 files changed: +152 −119 lines
New GitHub Actions workflow

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+name: type-check-mypy
+
+on: [push, pull_request]
+
+jobs:
+  mypy:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up and update uv.
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          uv self update
+      - name: Install Python.
+        run: uv python install 3.10
+      - name: Create venv and install the package.
+        run: |
+          uv venv && source .venv/bin/activate
+          uv pip install -e ".[dev]"
+      - name: Run type checking with mypy.
+        run: |
+          source .venv/bin/activate
+          make type-check
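The final step defers to `make type-check`; the Makefile target itself is not part of this commit, so the exact mypy invocation is not shown here. As a rough local stand-in, assuming the target simply runs mypy over the `distributed_shampoo` package with default flags (an assumption, not taken from this diff), the same check can be driven from Python through mypy's documented programmatic API, `mypy.api.run`:

# Sketch of a local equivalent of the CI type-check step, assuming
# "make type-check" just invokes mypy on the package (path/flags are guesses).
from mypy import api


def run_type_check(target: str = "distributed_shampoo") -> None:
    # api.run mirrors the CLI and returns (stdout, stderr, exit_status).
    stdout, stderr, exit_status = api.run([target])
    if stdout:
        print(stdout, end="")
    if exit_status != 0:
        raise SystemExit(f"mypy reported problems (exit status {exit_status})")


if __name__ == "__main__":
    run_type_check()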

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ We actively welcome your pull requests for existing optimizers.
 2. If you've added code that should be tested, add tests.
 3. If you've changed APIs, update the documentation.
 4. Ensure the test suite passes. To run the subset of the tests that can be run on CPU use `make test`; to run the tests for a single GPU use `make test-gpu` and to run the subset of tests that require 2-4 GPUs use `make test-multi-gpu`.
-5. Make sure your code lints. You can use `make lint` and `make format` to automatically lint and format the code where possible.
+5. Make sure your code lints. You can use `make lint` and `make format` to automatically lint and format the code where possible. Use `make type-check` for type checking.
 6. If you haven't already, complete the Contributor License Agreement ("CLA").

 ## Contributor License Agreement ("CLA")

distributed_shampoo/distributed_shampoo.py

Lines changed: 11 additions & 14 deletions

@@ -502,18 +502,18 @@ def _instantiate_distributor(
        if distributed_config is None:
            distributor = Distributor
        elif type(distributed_config) is DDPShampooConfig:
-            distributor = partial(DDPDistributor, distributed_config=distributed_config)
+            distributor = partial(DDPDistributor, distributed_config=distributed_config)  # type: ignore[assignment]
        elif type(distributed_config) is FSDPShampooConfig:
            distributor = partial(
                FSDPDistributor, distributed_config=distributed_config
-            )
+            )  # type: ignore[assignment]
        elif type(distributed_config) is FullyShardShampooConfig:
            distributor = FullyShardDistributor
        elif type(distributed_config) is HSDPShampooConfig:
            distributor = partial(
                HSDPDistributor,
                distributed_config=distributed_config,
-            )
+            )  # type: ignore[assignment]
        else:
            raise NotImplementedError(f"{distributed_config=} not supported!")

@@ -808,10 +808,7 @@ def _compute_and_log_root_inverse_residuals(
        Uses infinity norm to evaluate residuals and errors.
        """

-        # Accumulate relative errors/residuals
-        relative_errors = []
-        relative_residuals = []
-
+        # Compute relative errors/residuals for each group.
        for (group_index, group), state_lists in zip(
            enumerate(self.param_groups), self._per_group_state_lists, strict=True
        ):

@@ -827,12 +824,12 @@ def _compute_and_log_root_inverse_residuals(
                )
                continue

-            relative_errors, relative_residuals = state_lists[
-                SHAMPOO_PRECONDITIONER_LIST
-            ].compute_root_inverse_residuals()
-
-            relative_errors = torch.stack(relative_errors)
-            relative_residuals = torch.stack(relative_residuals)
+            relative_errors, relative_residuals = map(
+                torch.stack,
+                state_lists[
+                    SHAMPOO_PRECONDITIONER_LIST
+                ].compute_root_inverse_residuals(),
+            )

            quantiles = torch.as_tensor(
                [0, 0.25, 0.5, 0.75, 1],

@@ -1141,7 +1138,7 @@ def _per_group_step_impl(
        )

    @torch.no_grad()
-    def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:
+    def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:  # type: ignore[override]
        """Performs a single optimization step.

        Args:
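Most of the suppressions in this file follow one pattern: a variable is first bound to a class, so mypy infers `type[...]` for it, and a later branch rebinds it to a `functools.partial`, which mypy reports as an incompatible assignment. A standalone sketch of that pattern (class and function names are illustrative, not copied from the repository):

# Minimal sketch of the mypy [assignment] error suppressed above.
from functools import partial


class BaseDistributor:
    def __init__(self, flavor: str = "default") -> None:
        self.flavor = flavor


def pick_distributor(use_ddp: bool) -> BaseDistributor:
    distributor = BaseDistributor  # mypy infers: type[BaseDistributor]
    if use_ddp:
        # mypy: Incompatible types in assignment (expression has type
        # "partial[BaseDistributor]", variable has type "type[BaseDistributor]")
        distributor = partial(BaseDistributor, flavor="ddp")  # type: ignore[assignment]
    return distributor()


if __name__ == "__main__":
    print(pick_distributor(use_ddp=True).flavor)

The `# type: ignore[override]` on `step` serves a related purpose: mypy compares the overriding signature against the base `torch.optim.Optimizer.step` stub and the suppression accepts the deliberate mismatch rather than changing the runtime signature.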

distributed_shampoo/examples/default_cifar10_example.py

Lines changed: 7 additions & 3 deletions

@@ -40,7 +40,7 @@ def train_default_model(
    loss_function: nn.Module,
    data_loader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
-    device: str,
+    device: torch.device,
    epochs: int = 1,
    window_size: int = 100,
) -> Tuple[float, float, int]:

@@ -62,7 +62,11 @@ def train_default_model(
        metrics.update(loss)
        metrics.log()

-    return metrics._lifetime_loss, metrics._window_loss, metrics._iteration
+    return (
+        metrics._lifetime_loss.item(),
+        metrics._window_loss.item(),
+        metrics._iteration,
+    )


if __name__ == "__main__":

@@ -97,7 +101,7 @@ def train_default_model(
    set_seed(args.seed)

    # check cuda availability and set device
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # instantiate model and loss function
    model, loss_function = get_model_and_loss_fn(device)
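The return-statement change above exists because the running losses are 0-d tensors while the function is annotated to return `Tuple[float, float, int]`; `.item()` converts each 0-d tensor to a plain Python number so the value matches the annotation. A self-contained sketch of the same pattern (the function name is illustrative):

# Minimal sketch of why .item() is needed for a Tuple[float, float, int] return.
from typing import Tuple

import torch


def summarize_losses(
    lifetime_loss: torch.Tensor, window_loss: torch.Tensor, iteration: int
) -> Tuple[float, float, int]:
    # .item() turns a 0-d tensor into a plain Python number; returning the
    # tensors directly would not satisfy the declared float components.
    return lifetime_loss.item(), window_loss.item(), iteration


if __name__ == "__main__":
    print(summarize_losses(torch.tensor(0.25), torch.tensor(0.10), 100))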

distributed_shampoo/examples/fully_shard_cifar10_example.py

Lines changed: 9 additions & 4 deletions

@@ -9,7 +9,7 @@

import logging
import os
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple

import torch
import torch.distributed as dist

@@ -53,7 +53,7 @@ def train_fully_shard_model(
    sampler: torch.utils.data.Sampler,
    data_loader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
-    device: Union[str, torch.device],
+    device: torch.device,
    epochs: int = 1,
    window_size: int = 100,
    use_distributed_checkpoint: bool = False,

@@ -71,7 +71,7 @@ def train_fully_shard_model(
    # main training loop
    for epoch in range(epochs):
        metrics._epoch = epoch
-        sampler.set_epoch(epoch)
+        sampler.set_epoch(epoch)  # type: ignore[attr-defined]

        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)

@@ -89,6 +89,7 @@ def train_fully_shard_model(

    # checkpoint optimizer and model using distributed checkpointing solution
    if use_distributed_checkpoint and isinstance(optimizer, DistributedShampoo):
+        assert checkpoint_dir is not None
        state_dict = {
            "model": model.state_dict(),
            "optim": optimizer.distributed_state_dict(

@@ -100,7 +101,11 @@ def train_fully_shard_model(
            storage_writer=dist_checkpoint.FileSystemWriter(checkpoint_dir),
        )

-    return metrics._lifetime_loss, metrics._window_loss, metrics._iteration
+    return (
+        metrics._lifetime_loss.item(),
+        metrics._window_loss.item(),
+        metrics._iteration,
+    )


def create_model_and_optimizer_and_loss_fn(args, device):
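The added `assert checkpoint_dir is not None` is a standard mypy narrowing idiom: after the assert, an `Optional[str]` argument is treated as `str`, so passing it to a writer that requires a non-optional path no longer errors. A standalone sketch of the idiom (the writer function below is a stand-in, not the torch.distributed.checkpoint API):

# Minimal sketch of Optional narrowing via assert, as used above.
from typing import Optional


def write_checkpoint(state: dict[str, object], path: str) -> None:
    # Stand-in for a writer that requires a non-optional path.
    print(f"writing {len(state)} entries to {path}")


def maybe_checkpoint(
    state: dict[str, object],
    use_checkpoint: bool,
    checkpoint_dir: Optional[str] = None,
) -> None:
    if use_checkpoint:
        # Narrows Optional[str] to str for mypy; without it, the call below
        # would be an [arg-type] error.
        assert checkpoint_dir is not None
        write_checkpoint(state, checkpoint_dir)


if __name__ == "__main__":
    maybe_checkpoint({"model": {}, "optim": {}}, True, "/tmp/checkpoint")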

distributed_shampoo/examples/trainer_utils.py

Lines changed: 15 additions & 13 deletions

@@ -12,7 +12,7 @@
import logging
import random
from abc import ABC, abstractmethod
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple

import numpy as np

@@ -39,7 +39,7 @@
    PreconditionerComputationConfig,
)
from torch import nn
-from torchvision import datasets, transforms
+from torchvision import datasets, transforms  # type: ignore[import-untyped]

logger = logging.getLogger(__name__)

@@ -79,10 +79,10 @@ class PreconditionerComputationType(enum.Enum):
###### ARGPARSER ######
def enum_type_parse(s: str, enum_type: enum.Enum):
    try:
-        return enum_type[s]
+        return enum_type[s]  # type: ignore[index]
    except KeyError:
        raise argparse.ArgumentTypeError(
-            "Use one of {}".format(", ".join([t.name for t in enum_type]))
+            "Use one of {}".format(", ".join([t.name for t in enum_type]))  # type: ignore[attr-defined]
        )


@@ -349,7 +349,7 @@ def log(self): ...
    def reset(self): ...

    @abstractmethod
-    def update(self): ...
+    def update(self, loss: torch.Tensor): ...


class LossMetrics(Metrics):

@@ -365,7 +365,7 @@ def __init__(
        self._device = device
        self._epoch = 0
        self._iteration = 0
-        self._window_losses = []
+        self._window_losses: list[torch.Tensor] = []
        self._window_loss = torch.tensor(0.0, device=device)
        self._accumulated_loss = torch.tensor(0.0, device=device)
        self._lifetime_loss = torch.tensor(0.0, device=device)

@@ -461,15 +461,15 @@ def instantiate_optimizer(
                betas=betas,
                eps=epsilon,
                weight_decay=weight_decay,
-            )
+            )  # type: ignore[assignment]
        else:
            optimizer = torch.optim.Adam(
                model.parameters(),
                lr=lr,
                betas=betas,
                eps=epsilon,
                weight_decay=weight_decay,
-            )
+            )  # type: ignore[assignment]
    elif optimizer_type == OptimizerType.DISTRIBUTED_SHAMPOO:
        optimizer = DistributedShampoo(
            model.parameters(),

@@ -500,7 +500,7 @@ def instantiate_optimizer(
            preconditioner_computation_config=instantiate_preconditioner_computation_config(
                preconditioner_computation_type
            ),
-        )
+        )  # type: ignore[assignment]
    else:
        raise ValueError(f"Invalid OptimizerType {optimizer_type}!")

@@ -576,8 +576,10 @@ def get_data_loader_and_sampler(
    dataset = datasets.CIFAR10(
        data_path, train=True, download=True, transform=transform
    )
-    sampler = torch.utils.data.distributed.DistributedSampler(
-        dataset, num_replicas=world_size, rank=rank, shuffle=True
+    sampler: torch.utils.data.distributed.DistributedSampler = (
+        torch.utils.data.distributed.DistributedSampler(
+            dataset, num_replicas=world_size, rank=rank, shuffle=True
+        )
    )
    return (
        torch.utils.data.DataLoader(

@@ -636,7 +638,7 @@ def train_model(
    sampler: torch.utils.data.Sampler,
    data_loader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
-    device: Union[str, torch.device],
+    device: torch.device,
    epochs: int = 1,
    window_size: int = 100,
    local_rank: int = 0,

@@ -647,7 +649,7 @@ def train_model(
    # main training loop
    for epoch in range(epochs):
        metrics._epoch = epoch
-        sampler.set_epoch(epoch)
+        sampler.set_epoch(epoch)  # type: ignore[attr-defined]

        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)
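Two of the fixes above recur throughout this commit: suppressing `import-untyped` for `torchvision`, which does not ship type information mypy can use, and annotating empty containers so mypy can infer an element type. A small sketch of the second pattern (the dict's value type is an illustrative choice, not taken from the repository):

# Minimal sketch of annotating empty containers for mypy.
import torch

# mypy cannot infer an element type from "[]" or "{}" alone and reports
# "Need type annotation" (var-annotated) unless the variable is annotated,
# as below, or the line is suppressed with "# type: ignore[var-annotated]".
window_losses: list[torch.Tensor] = []
window_losses.append(torch.tensor(0.5))

expected_metadata: dict[str, int] = {}  # illustrative value type

print(len(window_losses), expected_metadata)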

distributed_shampoo/gpu_tests/shampoo_pt2_test.py

Lines changed: 2 additions & 2 deletions

@@ -86,12 +86,12 @@ def _test_shampoo_baseline_and_pt2(

    @staticmethod
    def _shampoo_optim_factory(
-        shampoo_pt2_compile_config: ShampooPT2CompileConfig,
+        shampoo_pt2_compile_config: ShampooPT2CompileConfig | None,
        precondition_frequency: int,
        start_preconditioning_step: int,
        weight_decay: float,
        betas: tuple[float, float],
-        grafting_config: GraftingConfig,
+        grafting_config: GraftingConfig | None,
    ) -> Callable[[ParamsT], torch.optim.Optimizer]:
        return lambda parameters: DistributedShampoo(
            parameters,
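Both annotations above are widened to `X | None` because the tests pass `None` for these arguments; with the non-optional annotations, mypy flags each such call as an `[arg-type]` error. A standalone sketch of the pattern using Python 3.10+ union syntax (the config class here is a stand-in, not the repository's `GraftingConfig`):

# Minimal sketch of widening a parameter annotation to "X | None".
from dataclasses import dataclass


@dataclass
class StubGraftingConfig:
    beta: float = 0.9


def make_optimizer(grafting_config: StubGraftingConfig | None) -> str:
    # With a bare "StubGraftingConfig" annotation, the call below that passes
    # None would be reported by mypy as [arg-type].
    if grafting_config is None:
        return "no grafting"
    return f"grafting(beta={grafting_config.beta})"


if __name__ == "__main__":
    print(make_optimizer(None))
    print(make_optimizer(StubGraftingConfig(beta=0.95)))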

distributed_shampoo/shampoo_types.py

Lines changed: 1 addition & 1 deletion

@@ -130,7 +130,7 @@ class PrecisionConfig:

@dataclass
class AbstractDataclass:
-    def __new__(cls, *args: Any, **kwargs: Any) -> Optional["AbstractDataclass"]:
+    def __new__(cls, *args: Any, **kwargs: Any) -> "AbstractDataclass":
        if cls == AbstractDataclass or cls.__bases__[0] == AbstractDataclass:
            raise TypeError(f"Cannot instantiate abstract class: {cls.__name__}.")
        return super().__new__(cls)
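The fix above tightens the `__new__` return annotation: the method either returns an instance or raises, so `Optional[...]` advertises a `None` that can never occur and makes mypy flag downstream constructions. A simplified, runnable sketch of the same idea (class names are illustrative and the base-class check is reduced to its essence):

# Minimal sketch of annotating __new__ with the class type instead of Optional.
from dataclasses import dataclass
from typing import Any


@dataclass
class AbstractConfig:
    # __new__ either raises or returns an instance, never None, so the
    # non-optional return annotation matches the actual behaviour.
    def __new__(cls, *args: Any, **kwargs: Any) -> "AbstractConfig":
        if cls is AbstractConfig:
            raise TypeError(f"Cannot instantiate abstract class: {cls.__name__}.")
        return super().__new__(cls)


@dataclass
class ConcreteConfig(AbstractConfig):
    value: int = 0


if __name__ == "__main__":
    print(ConcreteConfig(value=3))  # works; AbstractConfig() would raise TypeError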

distributed_shampoo/utils/gpu_tests/shampoo_dist_utils_test.py

Lines changed: 3 additions & 5 deletions

@@ -42,8 +42,7 @@ def _verify_deivce_mesh(self, device_mesh: DeviceMesh) -> None:
            (shard_mesh.get_group(), replicate_mesh.get_group()),
        )

-    # type: ignore
-    @with_comms
+    @with_comms  # type: ignore
    def test_get_device_mesh(self) -> None:
        mesh = tuple(
            map(

@@ -57,8 +56,7 @@ def test_get_device_mesh(self) -> None:

        self._verify_deivce_mesh(
            device_mesh=get_device_mesh(
-                # type: ignore
-                device_type=self.device_type,
+                device_type=self.device_type,  # type: ignore
                mesh=mesh,
                mesh_dim_names=("replicate", "shard"),
            )

@@ -72,7 +70,7 @@ def test_get_device_mesh(self) -> None:
                "__init__",
            ) as mock_device_mesh_init:
                device_mesh = get_device_mesh(
-                    device_type=self.device_type,
+                    device_type=self.device_type,  # type: ignore[attr-defined]
                    mesh=mesh,
                    mesh_dim_names=("replicate", "shard"),
                )
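The changes above only move existing `# type: ignore` comments: mypy applies such a comment to the line it sits on, so a bare comment on its own line above the decorator or argument suppressed nothing. A small sketch of the placement rule (the call and error code below are illustrative):

# Minimal sketch of "# type: ignore" placement.
def count_items(items: list[int]) -> int:
    return len(items)


# A bare "# type: ignore" on its own line above the call would not silence the
# error; mypy only honours the comment when it is on the offending line itself.
result = count_items(("a", "b"))  # type: ignore[arg-type]  # deliberate mismatch for illustration

print(result)  # runtime is unaffected: len() works on the tuple too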

distributed_shampoo/utils/gpu_tests/shampoo_fsdp_utils_test.py

Lines changed: 3 additions & 3 deletions

@@ -102,7 +102,7 @@ def test_compile_fsdp_parameter_metadata_with_no_flat_param(self) -> None:
        fsdp_model = FSDP(model, use_orig_params=True, ignored_states=params)
        actual_fsdp_parameter_metadata = compile_fsdp_parameter_metadata(fsdp_model)

-        expected_fsdp_parameter_metadata = {}
+        expected_fsdp_parameter_metadata = {}  # type: ignore[var-annotated]

        self.assertEqual(
            actual_fsdp_parameter_metadata, expected_fsdp_parameter_metadata

@@ -117,7 +117,7 @@ def world_size(self) -> int:

    @skip_if_lt_x_gpu(4)
    def test_parse_fsdp_params(self) -> None:
-        HYBRID_SHARDING_STRATEGIES_TO_EXPECTED_KEYS = {
+        HYBRID_SHARDING_STRATEGIES_TO_EXPECTED_KEYS = {  # type: ignore[var-annotated]
            ShardingStrategy.HYBRID_SHARD: (
                [],
                [

@@ -135,7 +135,7 @@ def test_parse_fsdp_params(self) -> None:
                ["1.weight"],
            ),
        }
-        SHARDING_STRATEGIES_TO_EXPECTED_KEYS = {
+        SHARDING_STRATEGIES_TO_EXPECTED_KEYS = {  # type: ignore[var-annotated]
            ShardingStrategy.NO_SHARD: (
                [],
                [],

distributed_shampoo/utils/gpu_tests/shampoo_fully_shard_distributor_test.py

Lines changed: 6 additions & 4 deletions

@@ -108,14 +108,16 @@ def _train_model(
        if uses_fully_shard:
            # When FullyShard is used, model parameters are DTensors. We obtain the full value of
            # parameters from DTensors.
-            params = []
+            params_list = []
            for param in model.parameters():
                # Need this assertion to get pass type-checking test.
                assert isinstance(param, DTensor)
-                params.append(param.full_tensor().view(-1).detach().cpu())
+                params_list.append(param.full_tensor().view(-1).detach().cpu())
        else:
-            params = [param.view(-1).detach().cpu() for param in model.parameters()]
-        return params, objective.detach().cpu()
+            params_list = [
+                param.view(-1).detach().cpu() for param in model.parameters()
+            ]
+        return params_list, objective.detach().cpu()

    @staticmethod
    def _test_two_configs(
