Skip to content

Commit

Permalink
Tensor signature: add DTensor support
Browse files (browse the repository at this point in the history)
  • Loading branch information
samsja committed Oct 7, 2024
1 parent 254268d commit c7605c8
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 6 deletions.
12 changes: 6 additions & 6 deletions src/zeroband/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,13 +304,13 @@ def train(config: Config):
memory_profiler.step()

if config.diloco is not None:
# if config.train.log_model_hash:
# with FSDP.summon_full_params(model):
# logger.debug("Pre diloco model: %s", get_module_signature(model))
if config.train.log_model_hash:
logger.debug("Pre diloco model: %s", get_module_signature(model))

diloco.step(model)
# if config.train.log_model_hash:
# with FSDP.summon_full_params(model):
# logger.debug("Post diloco model: %s", get_module_signature(model))

if config.train.log_model_hash:
logger.debug("Post diloco model: %s", get_module_signature(model))

training_progress.outer_step += 1

Expand Down
5 changes: 5 additions & 0 deletions src/zeroband/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Any
import torch
from torch.distributed.fsdp import ShardingStrategy
from torch.distributed._tensor.api import DTensor

from zeroband.utils.logging import get_logger

Expand Down Expand Up @@ -105,6 +106,10 @@ def get_tensor_signature(a: torch.Tensor | torch.nn.Parameter) -> str:
"""
while isinstance(a, torch.nn.Parameter):
a = a.data

if isinstance(a, DTensor):
a = a.full_tensor()

if a.numel() < TENSOR_SIG_SAMPLE_SIZE:
b = a.as_strided(size=(a.numel(),), stride=(1,))
else:
Expand Down

0 comments on commit c7605c8

Please sign in to comment.