
Commit

comment out print memory stuff
JohannesHa committed Oct 7, 2024
1 parent 5c01232 commit 61cc35e
Showing 3 changed files with 35 additions and 10 deletions.
6 changes: 3 additions & 3 deletions configs/10B/H100.toml
```diff
@@ -2,12 +2,12 @@ name_model = "10B"
 project = "johannes_debug"
 
 [train]
-micro_bs = 2
+micro_bs = 4
 sharding_strategy = "SHARD_GRAD_OP"
-ac_ckpt = true
+# ac_ckpt = true
 # torch_compile = false
 [train.memory_profiler]
-freq = 100
+freq = 200
 snapshot_dir = "./snapshots"
 
 [optim]
```
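This commit doubles `micro_bs` from 2 to 4 and comments out activation checkpointing for the 10B config. Since the global `batch_size` in `[optim]` stays at 128, a larger `micro_bs` means fewer gradient-accumulation micro-steps per optimizer step. A minimal sketch of that accounting, assuming the usual data-parallel setup where the global batch is split across ranks and the remainder is accumulated locally (the helper name and the 8-GPU world size are illustrative, not zeroband's actual code):

```python
# Hypothetical helper illustrating how micro_bs relates to the global batch size.
def grad_accum_steps(batch_size: int, micro_bs: int, world_size: int) -> int:
    """Number of forward/backward micro-steps per optimizer step."""
    # The global batch must divide evenly across ranks and micro-batches.
    assert batch_size % (micro_bs * world_size) == 0
    return batch_size // (micro_bs * world_size)

# Assuming 8 data-parallel ranks with this config:
print(grad_accum_steps(128, 2, 8))  # 8 micro-steps before the change
print(grad_accum_steps(128, 4, 8))  # 4 micro-steps after the change
```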
25 changes: 25 additions & 0 deletions configs/13B/H100.toml
```diff
@@ -0,0 +1,25 @@
+name_model = "13B"
+project = "johannes_debug"
+
+[train]
+micro_bs = 1
+sharding_strategy = "SHARD_GRAD_OP"
+ac_ckpt = true
+# torch_compile = false
+[train.memory_profiler]
+freq = 200
+snapshot_dir = "./snapshots"
+
+[optim]
+batch_size = 128 #2M tokens bs
+warmup_steps = 1000
+total_steps = 88_000
+lr = 3e-4
+
+[data]
+seq_length = 2048
+fake = true
+
+# [ckpt]
+# interval = 10
+# path = "./ckpt"
```
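The new 13B config reuses the `[train.memory_profiler]` table: a CUDA memory snapshot every `freq = 200` steps, written to `snapshot_dir`. A minimal sketch of how such a profiler can be built on PyTorch's snapshot API (`torch.cuda.memory._record_memory_history` / `_dump_snapshot`) — an assumption for illustration, not zeroband's actual implementation:

```python
import os
import torch

class MemoryProfiler:
    """Dump a CUDA memory snapshot every `freq` steps (sketch, not zeroband's code)."""

    def __init__(self, freq: int, snapshot_dir: str):
        self.freq = freq
        self.snapshot_dir = snapshot_dir
        os.makedirs(snapshot_dir, exist_ok=True)
        # Start recording allocator events; must be enabled before dumping snapshots.
        torch.cuda.memory._record_memory_history()

    def step(self, step: int) -> None:
        if step % self.freq == 0:
            path = os.path.join(self.snapshot_dir, f"snapshot_{step}.pickle")
            # Snapshots can be inspected at https://pytorch.org/memory_viz
            torch.cuda.memory._dump_snapshot(path)
```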
14 changes: 7 additions & 7 deletions src/zeroband/train.py
```diff
@@ -245,21 +245,21 @@ def train(config: Config):
 
 model.set_requires_gradient_sync(not is_accumulating)
 
-# Profile memory before moving tensors to GPU
-print(f"Memory before moving tensors to GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
+# Profile memory before moving tensors to GPU
+# print(f"Memory before moving tensors to GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
 
 input_ids = batch["input_ids"].to("cuda")
 labels = batch["labels"].to("cuda")
 
 # Profile memory after moving tensors to GPU
-print(f"Memory after moving tensors to GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
-print(f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+# print(f"Memory after moving tensors to GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
+# print(f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
 
 # with model.no_sync() if is_accumulating else nullcontext():
 logits = model(tokens=input_ids).contiguous()
 
 # Profile memory after the forward pass
-print(f"Memory after forward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
+# print(f"Memory after forward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
 
 flatten_logits = rearrange(logits, "b seq vocab -> (b seq) vocab")
 flatten_labels = rearrange(labels, "b seq -> (b seq)")
@@ -272,7 +272,7 @@ def train(config: Config):
 loss_batch += loss.detach()
 
 # Profile memory after backward pass
-print(f"Memory after backward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
+# print(f"Memory after backward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
 
 clip_grad_norm_(model.parameters(), 1.0) # gradient clipping
 # model.clip_grad_norm_(1.0)
@@ -363,7 +363,7 @@ def train(config: Config):
 if config.train.memory_monitor:
     logger.info(f"outer step peak gpu stats: {gpu_mem_monitor.format_peak_states()}")
 
-print(f"Max memory allocated so far: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+# print(f"Max memory allocated so far: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
 
 if training_progress.step >= config.optim.total_steps:
     # we only allow to break outside of the inner loop.
```
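The ad-hoc `print` calls are commented out here rather than deleted. An alternative that avoids toggling comments is to gate the same measurements behind a flag; a hedged sketch (the `ZERO_BAND_DEBUG_MEM` env var and `log_cuda_memory` helper are hypothetical, not part of the repo):

```python
import os
import torch

# Hypothetical debug switch; the prints run only when it is set to "1".
_DEBUG_MEM = os.environ.get("ZERO_BAND_DEBUG_MEM", "0") == "1"

def log_cuda_memory(tag: str) -> None:
    """Print current and peak CUDA allocator usage when debugging is enabled."""
    if _DEBUG_MEM:
        allocated = torch.cuda.memory_allocated() / 1024**2
        peak = torch.cuda.max_memory_allocated() / 1024**2
        print(f"[mem] {tag}: {allocated:.2f} MB allocated, {peak:.2f} MB peak")

# Usage at the commented-out call sites, e.g.:
# log_cuda_memory("after forward pass")
```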
