
Commit

comment out print memory stuff
JohannesHa committed Oct 7, 2024
1 parent 5c01232 commit 61cc35e
Showing 3 changed files with 35 additions and 10 deletions.
6 changes: 3 additions & 3 deletions configs/10B/H100.toml
```diff
@@ -2,12 +2,12 @@ name_model = "10B"
 project = "johannes_debug"
 
 [train]
-micro_bs = 2
+micro_bs = 4
 sharding_strategy = "SHARD_GRAD_OP"
-ac_ckpt = true
+# ac_ckpt = true
 # torch_compile = false
 [train.memory_profiler]
-freq = 100
+freq = 200
 snapshot_dir = "./snapshots"
 
 [optim]
```
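This commit doubles `micro_bs` from 2 to 4 and comments out activation checkpointing for the 10B config. Since the global `batch_size` in `[optim]` stays at 128, a larger `micro_bs` means fewer gradient-accumulation micro-steps per optimizer step. A minimal sketch of that accounting, assuming the usual data-parallel setup where the global batch is split across ranks and the remainder is accumulated locally (the helper name and the 8-GPU world size are illustrative, not zeroband's actual code):

```python
# Hypothetical helper illustrating how micro_bs relates to the global batch size.
def grad_accum_steps(batch_size: int, micro_bs: int, world_size: int) -> int:
    """Number of forward/backward micro-steps per optimizer step."""
    # The global batch must divide evenly across ranks and micro-batches.
    assert batch_size % (micro_bs * world_size) == 0
    return batch_size // (micro_bs * world_size)

# Assuming 8 data-parallel ranks with this config:
print(grad_accum_steps(128, 2, 8))  # 8 micro-steps before the change
print(grad_accum_steps(128, 4, 8))  # 4 micro-steps after the change
```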
25 changes: 25 additions & 0 deletions configs/13B/H100.toml
```diff
@@ -0,0 +1,25 @@
+name_model = "13B"
+project = "johannes_debug"
+
+[train]
+micro_bs = 1
+sharding_strategy = "SHARD_GRAD_OP"
+ac_ckpt = true
+# torch_compile = false
+[train.memory_profiler]
+freq = 200
+snapshot_dir = "./snapshots"
+
+[optim]
+batch_size = 128 #2M tokens bs
+warmup_steps = 1000
+total_steps = 88_000
+lr = 3e-4
+
+[data]
+seq_length = 2048
+fake = true
+
+# [ckpt]
+# interval = 10
+# path = "./ckpt"
```
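The new 13B config reuses the `[train.memory_profiler]` table: a CUDA memory snapshot every `freq = 200` steps, written to `snapshot_dir`. A minimal sketch of how such a profiler can be built on PyTorch's snapshot API (`torch.cuda.memory._record_memory_history` / `_dump_snapshot`) — an assumption for illustration, not zeroband's actual implementation:

```python
import os
import torch

class MemoryProfiler:
    """Dump a CUDA memory snapshot every `freq` steps (sketch, not zeroband's code)."""

    def __init__(self, freq: int, snapshot_dir: str):
        self.freq = freq
        self.snapshot_dir = snapshot_dir
        os.makedirs(snapshot_dir, exist_ok=True)
        # Start recording allocator events; must be enabled before dumping snapshots.
        torch.cuda.memory._record_memory_history()

    def step(self, step: int) -> None:
        if step % self.freq == 0:
            path = os.path.join(self.snapshot_dir, f"snapshot_{step}.pickle")
            # Snapshots can be inspected at https://pytorch.org/memory_viz
            torch.cuda.memory._dump_snapshot(path)
```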
14 changes: 7 additions & 7 deletions src/zeroband/train.py
```diff
@@ -245,21 +245,21 @@ def train(config: Config):
 
 model.set_requires_gradient_sync(not is_accumulating)
 
-# Profile memory before moving tensors to GPU
-print(f"Memory before moving tensors to GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
+# Profile memory before moving tensors to GPU
+# print(f"Memory before moving tensors to GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
 
 input_ids = batch["input_ids"].to("cuda")
 labels = batch["labels"].to("cuda")
 
 # Profile memory after moving tensors to GPU
-print(f"Memory after moving tensors to GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
-print(f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+# print(f"Memory after moving tensors to GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
+# print(f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
 
 # with model.no_sync() if is_accumulating else nullcontext():
 logits = model(tokens=input_ids).contiguous()
 
 # Profile memory after the forward pass
-print(f"Memory after forward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
+# print(f"Memory after forward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
 
 flatten_logits = rearrange(logits, "b seq vocab -> (b seq) vocab")
 flatten_labels = rearrange(labels, "b seq -> (b seq)")
@@ -272,7 +272,7 @@ def train(config: Config):
 loss_batch += loss.detach()
 
 # Profile memory after backward pass
-print(f"Memory after backward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
+# print(f"Memory after backward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
 
 clip_grad_norm_(model.parameters(), 1.0) # gradient clipping
 # model.clip_grad_norm_(1.0)
@@ -363,7 +363,7 @@ def train(config: Config):
 if config.train.memory_monitor:
     logger.info(f"outer step peak gpu stats: {gpu_mem_monitor.format_peak_states()}")
 
-print(f"Max memory allocated so far: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+# print(f"Max memory allocated so far: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
 
 if training_progress.step >= config.optim.total_steps:
     # we only allow to break outside of the inner loop.
```
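The ad-hoc `print` calls are commented out here rather than deleted. An alternative that avoids toggling comments is to gate the same measurements behind a flag; a hedged sketch (the `ZERO_BAND_DEBUG_MEM` env var and `log_cuda_memory` helper are hypothetical, not part of the repo):

```python
import os
import torch

# Hypothetical debug switch; the prints run only when it is set to "1".
_DEBUG_MEM = os.environ.get("ZERO_BAND_DEBUG_MEM", "0") == "1"

def log_cuda_memory(tag: str) -> None:
    """Print current and peak CUDA allocator usage when debugging is enabled."""
    if _DEBUG_MEM:
        allocated = torch.cuda.memory_allocated() / 1024**2
        peak = torch.cuda.max_memory_allocated() / 1024**2
        print(f"[mem] {tag}: {allocated:.2f} MB allocated, {peak:.2f} MB peak")

# Usage at the commented-out call sites, e.g.:
# log_cuda_memory("after forward pass")
```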
