From 61cc35e2c78649e32402cc5f3718204970a5a615 Mon Sep 17 00:00:00 2001
From: Johannes Hagemann
Date: Mon, 7 Oct 2024 02:35:19 +0000
Subject: [PATCH] comment out print memory stuff

---
 configs/10B/H100.toml |  6 +++---
 configs/13B/H100.toml | 25 +++++++++++++++++++++++++
 src/zeroband/train.py | 14 +++++++-------
 3 files changed, 35 insertions(+), 10 deletions(-)
 create mode 100644 configs/13B/H100.toml

diff --git a/configs/10B/H100.toml b/configs/10B/H100.toml
index 9c92b99f..9a0edf67 100644
--- a/configs/10B/H100.toml
+++ b/configs/10B/H100.toml
@@ -2,12 +2,12 @@ name_model = "10B"
 project = "johannes_debug"
 
 [train]
-micro_bs = 2
+micro_bs = 4
 sharding_strategy = "SHARD_GRAD_OP"
-ac_ckpt = true
+# ac_ckpt = true
 # torch_compile = false
 [train.memory_profiler]
-freq = 100
+freq = 200
 snapshot_dir = "./snapshots"
 
 [optim]
diff --git a/configs/13B/H100.toml b/configs/13B/H100.toml
new file mode 100644
index 00000000..f3e854d7
--- /dev/null
+++ b/configs/13B/H100.toml
@@ -0,0 +1,25 @@
+name_model = "13B"
+project = "johannes_debug"
+
+[train]
+micro_bs = 1
+sharding_strategy = "SHARD_GRAD_OP"
+ac_ckpt = true
+# torch_compile = false
+[train.memory_profiler]
+freq = 200
+snapshot_dir = "./snapshots"
+
+[optim]
+batch_size = 128 #2M tokens bs
+warmup_steps = 1000
+total_steps = 88_000
+lr = 3e-4
+
+[data]
+seq_length = 2048
+fake = true
+
+# [ckpt]
+# interval = 10
+# path = "./ckpt"
\ No newline at end of file
diff --git a/src/zeroband/train.py b/src/zeroband/train.py
index 2cde85d6..68ac2952 100644
--- a/src/zeroband/train.py
+++ b/src/zeroband/train.py
@@ -245,21 +245,21 @@ def train(config: Config):
 
             model.set_requires_gradient_sync(not is_accumulating)
 
-            # Profile memory before moving tensors to GPU
-            print(f"Memory before moving tensors to GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
+            # Profile memory before moving tensors to GPU
+            # print(f"Memory before moving tensors to GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
 
             input_ids = batch["input_ids"].to("cuda")
             labels = batch["labels"].to("cuda")
 
             # Profile memory after moving tensors to GPU
-            print(f"Memory after moving tensors to GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
-            print(f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+            # print(f"Memory after moving tensors to GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
+            # print(f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
 
             # with model.no_sync() if is_accumulating else nullcontext():
             logits = model(tokens=input_ids).contiguous()
 
             # Profile memory after the forward pass
-            print(f"Memory after forward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
+            # print(f"Memory after forward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
 
             flatten_logits = rearrange(logits, "b seq vocab -> (b seq) vocab")
             flatten_labels = rearrange(labels, "b seq -> (b seq)")
@@ -272,7 +272,7 @@ def train(config: Config):
             loss_batch += loss.detach()
 
             # Profile memory after backward pass
-            print(f"Memory after backward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
+            # print(f"Memory after backward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
 
         clip_grad_norm_(model.parameters(), 1.0)  # gradient clipping
         # model.clip_grad_norm_(1.0)
@@ -363,7 +363,7 @@ def train(config: Config):
 
         if config.train.memory_monitor:
             logger.info(f"outer step peak gpu stats: {gpu_mem_monitor.format_peak_states()}")
-        print(f"Max memory allocated so far: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+        # print(f"Max memory allocated so far: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
 
         if training_progress.step >= config.optim.total_steps:
             # we only allow to break outside of the inner loop.
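
Note: the patch silences the ad-hoc memory prints by commenting them out in place. A lighter-touch alternative would be to fold them into a small helper gated by a flag, so they can be toggled without re-editing train.py. Below is a minimal sketch assuming only PyTorch; the log_cuda_memory helper and its enabled argument are hypothetical and not part of the zeroband codebase.

    import torch

    def log_cuda_memory(tag: str, enabled: bool = False) -> None:
        # No-op unless explicitly enabled and CUDA is present.
        if not enabled or not torch.cuda.is_available():
            return
        allocated_mb = torch.cuda.memory_allocated() / 1024**2
        peak_mb = torch.cuda.max_memory_allocated() / 1024**2
        print(f"[{tag}] allocated: {allocated_mb:.2f} MB | peak: {peak_mb:.2f} MB")

    # Hypothetical call sites, mirroring the prints the patch disables:
    # log_cuda_memory("after forward pass", enabled=config.train.memory_profiler is not None)
    # log_cuda_memory("after backward pass", enabled=config.train.memory_profiler is not None)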