From 61cc35e2c78649e32402cc5f3718204970a5a615 Mon Sep 17 00:00:00 2001
From: Johannes Hagemann
Date: Mon, 7 Oct 2024 02:35:19 +0000
Subject: [PATCH] comment out print memory stuff

---
 configs/10B/H100.toml |  6 +++---
 configs/13B/H100.toml | 25 +++++++++++++++++++++++++
 src/zeroband/train.py | 14 +++++++-------
 3 files changed, 35 insertions(+), 10 deletions(-)
 create mode 100644 configs/13B/H100.toml

diff --git a/configs/10B/H100.toml b/configs/10B/H100.toml
index 9c92b99f..9a0edf67 100644
--- a/configs/10B/H100.toml
+++ b/configs/10B/H100.toml
@@ -2,12 +2,12 @@ name_model = "10B"
 project = "johannes_debug"
 
 [train]
-micro_bs = 2
+micro_bs = 4
 sharding_strategy = "SHARD_GRAD_OP"
-ac_ckpt = true
+# ac_ckpt = true
 # torch_compile = false
 [train.memory_profiler]
-freq = 100
+freq = 200
 snapshot_dir = "./snapshots"
 
 [optim]
diff --git a/configs/13B/H100.toml b/configs/13B/H100.toml
new file mode 100644
index 00000000..f3e854d7
--- /dev/null
+++ b/configs/13B/H100.toml
@@ -0,0 +1,25 @@
+name_model = "13B"
+project = "johannes_debug"
+
+[train]
+micro_bs = 1
+sharding_strategy = "SHARD_GRAD_OP"
+ac_ckpt = true
+# torch_compile = false
+[train.memory_profiler]
+freq = 200
+snapshot_dir = "./snapshots"
+
+[optim]
+batch_size = 128 #2M tokens bs
+warmup_steps = 1000
+total_steps = 88_000
+lr = 3e-4
+
+[data]
+seq_length = 2048
+fake = true
+
+# [ckpt]
+# interval = 10
+# path = "./ckpt"
\ No newline at end of file
diff --git a/src/zeroband/train.py b/src/zeroband/train.py
index 2cde85d6..68ac2952 100644
--- a/src/zeroband/train.py
+++ b/src/zeroband/train.py
@@ -245,21 +245,21 @@ def train(config: Config):
 
             model.set_requires_gradient_sync(not is_accumulating)
 
-            # Profile memory before moving tensors to GPU
-            print(f"Memory before moving tensors to GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
+            # Profile memory before moving tensors to GPU
+            # print(f"Memory before moving tensors to GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
 
             input_ids = batch["input_ids"].to("cuda")
             labels = batch["labels"].to("cuda")
 
             # Profile memory after moving tensors to GPU
-            print(f"Memory after moving tensors to GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
-            print(f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+            # print(f"Memory after moving tensors to GPU: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
+            # print(f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
 
             # with model.no_sync() if is_accumulating else nullcontext():
             logits = model(tokens=input_ids).contiguous()
 
             # Profile memory after the forward pass
-            print(f"Memory after forward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
+            # print(f"Memory after forward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
 
             flatten_logits = rearrange(logits, "b seq vocab -> (b seq) vocab")
             flatten_labels = rearrange(labels, "b seq -> (b seq)")
@@ -272,7 +272,7 @@ def train(config: Config):
             loss_batch += loss.detach()
 
             # Profile memory after backward pass
-            print(f"Memory after backward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
+            # print(f"Memory after backward pass: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
 
         clip_grad_norm_(model.parameters(), 1.0)  # gradient clipping
         # model.clip_grad_norm_(1.0)
@@ -363,7 +363,7 @@ def train(config: Config):
 
         if config.train.memory_monitor:
             logger.info(f"outer step peak gpu stats: {gpu_mem_monitor.format_peak_states()}")
-        print(f"Max memory allocated so far: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+        # print(f"Max memory allocated so far: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
 
         if training_progress.step >= config.optim.total_steps:
             # we only allow to break outside of the inner loop.
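
Note: the patch silences the ad-hoc memory prints by commenting them out in place. A lighter-touch alternative would be to fold them into a small helper gated by a flag, so they can be toggled without re-editing train.py. Below is a minimal sketch assuming only PyTorch; the log_cuda_memory helper and its enabled argument are hypothetical and not part of the zeroband codebase.

    import torch

    def log_cuda_memory(tag: str, enabled: bool = False) -> None:
        # No-op unless explicitly enabled and CUDA is present.
        if not enabled or not torch.cuda.is_available():
            return
        allocated_mb = torch.cuda.memory_allocated() / 1024**2
        peak_mb = torch.cuda.max_memory_allocated() / 1024**2
        print(f"[{tag}] allocated: {allocated_mb:.2f} MB | peak: {peak_mb:.2f} MB")

    # Hypothetical call sites, mirroring the prints the patch disables:
    # log_cuda_memory("after forward pass", enabled=config.train.memory_profiler is not None)
    # log_cuda_memory("after backward pass", enabled=config.train.memory_profiler is not None)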