3 files changed: +4 −3 lines

File 1:

@@ -51,6 +51,7 @@
   # batch / data settings
   "train_micro_batch_size_per_gpu": 4,
   "data-impl": "mmap",
+  "gradient_accumulation_steps": 1,

   # activation checkpointing
   "checkpoint-activations": true,
File 2:

@@ -1,5 +1,6 @@
 # Suggested data paths when using GPT-NeoX locally
 {
+  "global_num_gpus": 1,
   "data-path": "data/enwik8/enwik8_text_document",

   # or for weighted datasets:
@@ -24,7 +25,7 @@

   "tensorboard-dir": "tensorboard",
   "log-dir": "logs",
-  "use_wandb": True,
+  "use_wandb": False,
   "wandb_host": "https://api.wandb.ai",
   "wandb_project": "neox"
 }
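The edited config files are YAML, and YAML 1.1 resolves the Python-style literals `True`/`False` to booleans, which is why flipping the default to `False` disables W&B logging without any parser changes. A minimal sketch of that behavior, assuming PyYAML (this is illustrative, not GPT-NeoX's actual config loader):

```python
import yaml  # PyYAML; assumed to be installed

# YAML 1.1 resolves Python-style True/False to booleans, so the config
# flip above changes the parsed value without any code changes.
snippet = '{ "use_wandb": False, "global_num_gpus": 1 }'
cfg = yaml.safe_load(snippet)
assert cfg["use_wandb"] is False
assert cfg["global_num_gpus"] == 1
```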
File 3:

@@ -941,8 +941,7 @@ class NeoXArgsTraining(NeoXArgsTemplate):
     """

     gas: int = None
-    """gradient_accumulation_steps"""  # TODO this is a duplicate, remove?
-
+    gradient_accumulation_steps: int = 1
     clip_grad: float = None
     """
     Gradient clipping based on global L2 norm.
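The Python change promotes `gradient_accumulation_steps` to a first-class typed field with a default of 1, replacing the docstring that only marked `gas` as its duplicate. A minimal sketch (a stand-in, not the repo's actual class) of the dataclass pattern involved, plus one plausible way the leftover alias could be reconciled:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class TrainingArgsSketch:
    """Illustrative stand-in for NeoXArgsTraining, not the real class."""

    gas: Optional[int] = None  # legacy alias the TODO flags as a duplicate
    gradient_accumulation_steps: int = 1
    clip_grad: Optional[float] = None

    def __post_init__(self):
        # Assumed reconciliation, not NeoX's actual logic: a config that
        # still sets the legacy alias overrides the canonical field.
        if self.gas is not None:
            self.gradient_accumulation_steps = self.gas

# Usage: an old config that only sets `gas` still lands in the new field.
args = TrainingArgsSketch(gas=8)
assert args.gradient_accumulation_steps == 8
```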