WSD LR Scheduler #67

Merged
merged 3 commits on Oct 9, 2024
52 changes: 52 additions & 0 deletions src/zeroband/lr_scheduler.py
@@ -0,0 +1,52 @@
from torch.optim.lr_scheduler import LambdaLR
from functools import partial
import math
from transformers.optimization import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup


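# Warmup-stable-decay (WSD) lambda: linear warmup to the base lr, held constant until
# `num_stable_steps`, then a square-root decay that reaches 0 at `num_training_steps`.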
def _get_linear_schedule_with_wsd_sqrt_lr_lambda(current_step: int, *, num_warmup_steps: int, num_stable_steps: int, num_training_steps: int):
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    elif current_step < num_stable_steps:
        return 1.0
    else:
        return max(0.0, 1 - math.sqrt(float(current_step - num_stable_steps) / float(num_training_steps - num_stable_steps)))

def get_linear_schedule_with_wsd_sqrt(optimizer, num_warmup_steps: int, num_stable_steps: int, num_training_steps: int, last_epoch: int=-1):
    """
    Create a warmup-stable-decay (WSD) schedule: the learning rate increases linearly from 0 to the initial lr set in
    the optimizer during the warmup phase, is held constant until `num_stable_steps`, then decays to 0 following a
    square-root curve over the remaining steps.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_stable_steps (`int`):
            The step index (counted from step 0, warmup included) up to which the learning rate is held constant.
        num_training_steps (`int`):
            The total number of training steps.
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    lr_lambda = partial(
        _get_linear_schedule_with_wsd_sqrt_lr_lambda,
        num_warmup_steps=num_warmup_steps,
        num_stable_steps=num_stable_steps,
        num_training_steps=num_training_steps,
    )
    return LambdaLR(optimizer, lr_lambda, last_epoch)

SCHED_MAP = {
    "cosine": get_cosine_schedule_with_warmup,
    "wsd-sqrt": get_linear_schedule_with_wsd_sqrt,
    "linear": get_linear_schedule_with_warmup
}

def get_scheduler(sched_type: str, optimizer, num_warmup_steps: int, num_stable_steps: int, num_training_steps: int):
    if 'wsd' in sched_type:
        return SCHED_MAP[sched_type](optimizer, num_warmup_steps=num_warmup_steps, num_stable_steps=num_stable_steps, num_training_steps=num_training_steps)
    else:
        return SCHED_MAP[sched_type](optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
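
As an aside (not part of this PR's diff), a minimal sketch of driving `get_scheduler` from a training loop, assuming a single toy parameter, `torch.optim.AdamW`, and illustrative step counts rather than the repo's defaults:

import torch

from zeroband.lr_scheduler import get_scheduler

# Toy setup purely to exercise the schedule shape.
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=3e-4)

scheduler = get_scheduler(
    sched_type="wsd-sqrt",
    optimizer=optimizer,
    num_warmup_steps=10,     # linear warmup over the first 10 steps
    num_stable_steps=80,     # hold the base lr until step 80
    num_training_steps=100,  # sqrt decay to 0 over the last 20 steps
)

for step in range(100):
    optimizer.step()
    scheduler.step()
    print(step, scheduler.get_last_lr()[0])

The printed curve should sit flat at 3e-4 between steps 10 and 80, then fall off as 1 - sqrt((t - 80) / 20) times the base lr.
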
14 changes: 8 additions & 6 deletions src/zeroband/train.py
@@ -8,10 +8,7 @@
from einops import rearrange
from torch.nn import functional as F

-from transformers import (
-    AutoTokenizer,
-    get_cosine_schedule_with_warmup,
-)
+from transformers import AutoTokenizer

from torch.distributed._composable.fsdp import fully_shard, MixedPrecisionPolicy

@@ -34,6 +31,7 @@
from zeroband.utils.world_info import get_world_info
from zeroband.utils.logging import get_logger
from zeroband.checkpoint import CkptManager, TrainingProgress
+from zeroband.lr_scheduler import get_scheduler


class OptimConfig(BaseConfig):
@@ -42,7 +40,9 @@ class OptimConfig(BaseConfig):
    adam_betas1: float = 0.9
    adam_betas2: float = 0.95

+    sched_type: Literal["cosine", "linear", "wsd-sqrt"] = "cosine"
    warmup_steps: int = 1000
+    stable_steps: int = 80_000
    total_steps: int = 88_000
    batch_size: int = 512

@@ -214,9 +214,11 @@ def train(config: Config):
    if config.diloco is not None:
        diloco = Diloco(config.diloco, model, elastic_device_mesh)

-    scheduler = get_cosine_schedule_with_warmup(
-        inner_optimizer,
+    scheduler = get_scheduler(
+        sched_type=config.optim.sched_type,
+        optimizer=inner_optimizer,
        num_warmup_steps=config.optim.warmup_steps,
+        num_stable_steps=config.optim.stable_steps,
        num_training_steps=config.optim.total_steps,
    )

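With the defaults above and sched_type="wsd-sqrt", the inner optimizer warms up for 1,000 steps, holds the base learning rate until step 80,000, and then follows the square-root decay over the final 8,000 steps to step 88,000.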