WSD LR Scheduler #67

Merged
merged 3 commits on Oct 9, 2024
52 changes: 52 additions & 0 deletions src/zeroband/lr_scheduler.py
@@ -0,0 +1,52 @@
from torch.optim.lr_scheduler import LambdaLR
from functools import partial
import math
from transformers.optimization import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup


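# Warmup-stable-decay (WSD) lambda: linear warmup to the base lr, held constant until
# `num_stable_steps`, then a square-root decay that reaches 0 at `num_training_steps`.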
def _get_linear_schedule_with_wsd_sqrt_lr_lambda(current_step: int, *, num_warmup_steps: int, num_stable_steps: int, num_training_steps: int):
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    elif current_step < num_stable_steps:
        return 1.0
    else:
        return max(0.0, 1 - math.sqrt(float(current_step - num_stable_steps) / float(num_training_steps - num_stable_steps)))

def get_linear_schedule_with_wsd_sqrt(optimizer, num_warmup_steps: int, num_stable_steps: int, num_training_steps: int, last_epoch: int=-1):
    """
    Create a warmup-stable-decay (WSD) schedule: the learning rate increases linearly from 0 to the initial lr set in
    the optimizer during the warmup phase, is held constant until `num_stable_steps`, then decays to 0 following a
    square-root curve over the remaining steps.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_stable_steps (`int`):
            The step index (counted from step 0, warmup included) up to which the learning rate is held constant.
        num_training_steps (`int`):
            The total number of training steps.
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    lr_lambda = partial(
        _get_linear_schedule_with_wsd_sqrt_lr_lambda,
        num_warmup_steps=num_warmup_steps,
        num_stable_steps=num_stable_steps,
        num_training_steps=num_training_steps,
    )
    return LambdaLR(optimizer, lr_lambda, last_epoch)

SCHED_MAP = {
    "cosine": get_cosine_schedule_with_warmup,
    "wsd-sqrt": get_linear_schedule_with_wsd_sqrt,
    "linear": get_linear_schedule_with_warmup
}

def get_scheduler(sched_type: str, optimizer, num_warmup_steps: int, num_stable_steps: int, num_training_steps: int):
    if 'wsd' in sched_type:
        return SCHED_MAP[sched_type](optimizer, num_warmup_steps=num_warmup_steps, num_stable_steps=num_stable_steps, num_training_steps=num_training_steps)
    else:
        return SCHED_MAP[sched_type](optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
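
As an aside (not part of this PR's diff), a minimal sketch of driving `get_scheduler` from a training loop, assuming a single toy parameter, `torch.optim.AdamW`, and illustrative step counts rather than the repo's defaults:

import torch

from zeroband.lr_scheduler import get_scheduler

# Toy setup purely to exercise the schedule shape.
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=3e-4)

scheduler = get_scheduler(
    sched_type="wsd-sqrt",
    optimizer=optimizer,
    num_warmup_steps=10,     # linear warmup over the first 10 steps
    num_stable_steps=80,     # hold the base lr until step 80
    num_training_steps=100,  # sqrt decay to 0 over the last 20 steps
)

for step in range(100):
    optimizer.step()
    scheduler.step()
    print(step, scheduler.get_last_lr()[0])

The printed curve should sit flat at 3e-4 between steps 10 and 80, then fall off as 1 - sqrt((t - 80) / 20) times the base lr.
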
14 changes: 8 additions & 6 deletions src/zeroband/train.py
@@ -8,10 +8,7 @@
from einops import rearrange
from torch.nn import functional as F

-from transformers import (
-    AutoTokenizer,
-    get_cosine_schedule_with_warmup,
-)
+from transformers import AutoTokenizer

from torch.distributed._composable.fsdp import fully_shard, MixedPrecisionPolicy

@@ -34,6 +31,7 @@
from zeroband.utils.world_info import get_world_info
from zeroband.utils.logging import get_logger
from zeroband.checkpoint import CkptManager, TrainingProgress
+from zeroband.lr_scheduler import get_scheduler


class OptimConfig(BaseConfig):
@@ -42,7 +40,9 @@ class OptimConfig(BaseConfig):
    adam_betas1: float = 0.9
    adam_betas2: float = 0.95

+    sched_type: Literal["cosine", "linear", "wsd-sqrt"] = "cosine"
    warmup_steps: int = 1000
+    stable_steps: int = 80_000
    total_steps: int = 88_000
    batch_size: int = 512

@@ -214,9 +214,11 @@ def train(config: Config):
    if config.diloco is not None:
        diloco = Diloco(config.diloco, model, elastic_device_mesh)

-    scheduler = get_cosine_schedule_with_warmup(
-        inner_optimizer,
+    scheduler = get_scheduler(
+        sched_type=config.optim.sched_type,
+        optimizer=inner_optimizer,
        num_warmup_steps=config.optim.warmup_steps,
+        num_stable_steps=config.optim.stable_steps,
        num_training_steps=config.optim.total_steps,
    )

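With the defaults above and sched_type="wsd-sqrt", the inner optimizer warms up for 1,000 steps, holds the base learning rate until step 80,000, and then follows the square-root decay over the final 8,000 steps to step 88,000.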