config_fine10.py
from dataclasses import dataclass
import math

import torch


@dataclass
class GPTConfig:
    # Model architecture hyperparameters.
    block_size: int = 2048    # maximum context length in tokens
    vocab_size: int = 12000   # tokenizer vocabulary size
    n_layer: int = 12         # number of transformer blocks
    n_head: int = 12          # attention heads per block
    n_embed: int = 768        # embedding / hidden dimension
    dropout: float = 0.1      # dropout probability
    bias: bool = False        # use bias terms in linear / norm layers
    use_rotary: bool = True   # use rotary positional embeddings (RoPE)
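
# Illustrative sketch (not part of the original config): a rough parameter
# count for a GPT-2-style decoder built from GPTConfig. The helper name
# `estimate_params` is hypothetical; it assumes tied input/output embeddings,
# no learned positional table (use_rotary=True), and ignores the small norm
# terms, so treat the result as a ballpark only.
def estimate_params(cfg: GPTConfig) -> int:
    """Approximate trainable parameter count for a standard GPT-style decoder."""
    embed = cfg.vocab_size * cfg.n_embed   # token embedding (tied with the output head)
    attn = 4 * cfg.n_embed**2              # q, k, v and output projections per block
    mlp = 8 * cfg.n_embed**2               # 4x-expansion MLP: up- and down-projection
    return embed + cfg.n_layer * (attn + mlp)
# With the defaults above this comes to roughly 94M parameters.
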
@dataclass
class TrainingConfig:
    # Optimization hyperparameters.
    batch_size: int = 48
    learning_rate: float = 3e-4
    max_iters: int = 100000
    weight_decay: float = 1e-1
    beta1: float = 0.9
    beta2: float = 0.95
    grad_clip: float = 1.0

    # Learning-rate schedule.
    decay_lr: bool = True
    warmup_iters: int = 4000
    lr_decay_iters: int = 100000
    min_lr: float = 3e-5

    # Evaluation and logging.
    eval_interval: int = 500
    log_interval: int = 10
    eval_iters: int = 200

    # System settings.
    gradient_accumulation_steps: int = 5
    device: str = str(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    dtype: str = "bfloat16"
    compile: bool = True

    def get_lr(self, it: int) -> float:
        """Get the learning rate at iteration `it` according to the schedule."""
        # 1) Linear warmup for warmup_iters steps
        if it < self.warmup_iters:
            return self.learning_rate * it / self.warmup_iters
        # 2) If it > lr_decay_iters, return the minimum learning rate
        if it > self.lr_decay_iters:
            return self.min_lr
        # 3) In between, use cosine decay down to the minimum learning rate
        decay_ratio = (it - self.warmup_iters) / (self.lr_decay_iters - self.warmup_iters)
        assert 0 <= decay_ratio <= 1
        coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff decays from 1 to 0
        return self.min_lr + coeff * (self.learning_rate - self.min_lr)
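
# Illustrative usage sketch (not part of the original file): sample the
# warmup + cosine schedule at a few iterations to sanity-check get_lr().
# The expected values assume the default TrainingConfig above.
if __name__ == "__main__":
    cfg = TrainingConfig()
    for it in (0, 2000, 4000, 52000, 100000, 120000):
        # Expected shape: linear ramp to 3e-4 by iteration 4000, cosine decay
        # afterwards, and a 3e-5 floor once `it` exceeds lr_decay_iters.
        print(f"iter {it:>6d}: lr = {cfg.get_lr(it):.2e}")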