     TogetherClient,
     TogetherRequest,
     TrainingType,
+    FinetuneLRScheduler,
+    FinetuneLinearLRSchedulerArgs,
 )
 from together.types.finetune import DownloadCheckpointType
 from together.utils import log_warn_once, normalize_key
@@ -35,7 +37,10 @@ def createFinetuneRequest(
     n_checkpoints: int | None = 1,
     batch_size: int | Literal["max"] = "max",
     learning_rate: float | None = 0.00001,
-    warmup_ratio: float | None = 0.0,
+    min_lr_ratio: float = 0.0,
+    warmup_ratio: float = 0.0,
+    max_grad_norm: float = 1.0,
+    weight_decay: float = 0.0,
     lora: bool = False,
     lora_r: int | None = None,
     lora_dropout: float | None = 0,
@@ -83,6 +88,20 @@ def createFinetuneRequest(
     if warmup_ratio > 1 or warmup_ratio < 0:
         raise ValueError("Warmup ratio should be between 0 and 1")
 
+    if min_lr_ratio is not None and (min_lr_ratio > 1 or min_lr_ratio < 0):
+        raise ValueError("Min learning rate ratio should be between 0 and 1")
+
+    if max_grad_norm < 0:
+        raise ValueError("Max gradient norm should be non-negative")
+
+    if weight_decay is not None and (weight_decay < 0):
+        raise ValueError("Weight decay should be non-negative")
+
+    lrScheduler = FinetuneLRScheduler(
+        lr_scheduler_type="linear",
+        lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
+    )
+
     finetune_request = FinetuneRequest(
         model=model,
         training_file=training_file,
@@ -92,7 +111,10 @@ def createFinetuneRequest(
         n_checkpoints=n_checkpoints,
         batch_size=batch_size,
         learning_rate=learning_rate,
+        lr_scheduler=lrScheduler,
         warmup_ratio=warmup_ratio,
+        max_grad_norm=max_grad_norm,
+        weight_decay=weight_decay,
         training_type=training_type,
         suffix=suffix,
         wandb_key=wandb_api_key,
@@ -117,7 +139,10 @@ def create(
         n_checkpoints: int | None = 1,
         batch_size: int | Literal["max"] = "max",
         learning_rate: float | None = 0.00001,
-        warmup_ratio: float | None = 0.0,
+        min_lr_ratio: float = 0.0,
+        warmup_ratio: float = 0.0,
+        max_grad_norm: float = 1.0,
+        weight_decay: float = 0.0,
         lora: bool = False,
         lora_r: int | None = None,
         lora_dropout: float | None = 0,
@@ -143,7 +168,11 @@ def create(
             batch_size (int or "max"): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
+            min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
+                the learning rate scheduler. Defaults to 0.0.
             warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
+            max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
+            weight_decay (float, optional): Weight decay. Defaults to 0.0.
             lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
             lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
             lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -185,7 +214,10 @@ def create(
             n_checkpoints=n_checkpoints,
             batch_size=batch_size,
             learning_rate=learning_rate,
+            min_lr_ratio=min_lr_ratio,
             warmup_ratio=warmup_ratio,
+            max_grad_norm=max_grad_norm,
+            weight_decay=weight_decay,
             lora=lora,
             lora_r=lora_r,
             lora_dropout=lora_dropout,
@@ -436,7 +468,10 @@ async def create(
         n_checkpoints: int | None = 1,
         batch_size: int | Literal["max"] = "max",
         learning_rate: float | None = 0.00001,
-        warmup_ratio: float | None = 0.0,
+        min_lr_ratio: float = 0.0,
+        warmup_ratio: float = 0.0,
+        max_grad_norm: float = 1.0,
+        weight_decay: float = 0.0,
         lora: bool = False,
         lora_r: int | None = None,
         lora_dropout: float | None = 0,
@@ -462,7 +497,11 @@ async def create(
             batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
             learning_rate (float, optional): Learning rate multiplier to use for training
                 Defaults to 0.00001.
+            min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
+                the learning rate scheduler. Defaults to 0.0.
             warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
+            max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
+            weight_decay (float, optional): Weight decay. Defaults to 0.0.
             lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
             lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
             lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -504,7 +543,10 @@ async def create(
             n_checkpoints=n_checkpoints,
             batch_size=batch_size,
             learning_rate=learning_rate,
+            min_lr_ratio=min_lr_ratio,
             warmup_ratio=warmup_ratio,
+            max_grad_norm=max_grad_norm,
+            weight_decay=weight_decay,
             lora=lora,
             lora_r=lora_r,
             lora_dropout=lora_dropout,
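
For context, a minimal usage sketch of the new hyperparameters, assuming the synchronous client forwards them through client.fine_tuning.create() as the hunks above suggest; the file ID and model name below are placeholders, not real values:

# Sketch only: passes the new training arguments added in this diff.
from together import Together

client = Together()  # expects TOGETHER_API_KEY in the environment

job = client.fine_tuning.create(
    training_file="file-xxxxxxxx",        # placeholder file ID
    model="meta-llama/Meta-Llama-3-8B",   # placeholder model name
    learning_rate=1e-5,
    min_lr_ratio=0.1,    # linear scheduler decays LR to 10% of its initial value
    warmup_ratio=0.05,   # spend 5% of training steps warming up
    max_grad_norm=1.0,   # gradient clipping threshold; 0 disables clipping
    weight_decay=0.01,
)
print(job.id)

Per the createFinetuneRequest hunks, min_lr_ratio is wrapped into a FinetuneLRScheduler with lr_scheduler_type="linear" before being sent, while max_grad_norm and weight_decay are passed through on the FinetuneRequest directly.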