-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
238 lines (204 loc) · 7.98 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
#!/usr/bin/env python
# coding=utf-8
"""
This file is modified from the huggingface example for finetuning language models
[run_clm.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py)
"""
import logging
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
import sys
from typing import Optional
from functools import partial
import datasets
import torch
import torch.distributed as dist
import deepspeed
from datasets import load_dataset
from torch.utils.data import Dataset
from dataclasses import dataclass, field
from typing import Optional, List, Union
import transformers
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
HfArgumentParser,
DataCollatorForSeq2Seq,
set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from sft_trainer import SFTTrainer
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@dataclass
class TrainingArguments(transformers.TrainingArguments):
loss: str = field(
default="gem", metadata={"help": "Loss name", "choices": ["gem", "ce"]}
)
gem_beta: float = field(default=0.7, metadata={"help": "Hyper-parameter in GEM."})
gem_h: str = field(
default="logsigmoid", metadata={"help": "Hyper-parameter in GEM.", "choices": ["logsigmoid", "linear"]}
)
@dataclass
class ModelArguments:
model_name_or_path: str = field(
metadata={
"help": "Path to pretrained model or model identifier from huggingface.co/models"
}
)
cache_dir: Optional[str] = field(
default=None,
metadata={
"help": "Where do you want to store the pretrained models downloaded from huggingface.co"
},
)
use_flash_attn: bool = field(
default=True,
metadata={"help": "Overwrite the cached training and evaluation sets"},
)
@dataclass
class DataArguments:
train_tokenized_file: str = field(
default=None, metadata={"help": "huggingface dataset name or local data path"}
)
test_tokenized_file: str = field(
default=None, metadata={"help": "huggingface dataset name or local data path"}
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": (
"For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
)
},
)
max_seq_length: Optional[int] = field(
default=None,
metadata={
"help": (
"The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
)
},
)
overwrite_cache: bool = field(
default=False,
metadata={"help": "Overwrite the cached training and evaluation sets"},
)
class CustomDataset(Dataset):
def __init__(
self,
training_args,
data_args,
model_args,
train_tokenized_file,
):
self.training_args = training_args
self.data_args = data_args
self.model_args = model_args
raw_datasets = load_dataset(
"json",
data_files=[train_tokenized_file],
cache_dir=self.model_args.cache_dir,
)
self.data = raw_datasets["train"]
if self.data_args.max_train_samples is not None:
max_samples = min(len(self.data), self.data_args.max_train_samples)
self.data = self.data.select(range(max_samples))
def __len__(self):
return len(self.data)
def __getitem__(self, item):
example = self.data[item]
assert "input_ids" in example
assert "labels" in example
example = {k: torch.tensor(v, dtype=torch.long) for k, v in example.items()}
return example
def main():
parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
model_args, data_args, training_args = parser.parse_json_file(
json_file=os.path.abspath(sys.argv[1])
)
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)
if training_args.should_log:
# The default of training_args.log_level is passive, so we set log level at info here to have that default.
transformers.utils.logging.set_verbosity_info()
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary:
global_rank = dist.get_rank()
logger.warning(
f"Process rank: {global_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
)
logger.info(f"Training parameters {training_args}")
# Set seed before initializing model.
set_seed(training_args.seed)
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
if "llama-3" in tokenizer.name_or_path.lower() and tokenizer.pad_token is None:
tokenizer.pad_token_id = len(tokenizer) - 1
tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id)
model = AutoModelForCausalLM.from_pretrained(
model_args.model_name_or_path,
torch_dtype="auto",
attn_implementation=(
"flash_attention_2" if model_args.use_flash_attn else "eager"
),
)
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
# gather deepspeed to get "real" embedding size
embeddings = model.get_input_embeddings()
with deepspeed.zero.GatheredParameters(embeddings.weight, modifier_rank=None):
embedding_size = embeddings.weight.shape[0]
# resize does its own gather
if len(tokenizer) > embedding_size:
# pad to multiple for tensor cores.
logging.warning(f"len(tokenizer) > embedding_size!!! we are resizing...")
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
# set up datasets
train_dataset = CustomDataset(training_args, data_args, model_args, data_args.train_tokenized_file)
if data_args.test_tokenized_file:
test_dataset = CustomDataset(training_args, data_args, model_args, data_args.test_tokenized_file)
else:
test_dataset = None
# initalize a trainer
# here we use a custom trainer that moves the model to CPU when saving the checkpoint in FSDP mode
# we can switch to the default trainer after moving to deepspeed (let's don't change too much for now)
trainer = SFTTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=test_dataset,
tokenizer=tokenizer,
data_collator=DataCollatorForSeq2Seq(
tokenizer=tokenizer, model=model, padding="longest"
),
preprocess_logits_for_metrics=None,
compute_metrics=None,
)
# Training
logger.info("*** Train ***")
checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint)
if "llama-3" in model.config.name_or_path.lower() and isinstance(model.generation_config.eos_token_id, int):
model.generation_config.eos_token_id = [128001, 128009]
trainer.save_model() # Saves the tokenizer too for easy upload
metrics = train_result.metrics
metrics["train_samples"] = len(train_dataset)
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
if __name__ == "__main__":
main()