train.py
from other_utils.anchor_cluster import change_cfg_file_anchors
from torch.utils.tensorboard import SummaryWriter
from build_utils.utils import check_file
from train_utils import kaist_train_eval_utils as train_util
from train_utils import get_coco_api_from_dataset
from build_utils.torch_utils import select_device
from build_utils.kaist_dataset import *
from models import *
import torch.optim.lr_scheduler as lr_scheduler
import torch.optim as optim
import math  # needed for math.fmod() and the cosine lr schedule below
import datetime
import argparse
import yaml
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
def train(hyp):
    # 1. Select the training device
device = select_device(opt.device)
    # 2. Set up paths for the training result files
weight_best_file = "weights/{}_best.pt".format(opt.name)
results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    # Remove previous result/metric files; glob finds file paths matching a pattern
# for f in glob.glob(results_file):
# os.remove(f)
    # 3. Initialize training settings
cfg = opt.cfg
data = opt.data
epochs = opt.epochs
batch_size = opt.batch_size
accumulate = max(round(64 / batch_size), 1) # accumulate n times before optimizer update (bs 64)
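    # e.g. with the default batch_size=2 this gives accumulate=32: gradients from
    # 32 batches (64 images) are summed before each optimizer step, emulating bs 64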
weights = opt.weights # initial training weights
    imgsz_train = opt.img_size  # training input image size
imgsz_test = opt.img_size # test image sizes
multi_scale = opt.multi_scale
if opt.anchor_cluster:
        # Cluster the training-set bounding boxes and write the resulting anchors back into the cfg file
change_cfg_file_anchors(cfg)
    # 4. Multi-scale training parameters
    # Image sizes must be multiples of 32; the default size of 512 satisfies this
gs = 32 # (pixels) grid size
assert math.fmod(imgsz_test, gs) == 0, "--img-size {} must be a {}-multiple".format(imgsz_test, gs)
    grid_min, grid_max = imgsz_test // gs, imgsz_test // gs  # number of grid cells
    if multi_scale:  # multi-scale training (varying input image sizes)
imgsz_min = opt.img_size // 1.5
imgsz_max = opt.img_size // 0.667
        # round the given min/max input sizes down to multiples of 32
        grid_min, grid_max = imgsz_min // gs, imgsz_max // gs  # min/max grid size
        imgsz_min, imgsz_max = int(grid_min * gs), int(grid_max * gs)  # min/max image size
imgsz_train = imgsz_max # initialize with max size
print("Using multi_scale training, image range({}, {})".format(imgsz_min, imgsz_max))
    # 5. Training-related parameters
    # init_seeds()  # seed the RNGs for reproducible results
    # Parse the *.data config file to get the train/valid image paths and the number of classes
data_dict = parse_data_cfg(data)
train_path = data_dict["train"]
test_path = data_dict["valid"]
nc = 1 if opt.single_cls else int(data_dict["classes"]) # number of classes
    # tuning these loss-weight coefficients is quite effective
hyp["cls"] *= nc / 80 # update coco-tuned hyp['cls'] to current dataset
hyp["obj"] *= imgsz_test / 320
print(f"hyp['box']: {hyp['box']:0.3f}, hyp['obj']: {hyp['obj']:0.3f}. hyp['cls']: {hyp['cls']:0.3f},"
f" {('CIoU Loss' if 'ciou' in hyp else 'GIoU Loss')}")
    # 6. Build the model and freeze part of its weights
model = YOLO(cfg).to(device)
if opt.freeze_layers >= 0:
        # Freeze the first `freeze_layers + 1` layers (the Darknet feature extractor of Double-YOLO-Kaist)
        darknet_end_layers = opt.freeze_layers  # 149 by default for dyolov3
for idx in range(darknet_end_layers + 1):
for parameter in model.module_list[idx].parameters():
parameter.requires_grad_(False)
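        # frozen parameters are excluded from `pg` below, so the optimizer never updates them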
    # 7. Create the optimizer
pg = [p for p in model.parameters() if p.requires_grad]
if opt.sgd:
optimizer = optim.SGD(pg, lr=hyp["lr0"], momentum=hyp["momentum"],
weight_decay=hyp["weight_decay"], nesterov=True)
else:
optimizer = optim.Adam(pg, lr=hyp["lr0"], betas=(hyp['momentum'], 0.999),
weight_decay=hyp['weight_decay'])
    # 8. Load network weights and restore training state from the checkpoint
start_epoch = 0
best_map = 0.0
if weights.endswith(".pt") or weights.endswith(".pth"):
print("load dict model weights from '{}'".format(weights))
ckpt = torch.load(weights, map_location=device)
        # try to load the model weights
try:
ckpt["model"] = {k: v for k, v in ckpt["model"].items()
if k in model.state_dict() and model.state_dict()[k].numel() == v.numel()}
miss, unexpected = model.load_state_dict(ckpt["model"], strict=False)
except KeyError as e:
s = "{} is not compatible with {}. Specify --weights '' or specify a --cfg compatible with {}. " \
.format(opt.weights, opt.cfg, opt.weights)
raise KeyError(s) from e
        # try to restore the optimizer state (the actual load below is currently disabled)
if ckpt["optimizer"] is not None:
# optimizer.load_state_dict(ckpt["optimizer"])
if "best_map" in ckpt.keys():
best_map = ckpt["best_map"]
        # restore previously recorded training results and write them into the results file
if ckpt.get("training_results") is not None:
with open(results_file, "w") as file:
file.write(ckpt["training_results"]) # write results.txt
        # resume from the epoch recorded in the checkpoint
start_epoch = ckpt["epoch"] + 1
if epochs < start_epoch:
print('{} has been trained for {} epochs. Fine-tuning for {} additional epochs.'
.format(opt.weights, ckpt['epoch'], epochs))
epochs += ckpt['epoch'] # finetune additional epochs
del ckpt
elif weights.endswith(".weights"):
print("load binary model weights from '{}'".format(weights))
load_darknet_weights(model, weights, cutoff=opt.cutoff)
    # 9. Create and initialize the learning-rate scheduler
# Scheduler https://arxiv.org/pdf/1812.01187.pdf
lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp["lrf"]) + hyp["lrf"] # cosine
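    # the multiplicative lr factor decays from 1.0 to hyp["lrf"] over `epochs` along a cosine curve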
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    scheduler.last_epoch = start_epoch  # start from the resumed epoch
    # 10. Load the training and validation datasets and build their loaders
    # Training images use the largest size in the multi-scale range
train_dataset = LoadKaistImagesAndLabels(train_path, imgsz_train, batch_size,
augment=True,
hyp=hyp, # augmentation hyperparameters
                                             rect=opt.rect,  # rectangular training (False by default)
snowflake=opt.snow,
single_cls=opt.single_cls)
    # Validation images use img_size (512)
val_dataset = LoadKaistImagesAndLabels(test_path, imgsz_test, batch_size,
hyp=hyp,
rect=True,
snowflake=False,
single_cls=opt.single_cls)
nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers
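    # use at most 8 workers, never more than the CPU count or the batch size
    # (a batch size of 1 falls back to loading in the main process)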
train_dataloader = torch.utils.data.DataLoader(train_dataset,
batch_size=batch_size,
num_workers=nw,
# Shuffle=True unless rectangular training is used
shuffle=not opt.rect,
pin_memory=True,
collate_fn=train_dataset.collate_fn)
val_datasetloader = torch.utils.data.DataLoader(val_dataset,
batch_size=batch_size,
num_workers=nw,
pin_memory=True,
collate_fn=val_dataset.collate_fn)
    # 11. Attach remaining attributes to the model object
model.nc = nc # attach number of classes to model
model.hyp = hyp # attach hyperparameters to model
model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou)
coco = get_coco_api_from_dataset(val_dataset)
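    # build the pycocotools-style index of the validation set once, so each
    # evaluation can reuse it instead of rebuilding it every epoch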
    # 12. Start (or resume) training for the remaining epochs
    left_epochs = epochs - start_epoch
    print("starting training for {} epochs, {} epochs left...".format(epochs, left_epochs))
    print('Using {} dataloader workers'.format(nw))
for epoch in range(start_epoch, epochs):
        # Train for one epoch; returns the mean loss over the epoch and the current learning rate
mloss, lr = train_util.train_one_epoch(model, optimizer, train_dataloader,
device, epoch,
                                               accumulate=accumulate,  # batches to accumulate per optimizer step (~64 images)
                                               img_size=imgsz_train,  # input image size
                                               multi_scale=multi_scale,
                                               grid_min=grid_min,  # minimum grid size
                                               grid_max=grid_max,  # maximum grid size
                                               gs=gs,  # grid step: 32
                                               print_freq=50,  # print a log line every 50 steps
warmup=True)
# update scheduler
scheduler.step()
        # 13. Evaluate on the validation set and record the COCO metrics
        if not opt.notest or epoch == epochs - 1:
# evaluate on the test dataset
result_info = train_util.evaluate(model, val_datasetloader,
coco=coco, device=device)
coco_mAP = result_info[0] # (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ]
voc_mAP = result_info[1] # (AP) @[ IoU=0.50 | area= all | maxDets=100 ]
coco_mAR = result_info[8] # (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ]
            # log the evaluation metrics to TensorBoard
if tb_writer:
tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss', 'train/loss', "learning_rate",
"mAP@[IoU=0.50:0.95]", "mAP@[IoU=0.5]", "mAR@[IoU=0.50:0.95]"]
for x, tag in zip(mloss.tolist() + [lr, coco_mAP, voc_mAP, coco_mAR], tags):
tb_writer.add_scalar(tag, x, epoch)
            # append the metrics to the results text file
with open(results_file, "a") as f:
                # record the 12 COCO metrics plus the total training loss and the lr
result_info = [str(round(i, 4)) for i in result_info + [mloss.tolist()[-1]]] + [str(round(lr, 6))]
txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
f.write(txt + "\n")
            # 14. Save the model weights to a .pt file (every epoch, or only on a new best)
# update best mAP(IoU=0.50:0.95)
if coco_mAP > best_map:
best_map = coco_mAP
            if not opt.save_best:  # save a checkpoint every epoch
with open(results_file, 'r') as f:
save_files = {
'model': model.state_dict(),
'optimizer': optimizer.state_dict(),
'training_results': f.read(),
'epoch': epoch,
'best_map': best_map}
torch.save(save_files, "./weights/{}-{}.pt".format(opt.name, epoch))
            else:  # only save the checkpoint with the best metrics
if best_map == coco_mAP:
with open(results_file, 'r') as f:
save_files = {
'model': model.state_dict(),
'optimizer': optimizer.state_dict(),
'training_results': f.read(),
'epoch': epoch,
'best_map': best_map}
torch.save(save_files, weight_best_file)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
    # The arguments below are the ones that usually need configuring
parser.add_argument('--epochs', type=int, default=50)
parser.add_argument('--batch-size', type=int, default=2)
parser.add_argument('--hyp', type=str, default='config/hyp.scratch.4.yaml', help='hyperparameters path')
parser.add_argument('--cfg', type=str, default='config/kaist_dyolov4_mobilenetv2_fshare_global_cse3.cfg', help="*.cfg path")
parser.add_argument('--weights', type=str, default='weights/pretrained_dyolov4_mobilenetv2_fshare_global_cse3.pt', help='initial weights path')
parser.add_argument('--name', default='kaist_dyolov4_mobilenetv2_fshare_global_cse3', help='renames results.txt to results_name.txt if supplied')
    parser.add_argument('--freeze-layers', type=int, default=-1,
                        help='freeze the feature-extraction layers; -1 means no layers are frozen')
parser.add_argument('--device', default='cuda:0', help='device id (i.e. 0 or 0,1 or cpu)')
    # temporarily enabled arguments
parser.add_argument('--cutoff', type=int, default=104, help="model weights cutoff, when loading files with the suffix '.weights'")
parser.add_argument('--snow', action='store_true', help='use snowflake change to process images')
    # The arguments below rarely need changing
parser.add_argument('--anchor-cluster', action='store_true', help="use cluster algorithm to set anchors")
parser.add_argument('--sgd', action='store_true', help='use torch.optim.SGD() optimizer')
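    # note: argparse's type=bool turns any non-empty string into True, so the
    # type=bool arguments below cannot be switched off from the command line;
    # change their defaults here instead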
parser.add_argument('--single-cls', type=bool, default=True, help='train as single-class dataset')
parser.add_argument('--data', type=str, default='data/kaist_data.data', help='*.data path')
parser.add_argument('--multi-scale', type=bool, default=True,
help='adjust (67%% - 150%%) img_size every 10 batches')
parser.add_argument('--img-size', type=int, default=512, help='test size')
    parser.add_argument('--rect', action='store_true', help='rectangular training')  # do not enable: the rectangular-transform code is buggy
parser.add_argument('--save-best', type=bool, default=True, help='only save best checkpoint')
parser.add_argument('--notest', action='store_true', help='only test final epoch')
opt = parser.parse_args()
    # check that the config files exist
opt.cfg = check_file(opt.cfg)
opt.data = check_file(opt.data)
opt.hyp = check_file(opt.hyp)
print(opt)
with open(opt.hyp, encoding='utf-8') as f:
hyp = yaml.load(f, Loader=yaml.FullLoader)
print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
tb_writer = SummaryWriter(comment=opt.name)
train(hyp)
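
# A typical invocation (the paths fall back to the defaults above; adjust to your setup):
#   python train.py --epochs 50 --batch-size 4 --device cuda:0 --sgd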