trainer.py
import math
import random

import torch
from tqdm import tqdm

from transformations.copy_paste import SimpleCopyPaste, copy_paste

class Trainer:
    def __init__(self, model, train_loader, optimizer, lr_scheduler, device, learning_rate):
        self.model = model
        self.train_loader = train_loader
        self.device = device
        self.learning_rate = learning_rate
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        # Note: the training loop below calls the module-level copy_paste
        # function; this instance is not used in this file.
        self.copy_paste = SimpleCopyPaste()
    def train_one_epoch(self, epoch, criterion=None, setup_scheduler=False):
        self.model.train()
        epoch_loss = 0  # Total loss accumulated over this epoch
        epoch_loss_model: dict = {}  # Running total of each loss component, keyed by the model's loss names
        progress_bar = tqdm(total=len(self.train_loader), desc="Training")

        # Linear learning-rate warmup: a warmup scheduler is created only for
        # the first epoch; any scheduler held from earlier calls is discarded.
        self.lr_scheduler = None
        if epoch == 1 and setup_scheduler:
            warmup_factor = 1.0 / 1000  # Start at 0.1% of the base learning rate
            warmup_iters = min(1000, len(self.train_loader) - 1)  # Number of iterations over which the LR ramps up
            self.lr_scheduler = torch.optim.lr_scheduler.LinearLR(
                self.optimizer, start_factor=warmup_factor, total_iters=warmup_iters
            )
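            # LinearLR scales the optimizer's base LR by a factor that grows
            # linearly from start_factor to 1.0 over total_iters scheduler
            # steps. For example (illustrative base LR, not a value from this
            # file): with a base LR of 0.005, the first iteration runs at
            # 0.005 / 1000 = 5e-6 and iteration total_iters runs at the full
            # 0.005; after that the factor stays at 1.0.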

        for batch_idx, batch in enumerate(self.train_loader):
            # Each batch is a pair (images, targets), where:
            # - images is a tensor stacking the batch_size images of the batch
            # - targets is a list of dictionaries, one per image in the batch
            images: torch.Tensor = batch[0]
            targets: list = batch[1]

            # From the 201st epoch onward, apply copy-paste augmentation with probability 25%
            p = random.random()
            if p > 0.75 and epoch > 200:
                images = torch.unbind(images, dim=0)  # Split the batch tensor into a tuple of single images
                res_images, res_targets = copy_paste(images, targets)
                images = torch.stack(res_images, dim=0)  # Re-stack the augmented images into a batch tensor
                targets = res_targets

            images = images.to(self.device)
            targets = [{k: v.to(self.device) for k, v in t.items()} for t in targets]
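            # For torchvision-style detection models, each target dict is
            # expected to hold at least "boxes" (FloatTensor[N, 4] in
            # [x1, y1, x2, y2] format) and "labels" (Int64Tensor[N]);
            # segmentation models additionally use "masks" (UInt8Tensor[N, H, W]).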

            # In training mode the model returns a dictionary of losses
            loss_dict: dict = self.model(images, targets)
            losses = sum(loss for loss in loss_dict.values())  # Sum the individual loss components
            loss_value = losses.item()
            if not math.isfinite(loss_value):
                # Abort before a NaN/inf loss is backpropagated into the weights
                print(f"Loss is {loss_value}, stopping training")
                print(loss_dict)
                raise ValueError(f"Non-finite loss: {loss_value}")

            # Update the running totals for the epoch
            epoch_loss += loss_value
            for key, value in loss_dict.items():
                epoch_loss_model[key] = epoch_loss_model.get(key, 0) + value.item()

            self.optimizer.zero_grad()
            losses.backward()
            self.optimizer.step()
            if self.lr_scheduler is not None:
                self.lr_scheduler.step()

            # Update the progress bar with a summary of the current iteration
            progress_bar_dict = dict(curr_epoch=epoch, curr_batch=batch_idx, loss=loss_value,
                                     avg_loss=epoch_loss / (batch_idx + 1))
            if self.lr_scheduler is not None:
                progress_bar_dict.update(lr=self.lr_scheduler.get_last_lr()[0])
            progress_bar.set_postfix(progress_bar_dict)
            progress_bar.update()

        # End of the epoch: compute the average loss over all batches
        avg_loss = epoch_loss / len(self.train_loader)
        # Clean up and close the progress bar
        progress_bar.close()
        return avg_loss, epoch_loss_model, self.lr_scheduler
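

# --- Usage sketch (not part of the original class) ---
# A minimal, hypothetical example of driving Trainer.train_one_epoch. It
# assumes a torchvision detection model (which returns a loss dict in
# training mode); the dummy dataset, batch size, and hyperparameters below
# are illustrative assumptions, not values from this repository.
if __name__ == "__main__":
    from torch.utils.data import DataLoader, Dataset
    from torchvision.models.detection import maskrcnn_resnet50_fpn

    class DummyDetectionDataset(Dataset):
        # Hypothetical stand-in: random images with a single box and mask,
        # solely to make the sketch self-contained.
        def __len__(self):
            return 8

        def __getitem__(self, idx):
            image = torch.rand(3, 256, 256)
            masks = torch.zeros(1, 256, 256, dtype=torch.uint8)
            masks[0, 32:96, 32:96] = 1
            target = {
                "boxes": torch.tensor([[32.0, 32.0, 96.0, 96.0]]),
                "labels": torch.tensor([1], dtype=torch.int64),
                "masks": masks,
            }
            return image, target

    def collate_fn(batch):
        # Stack images into one tensor (train_one_epoch expects batch[0] to
        # be a tensor) and keep the per-image target dicts as a list.
        images, targets = zip(*batch)
        return torch.stack(images, dim=0), list(targets)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = maskrcnn_resnet50_fpn(weights=None, weights_backbone=None, num_classes=2).to(device)
    train_loader = DataLoader(DummyDetectionDataset(), batch_size=2,
                              shuffle=True, collate_fn=collate_fn)

    learning_rate = 0.005
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,
                                momentum=0.9, weight_decay=0.0005)

    trainer = Trainer(model, train_loader, optimizer, lr_scheduler=None,
                      device=device, learning_rate=learning_rate)
    for epoch in range(1, 3):
        avg_loss, loss_per_component, _ = trainer.train_one_epoch(epoch, setup_scheduler=True)
        print(f"Epoch {epoch}: avg_loss={avg_loss:.4f}")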