
Commit

Removed unnecessary class parameters; added profiler to track GPU and CPU usage during training of the network.
LovePelmeni committed Jun 13, 2024
1 parent a69d1cd commit fe3fb87
Showing 1 changed file with 26 additions and 30 deletions.
56 changes: 26 additions & 30 deletions src/training/trainers/models.py
@@ -60,14 +60,11 @@ def __init__(self,
                  max_epochs: int,
                  batch_size: int,
                  optimizer: nn.Module,
-                 checkpoint_dir: str,
-                 output_weights_dir: str,
                  log_dir: str,
                  save_every: int,
                  scheduler: nn.Module = None,
                  train_device: typing.Literal['cpu', 'cuda', 'mps'] = 'cpu',
                  loader_num_workers: int = 1,
-                 label_smoothing_eps: float = 0,
                  distributed: bool = False,
                  reproducible: bool = False,
                  seed: int = None
@@ -110,18 +107,31 @@ def __init__(self,
             min_diff=minimum_metric_difference
         )

-        self.label_smoother = regularization.LabelSmoothing(
-            etta=label_smoothing_eps)
-
         self.loss_function = loss_function
         self.eval_metric = eval_metric

         self.loader_num_workers = loader_num_workers
         self.save_every = save_every

-        self.checkpoint_dir = pathlib.Path(checkpoint_dir)
-        self.output_weights_dir = pathlib.Path(output_weights_dir)
         self.log_dir = pathlib.Path(log_dir)
+        self.device_utilization_dir = pathlib.Path(
+            os.path.join(
+                log_dir,
+                "device_utilization"
+            )
+        )
+        self.checkpoint_dir = pathlib.Path(
+            os.path.join(
+                log_dir,
+                "checkpoints"
+            )
+        )
+        self.output_weights_dir = pathlib.Path(
+            os.path.join(
+                log_dir,
+                "weights"
+            )
+        )

         self.network_writer = SummaryWriter(  # logging dir should preferably contain (experiment name, version, timestamp)
             log_dir=os.path.join(log_dir, "network_params"),
@@ -132,6 +142,7 @@ def __init__(self,
         os.makedirs(self.checkpoint_dir, exist_ok=True)
         os.makedirs(self.output_weights_dir, exist_ok=True)
         os.makedirs(self.log_dir, exist_ok=True)
+        os.makedirs(self.device_utilization_dir, exist_ok=True)

     def freeze_layers(self, layers):
         """
@@ -341,34 +352,16 @@ def train(self, train_dataset: data.Dataset):
                 desc='EPOCH %s, BEST_LOSS: %s, CURR_LOSS: %s, EVAL METRIC: %s ' % (
                     (epoch+1), best_loss, current_loss, best_eval_metric)):

-                probs = self.network.to(self.train_device).forward(
-                    imgs.clone().detach().to(self.train_device))
-
-                cpu_probs = probs.cpu()
+                gpu_imgs = imgs.to(self.train_device)
+                raw_logits = self.network.forward(gpu_imgs)

-                # computing one hot distribution
-                one_hots = torch.zeros_like(cpu_probs)
+                loss = self.loss_function(raw_logits, classes)

-                for idx, class_ in enumerate(classes):
-                    one_hots[idx][classes[idx]] = 1
-
-                smoothed_one_hots = torch.as_tensor(
-                    self.label_smoother(one_hots))
-
-                loss = self.loss_function(cpu_probs, smoothed_one_hots)
-
-                epoch_loss.append(loss.item())
+                epoch_loss.append(loss.detach().cpu().item())

                 loss.backward()
                 self.optimizer.step()

-                # flushing output cache
-                cpu_probs.zero_()
-
-                # clearing up dynamic memory
-                gc.collect()
-                torch.cuda.empty_cache()

                 # tracking distributions of network parameters
                 global_step += 1
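Note: the rewritten loop feeds raw logits and integer class labels straight into `self.loss_function`, dropping the manual one-hot construction and the `LabelSmoothing` pass. If smoothing is still wanted, one option (an assumption, not part of this diff) is the built-in `label_smoothing` argument that `torch.nn.CrossEntropyLoss` has offered since PyTorch 1.10:

```python
import torch
import torch.nn as nn

# Minimal sketch: cross-entropy with built-in label smoothing replaces the
# removed manual one-hot smoothing; the eps value 0.1 is illustrative.
loss_function = nn.CrossEntropyLoss(label_smoothing=0.1)

raw_logits = torch.randn(4, 10)            # (batch, num_classes) unnormalized scores
classes = torch.randint(0, 10, (4,))       # (batch,) integer class labels
loss = loss_function(raw_logits, classes)  # scalar loss, ready for loss.backward()
```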

@@ -451,3 +444,6 @@ def evaluate(self, validation_dataset: data.Dataset, slicing=False):
             output_classes
         )
         return metric
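Note: the commit message mentions a profiler for tracking CPU and GPU usage, and the diff adds a `device_utilization` directory for it, but the profiler wiring itself falls outside the hunks shown. A minimal sketch of what such a hook could look like with `torch.profiler`, assuming traces are written under `self.device_utilization_dir`; the helper name `make_utilization_profiler` is hypothetical:

```python
import torch
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

def make_utilization_profiler(device_utilization_dir: str) -> profile:
    """Build a profiler that records CPU and CUDA activity and writes
    TensorBoard traces into the device_utilization directory."""
    return profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        schedule=schedule(wait=1, warmup=1, active=3, repeat=2),  # sample a few steps per cycle
        on_trace_ready=tensorboard_trace_handler(device_utilization_dir),
        profile_memory=True,
    )

# Hypothetical wiring inside train():
#
# with make_utilization_profiler(str(self.device_utilization_dir)) as prof:
#     for imgs, classes in loader:
#         ...  # forward, loss, backward, optimizer step
#         prof.step()  # advance the profiling schedule once per batch
```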


