
Commit

Removed unnecessary class parameters; added profiler to track GPU and CPU usage during training of the network.
LovePelmeni committed Jun 13, 2024
1 parent a69d1cd commit fe3fb87
Showing 1 changed file with 26 additions and 30 deletions.
56 changes: 26 additions & 30 deletions src/training/trainers/models.py
@@ -60,14 +60,11 @@ def __init__(self,
                  max_epochs: int,
                  batch_size: int,
                  optimizer: nn.Module,
-                 checkpoint_dir: str,
-                 output_weights_dir: str,
                  log_dir: str,
                  save_every: int,
                  scheduler: nn.Module = None,
                  train_device: typing.Literal['cpu', 'cuda', 'mps'] = 'cpu',
                  loader_num_workers: int = 1,
-                 label_smoothing_eps: float = 0,
                  distributed: bool = False,
                  reproducible: bool = False,
                  seed: int = None
@@ -110,18 +107,31 @@ def __init__(self,
             min_diff=minimum_metric_difference
         )

-        self.label_smoother = regularization.LabelSmoothing(
-            etta=label_smoothing_eps)
-
         self.loss_function = loss_function
         self.eval_metric = eval_metric

         self.loader_num_workers = loader_num_workers
         self.save_every = save_every

-        self.checkpoint_dir = pathlib.Path(checkpoint_dir)
-        self.output_weights_dir = pathlib.Path(output_weights_dir)
         self.log_dir = pathlib.Path(log_dir)
+        self.device_utilization_dir = pathlib.Path(
+            os.path.join(
+                log_dir,
+                "device_utilization"
+            )
+        )
+        self.checkpoint_dir = pathlib.Path(
+            os.path.join(
+                log_dir,
+                "checkpoints"
+            )
+        )
+        self.output_weights_dir = pathlib.Path(
+            os.path.join(
+                log_dir,
+                "weights"
+            )
+        )

         self.network_writer = SummaryWriter(  # logging dir should preferably contain (experiment name, version, timestamp)
             log_dir=os.path.join(log_dir, "network_params"),
@@ -132,6 +142,7 @@ def __init__(self,
         os.makedirs(self.checkpoint_dir, exist_ok=True)
         os.makedirs(self.output_weights_dir, exist_ok=True)
         os.makedirs(self.log_dir, exist_ok=True)
+        os.makedirs(self.device_utilization_dir, exist_ok=True)

     def freeze_layers(self, layers):
         """
@@ -341,34 +352,16 @@ def train(self, train_dataset: data.Dataset):
                 desc='EPOCH %s, BEST_LOSS: %s, CURR_LOSS: %s, EVAL METRIC: %s ' % (
                     (epoch+1), best_loss, current_loss, best_eval_metric)):

-                probs = self.network.to(self.train_device).forward(
-                    imgs.clone().detach().to(self.train_device))
-
-                cpu_probs = probs.cpu()
+                gpu_imgs = imgs.to(self.train_device)
+                raw_logits = self.network.forward(gpu_imgs)

-                # computing one hot distribution
-                one_hots = torch.zeros_like(cpu_probs)
+                loss = self.loss_function(raw_logits, classes)

-                for idx, class_ in enumerate(classes):
-                    one_hots[idx][classes[idx]] = 1
-
-                smoothed_one_hots = torch.as_tensor(
-                    self.label_smoother(one_hots))
-
-                loss = self.loss_function(cpu_probs, smoothed_one_hots)
-
-                epoch_loss.append(loss.item())
+                epoch_loss.append(loss.detach().cpu().item())

                 loss.backward()
                 self.optimizer.step()

-                # flushing output cache
-                cpu_probs.zero_()
-
-                # clearing up dynamic memory
-                gc.collect()
-                torch.cuda.empty_cache()

                 # tracking distributions of network parameters
                 global_step += 1
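Note: the rewritten loop feeds raw logits and integer class labels straight into `self.loss_function`, dropping the manual one-hot construction and the `LabelSmoothing` pass. If smoothing is still wanted, one option (an assumption, not part of this diff) is the built-in `label_smoothing` argument that `torch.nn.CrossEntropyLoss` has offered since PyTorch 1.10:

```python
import torch
import torch.nn as nn

# Minimal sketch: cross-entropy with built-in label smoothing replaces the
# removed manual one-hot smoothing; the eps value 0.1 is illustrative.
loss_function = nn.CrossEntropyLoss(label_smoothing=0.1)

raw_logits = torch.randn(4, 10)            # (batch, num_classes) unnormalized scores
classes = torch.randint(0, 10, (4,))       # (batch,) integer class labels
loss = loss_function(raw_logits, classes)  # scalar loss, ready for loss.backward()
```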

@@ -451,3 +444,6 @@ def evaluate(self, validation_dataset: data.Dataset, slicing=False):
             output_classes
         )
         return metric
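Note: the commit message mentions a profiler for tracking CPU and GPU usage, and the diff adds a `device_utilization` directory for it, but the profiler wiring itself falls outside the hunks shown. A minimal sketch of what such a hook could look like with `torch.profiler`, assuming traces are written under `self.device_utilization_dir`; the helper name `make_utilization_profiler` is hypothetical:

```python
import torch
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

def make_utilization_profiler(device_utilization_dir: str) -> profile:
    """Build a profiler that records CPU and CUDA activity and writes
    TensorBoard traces into the device_utilization directory."""
    return profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        schedule=schedule(wait=1, warmup=1, active=3, repeat=2),  # sample a few steps per cycle
        on_trace_ready=tensorboard_trace_handler(device_utilization_dir),
        profile_memory=True,
    )

# Hypothetical wiring inside train():
#
# with make_utilization_profiler(str(self.device_utilization_dir)) as prof:
#     for imgs, classes in loader:
#         ...  # forward, loss, backward, optimizer step
#         prof.step()  # advance the profiling schedule once per batch
```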


