Commit

Merge branch 'main' into improvements-to-logging

GabrielBG0 authored May 13, 2024
2 parents 49b17f1 + c85b6e6 commit 6dd8ceb
Showing 10 changed files with 1,088 additions and 15 deletions.
8 changes: 7 additions & 1 deletion .github/workflows/python-app.yml
@@ -1,13 +1,19 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

-name: Python application
+name: Continuous Testing

on:
  push:
    branches: ["main"]
    paths-ignore:
      - '**/README.md'
      - '**/LICENSE'
  pull_request:
    branches: ["main"]
    paths-ignore:
      - '**/README.md'
      - '**/LICENSE'

permissions:
  contents: read
5 changes: 4 additions & 1 deletion README.md
@@ -1,4 +1,7 @@
# Minerva
[![Continuous Test](https://github.com/discovery-unicamp/Minerva/actions/workflows/python-app.yml/badge.svg)](https://github.com/discovery-unicamp/Minerva/actions/workflows/python-app.yml)



Minerva is a framework for training machine learning models for researchers.

@@ -26,7 +29,7 @@ from minerva.analysis.metrics import PixelAccuracy

## License

-This project is licensed under the MIT License. See the LICENSE file for details.
+This project is licensed under the MIT License. See the [LICENSE](https://github.com/discovery-unicamp/Minerva/blob/main/LICENSE) file for details.

## Contact

67 changes: 59 additions & 8 deletions minerva/models/nets/setr.py
@@ -241,6 +241,8 @@ def __init__(
        conv_norm: nn.Module,
        conv_act: nn.Module,
        align_corners: bool,
        aux_output: bool = False,
        aux_output_layers: list[int] | None = None,
    ):
        """
        Initializes the SETR PUP model.
@@ -286,6 +288,15 @@ def __init__(
        """
        super().__init__()
        if aux_output:
            assert aux_output_layers is not None, "aux_output_layers must be provided."
            assert (
                len(aux_output_layers) == 3
            ), "aux_output_layers must have 3 values. Only 3 aux heads are supported."

        self.aux_output = aux_output
        self.aux_output_layers = aux_output_layers

        self.encoder = _VisionTransformerBackbone(
            image_size=image_size,
            patch_size=patch_size,
@@ -295,6 +306,8 @@ def __init__(
            mlp_dim=mlp_dim,
            num_classes=num_classes,
            dropout=encoder_dropout,
            aux_output=aux_output,
            aux_output_layers=aux_output_layers,
        )

        self.decoder = _SETRUPHead(
@@ -357,15 +370,19 @@ def __init__(
            norm_layer=norm_layer,
        )

-    def forward(
-        self, x: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    def forward(self, x: torch.Tensor):
+
+        if self.aux_output:
+            x, aux_results = self.encoder(x)
+            x_aux1 = self.aux_head1(aux_results[0])
+            x_aux2 = self.aux_head2(aux_results[1])
+            x_aux3 = self.aux_head3(aux_results[2])
+            x = self.decoder(x)
+            return x, x_aux1, x_aux2, x_aux3
+
        x = self.encoder(x)
-        # x_aux1 = self.aux_head1(x)
-        # x_aux2 = self.aux_head2(x)
-        # x_aux3 = self.aux_head3(x)
        x = self.decoder(x)
-        return x, torch.zeros(1), torch.zeros(1), torch.zeros(1)
+        return x
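
Since `forward` now returns either a single tensor or a 4-tuple depending on `aux_output`, downstream code has to branch on the return type. A minimal sketch of that handling (the helper name is ours, not part of this commit):

```python
import torch
from typing import Tuple, Union

def unpack_setr_output(
    out: Union[torch.Tensor, Tuple[torch.Tensor, ...]]
) -> torch.Tensor:
    """Return the main segmentation logits whether or not aux heads are enabled."""
    if isinstance(out, tuple):  # aux_output=True: (main, aux1, aux2, aux3)
        return out[0]
    return out  # aux_output=False: main logits only
```

This mirrors the `isinstance(y_hat, tuple)` dispatch that `_loss_func` below uses.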


class MetricTypeSetR(Enum):
@@ -422,6 +439,9 @@ def __init__(
        val_metrics: Optional[nn.Module] = None,
        log_test_metrics: bool = False,
        test_metrics: Optional[nn.Module] = None,
        aux_output: bool = True,
        aux_output_layers: list[int] | None = [9, 14, 19],
        aux_weights: list[float] = [0.3, 0.3, 0.3],
    ):
        """
        Initializes the SetR model.
@@ -479,7 +499,18 @@ def __init__(
            conv_norm if conv_norm is not None else nn.SyncBatchNorm(decoder_channels)
        )
        conv_act = conv_act if conv_act is not None else nn.ReLU()

        if aux_output:
            assert aux_output_layers is not None, "aux_output_layers must be provided."
            assert (
                len(aux_output_layers) == 3
            ), "aux_output_layers must have 3 values. Only 3 aux heads are supported."
            assert len(aux_weights) == len(
                aux_output_layers
            ), "aux_weights must have the same length as aux_output_layers."

        self.num_classes = num_classes
        self.aux_weights = aux_weights

        self.log_train_metrics = log_train_metrics
        self.log_val_metrics = log_val_metrics
@@ -522,6 +553,8 @@ def __init__(
            norm_layer=norm_layer,
            interpolate_mode=interpolate_mode,
            align_corners=align_corners,
            aux_output=aux_output,
            aux_output_layers=aux_output_layers,
        )

        self.train_step_outputs = []
@@ -536,7 +569,13 @@ def __init__(
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.model(x)

-    def _loss_func(self, y_hat: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    def _loss_func(
+        self,
+        y_hat: (
+            torch.Tensor | Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
+        ),
+        y: torch.Tensor,
+    ) -> torch.Tensor:
        """Calculate the loss between the output and the input data.

        Parameters
@@ -551,6 +590,18 @@ def _loss_func(self, y_hat: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        torch.Tensor
            The loss value.
        """
        if isinstance(y_hat, tuple):
            y_hat, y_aux1, y_aux2, y_aux3 = y_hat
            loss = self.loss_fn(y_hat, y.long())
            loss_aux1 = self.loss_fn(y_aux1, y.long())
            loss_aux2 = self.loss_fn(y_aux2, y.long())
            loss_aux3 = self.loss_fn(y_aux3, y.long())
            return (
                loss
                + (loss_aux1 * self.aux_weights[0])
                + (loss_aux2 * self.aux_weights[1])
                + (loss_aux3 * self.aux_weights[2])
            )
        loss = self.loss_fn(y_hat, y.long())
        return loss
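
With auxiliary outputs enabled, the effective training objective is the main loss plus the weighted sum of the three auxiliary losses; in this diff's notation (weights defaulting to 0.3 each):

```latex
\mathcal{L}_{\text{total}} = \mathcal{L}(\hat{y}, y) + \sum_{i=1}^{3} w_i \, \mathcal{L}(\hat{y}^{\text{aux}}_i, y)
```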

91 changes: 89 additions & 2 deletions minerva/models/nets/vit.py
@@ -8,11 +8,69 @@
from torchvision.models.vision_transformer import (
    Conv2dNormActivation,
    ConvStemConfig,
-    Encoder,
+    EncoderBlock,
    _log_api_usage_once,
)


class _Encoder(nn.Module):
    """Transformer Model Encoder for sequence to sequence translation."""

    def __init__(
        self,
        seq_length: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        mlp_dim: int,
        dropout: float,
        attention_dropout: float,
        aux_output: bool = False,
        aux_output_layers: List[int] | None = None,
        norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
    ):
        super().__init__()
        # Note that batch_size is on the first dim because
        # we have batch_first=True in nn.MultiheadAttention() by default

        self.aux_output = aux_output
        self.aux_output_layers = aux_output_layers

        self.pos_embedding = nn.Parameter(
            torch.empty(1, seq_length, hidden_dim).normal_(std=0.02)
        )  # from BERT
        self.dropout = nn.Dropout(dropout)
        layers: OrderedDict[str, nn.Module] = OrderedDict()
        for i in range(num_layers):
            layers[f"encoder_layer_{i}"] = EncoderBlock(
                num_heads,
                hidden_dim,
                mlp_dim,
                dropout,
                attention_dropout,
                norm_layer,
            )
        self.layers = nn.Sequential(layers)
        self.ln = norm_layer(hidden_dim)

    def forward(self, input: torch.Tensor):
        torch._assert(
            input.dim() == 3,
            f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}",
        )
        input = input + self.pos_embedding

        if self.aux_output:
            aux_outputs = []
            for i, layer in enumerate(self.layers):
                input = layer(input)
                if i in self.aux_output_layers:  # type: ignore
                    aux_outputs.append(self.ln(self.dropout(input)))
            return self.ln(self.dropout(input)), aux_outputs

        return self.ln(self.layers(self.dropout(input)))
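
A minimal sketch of the layer-tapping contract this adds (ViT-Large-like sizes chosen purely for illustration; assumes `_Encoder` is importable from this module):

```python
import torch
from minerva.models.nets.vit import _Encoder

enc = _Encoder(
    seq_length=197,  # 14 x 14 patches + 1 class token
    num_layers=24,
    num_heads=16,
    hidden_dim=1024,
    mlp_dim=4096,
    dropout=0.0,
    attention_dropout=0.0,
    aux_output=True,
    aux_output_layers=[9, 14, 19],
)
tokens = torch.randn(2, 197, 1024)  # (batch, seq_length, hidden_dim)
out, aux = enc(tokens)
print(out.shape)  # torch.Size([2, 197, 1024])
print(len(aux))   # 3 -- one tensor per tapped layer
```

Note that each auxiliary tensor is taken after `self.dropout` and `self.ln`, so its shape always matches the final output.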


class _VisionTransformerBackbone(nn.Module):
    """Vision Transformer as per https://arxiv.org/abs/2010.11929."""

@@ -27,6 +85,8 @@ def __init__(
        dropout: float = 0.0,
        attention_dropout: float = 0.0,
        num_classes: int = 1000,
        aux_output: bool = False,
        aux_output_layers: List[int] | None = None,
        norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
        conv_stem_configs: Optional[List[ConvStemConfig]] = None,
    ):
@@ -65,6 +125,12 @@ def __init__(
        super().__init__()
        _log_api_usage_once(self)

        if aux_output:
            assert aux_output_layers is not None
            assert all(
                0 <= i < num_layers for i in aux_output_layers
            ), "Invalid layer index in aux_output_layers"

        if isinstance(image_size, int):
            torch._assert(
                image_size % patch_size == 0, "Input shape indivisible by patch size!"
@@ -83,6 +149,8 @@ def __init__(
        self.dropout = dropout
        self.num_classes = num_classes
        self.norm_layer = norm_layer
        self.aux_output = aux_output
        self.aux_output_layers = aux_output_layers

        if conv_stem_configs is not None:
            # As per https://arxiv.org/abs/2106.14881
@@ -125,7 +193,7 @@ def __init__(
        self.class_token = nn.Parameter(torch.zeros(1, 1, hidden_dim))
        seq_length += 1

-        self.encoder = Encoder(
+        self.encoder = _Encoder(
            seq_length=seq_length,
            num_layers=num_layers,
            num_heads=num_heads,
@@ -134,6 +202,8 @@ def __init__(
            dropout=dropout,
            attention_dropout=attention_dropout,
            norm_layer=norm_layer,
            aux_output=aux_output,
            aux_output_layers=aux_output_layers,
        )
        self.seq_length = seq_length

@@ -196,6 +266,7 @@ def _process_input(self, x: torch.Tensor) -> tuple[torch.Tensor, int, int]:
        n_w = w // p

        # (n, c, h, w) -> (n, hidden_dim, n_h, n_w)
        x = x.to(torch.float32)
        x = self.conv_proj(x)
        # (n, hidden_dim, n_h, n_w) -> (n, hidden_dim, (n_h * n_w))
        x = x.reshape(n, self.hidden_dim, n_h * n_w)
@@ -225,6 +296,22 @@ def forward(self, x: torch.Tensor):
        batch_class_token = self.class_token.expand(n, -1, -1)
        x = torch.cat([batch_class_token, x], dim=1)

        if self.aux_output:
            x, aux_outputs = self.encoder(x)
            x = x[:, 1:]
            B, _, C = x.shape
            x = x.reshape(B, n_h, n_w, C).permute(0, 3, 1, 2).contiguous()
            for i, aux_output in enumerate(aux_outputs):
                aux_outputs[i] = aux_output[:, 1:]
                B, _, C = aux_outputs[i].shape
                aux_outputs[i] = (
                    aux_outputs[i]
                    .reshape(B, n_h, n_w, C)
                    .permute(0, 3, 1, 2)
                    .contiguous()
                )
            return x, aux_outputs

        x = self.encoder(x)

        # Classifier "token" as used by standard language architectures
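
For reference, a shape sketch of the new auxiliary path through the backbone (constructor arguments are inferred from the diff context and should be treated as assumptions, not the verified signature):

```python
import torch
from minerva.models.nets.vit import _VisionTransformerBackbone

backbone = _VisionTransformerBackbone(
    image_size=512,
    patch_size=16,
    num_layers=24,
    num_heads=16,
    hidden_dim=1024,
    mlp_dim=4096,
    aux_output=True,
    aux_output_layers=[9, 14, 19],
)
x = torch.randn(2, 3, 512, 512)
feat, aux = backbone(x)  # class token stripped, tokens reshaped to a spatial map
print(feat.shape)                     # expected: torch.Size([2, 1024, 32, 32]), since 512/16 = 32
print([tuple(a.shape) for a in aux])  # three maps with the same shape as feat
```

This is what lets the decoder and the three auxiliary heads in `setr.py` consume 2D feature maps directly.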
