Commit

Merge branch 'main' into improvements-to-logging

GabrielBG0 authored May 13, 2024
2 parents 49b17f1 + c85b6e6 commit 6dd8ceb
Showing 10 changed files with 1,088 additions and 15 deletions.
8 changes: 7 additions & 1 deletion .github/workflows/python-app.yml
@@ -1,13 +1,19 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

-name: Python application
+name: Continuous Testing

on:
  push:
    branches: ["main"]
    paths-ignore:
      - '**/README.md'
      - '**/LICENSE'
  pull_request:
    branches: ["main"]
    paths-ignore:
      - '**/README.md'
      - '**/LICENSE'

permissions:
  contents: read
5 changes: 4 additions & 1 deletion README.md
@@ -1,4 +1,7 @@
# Minerva
[![Continuous Test](https://github.com/discovery-unicamp/Minerva/actions/workflows/python-app.yml/badge.svg)](https://github.com/discovery-unicamp/Minerva/actions/workflows/python-app.yml)



Minerva is a framework for training machine learning models for researchers.

@@ -26,7 +29,7 @@ from minerva.analysis.metrics import PixelAccuracy

## License

-This project is licensed under the MIT License. See the LICENSE file for details.
+This project is licensed under the MIT License. See the [LICENSE](https://github.com/discovery-unicamp/Minerva/blob/main/LICENSE) file for details.

## Contact

67 changes: 59 additions & 8 deletions minerva/models/nets/setr.py
@@ -241,6 +241,8 @@ def __init__(
        conv_norm: nn.Module,
        conv_act: nn.Module,
        align_corners: bool,
        aux_output: bool = False,
        aux_output_layers: list[int] | None = None,
    ):
        """
        Initializes the SETR PUP model.
@@ -286,6 +288,15 @@ def __init__(
        """
        super().__init__()
        if aux_output:
            assert aux_output_layers is not None, "aux_output_layers must be provided."
            assert (
                len(aux_output_layers) == 3
            ), "aux_output_layers must have 3 values. Only 3 aux heads are supported."

        self.aux_output = aux_output
        self.aux_output_layers = aux_output_layers

        self.encoder = _VisionTransformerBackbone(
            image_size=image_size,
            patch_size=patch_size,
@@ -295,6 +306,8 @@ def __init__(
            mlp_dim=mlp_dim,
            num_classes=num_classes,
            dropout=encoder_dropout,
            aux_output=aux_output,
            aux_output_layers=aux_output_layers,
        )

        self.decoder = _SETRUPHead(
@@ -357,15 +370,19 @@ def __init__(
            norm_layer=norm_layer,
        )

-    def forward(
-        self, x: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    def forward(self, x: torch.Tensor):
+
+        if self.aux_output:
+            x, aux_results = self.encoder(x)
+            x_aux1 = self.aux_head1(aux_results[0])
+            x_aux2 = self.aux_head2(aux_results[1])
+            x_aux3 = self.aux_head3(aux_results[2])
+            x = self.decoder(x)
+            return x, x_aux1, x_aux2, x_aux3
+
        x = self.encoder(x)
-        # x_aux1 = self.aux_head1(x)
-        # x_aux2 = self.aux_head2(x)
-        # x_aux3 = self.aux_head3(x)
        x = self.decoder(x)
-        return x, torch.zeros(1), torch.zeros(1), torch.zeros(1)
+        return x
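
Since `forward` now returns either a single tensor or a 4-tuple depending on `aux_output`, downstream code has to branch on the return type. A minimal sketch of that handling (the helper name is ours, not part of this commit):

```python
import torch
from typing import Tuple, Union

def unpack_setr_output(
    out: Union[torch.Tensor, Tuple[torch.Tensor, ...]]
) -> torch.Tensor:
    """Return the main segmentation logits whether or not aux heads are enabled."""
    if isinstance(out, tuple):  # aux_output=True: (main, aux1, aux2, aux3)
        return out[0]
    return out  # aux_output=False: main logits only
```

This mirrors the `isinstance(y_hat, tuple)` dispatch that `_loss_func` below uses.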


class MetricTypeSetR(Enum):
@@ -422,6 +439,9 @@ def __init__(
        val_metrics: Optional[nn.Module] = None,
        log_test_metrics: bool = False,
        test_metrics: Optional[nn.Module] = None,
        aux_output: bool = True,
        aux_output_layers: list[int] | None = [9, 14, 19],
        aux_weights: list[float] = [0.3, 0.3, 0.3],
    ):
        """
        Initializes the SetR model.
@@ -479,7 +499,18 @@ def __init__(
            conv_norm if conv_norm is not None else nn.SyncBatchNorm(decoder_channels)
        )
        conv_act = conv_act if conv_act is not None else nn.ReLU()

        if aux_output:
            assert aux_output_layers is not None, "aux_output_layers must be provided."
            assert (
                len(aux_output_layers) == 3
            ), "aux_output_layers must have 3 values. Only 3 aux heads are supported."
            assert len(aux_weights) == len(
                aux_output_layers
            ), "aux_weights must have the same length as aux_output_layers."

        self.num_classes = num_classes
        self.aux_weights = aux_weights

        self.log_train_metrics = log_train_metrics
        self.log_val_metrics = log_val_metrics
@@ -522,6 +553,8 @@ def __init__(
            norm_layer=norm_layer,
            interpolate_mode=interpolate_mode,
            align_corners=align_corners,
            aux_output=aux_output,
            aux_output_layers=aux_output_layers,
        )

        self.train_step_outputs = []
@@ -536,7 +569,13 @@ def __init__(
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.model(x)

-    def _loss_func(self, y_hat: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    def _loss_func(
+        self,
+        y_hat: (
+            torch.Tensor | Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
+        ),
+        y: torch.Tensor,
+    ) -> torch.Tensor:
        """Calculate the loss between the output and the input data.

        Parameters
@@ -551,6 +590,18 @@ def _loss_func(self, y_hat: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        torch.Tensor
            The loss value.
        """
        if isinstance(y_hat, tuple):
            y_hat, y_aux1, y_aux2, y_aux3 = y_hat
            loss = self.loss_fn(y_hat, y.long())
            loss_aux1 = self.loss_fn(y_aux1, y.long())
            loss_aux2 = self.loss_fn(y_aux2, y.long())
            loss_aux3 = self.loss_fn(y_aux3, y.long())
            return (
                loss
                + (loss_aux1 * self.aux_weights[0])
                + (loss_aux2 * self.aux_weights[1])
                + (loss_aux3 * self.aux_weights[2])
            )
        loss = self.loss_fn(y_hat, y.long())
        return loss
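
With auxiliary outputs enabled, the effective training objective is the main loss plus the weighted sum of the three auxiliary losses; in this diff's notation (weights defaulting to 0.3 each):

```latex
\mathcal{L}_{\text{total}} = \mathcal{L}(\hat{y}, y) + \sum_{i=1}^{3} w_i \, \mathcal{L}(\hat{y}^{\text{aux}}_i, y)
```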

91 changes: 89 additions & 2 deletions minerva/models/nets/vit.py
@@ -8,11 +8,69 @@
from torchvision.models.vision_transformer import (
    Conv2dNormActivation,
    ConvStemConfig,
-    Encoder,
+    EncoderBlock,
    _log_api_usage_once,
)


class _Encoder(nn.Module):
    """Transformer Model Encoder for sequence to sequence translation."""

    def __init__(
        self,
        seq_length: int,
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        mlp_dim: int,
        dropout: float,
        attention_dropout: float,
        aux_output: bool = False,
        aux_output_layers: List[int] | None = None,
        norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
    ):
        super().__init__()
        # Note that batch_size is on the first dim because
        # we have batch_first=True in nn.MultiheadAttention() by default

        self.aux_output = aux_output
        self.aux_output_layers = aux_output_layers

        self.pos_embedding = nn.Parameter(
            torch.empty(1, seq_length, hidden_dim).normal_(std=0.02)
        )  # from BERT
        self.dropout = nn.Dropout(dropout)
        layers: OrderedDict[str, nn.Module] = OrderedDict()
        for i in range(num_layers):
            layers[f"encoder_layer_{i}"] = EncoderBlock(
                num_heads,
                hidden_dim,
                mlp_dim,
                dropout,
                attention_dropout,
                norm_layer,
            )
        self.layers = nn.Sequential(layers)
        self.ln = norm_layer(hidden_dim)

    def forward(self, input: torch.Tensor):
        torch._assert(
            input.dim() == 3,
            f"Expected (batch_size, seq_length, hidden_dim) got {input.shape}",
        )
        input = input + self.pos_embedding

        if self.aux_output:
            aux_outputs = []
            for i, layer in enumerate(self.layers):
                input = layer(input)
                if i in self.aux_output_layers:  # type: ignore
                    aux_outputs.append(self.ln(self.dropout(input)))
            return self.ln(self.dropout(input)), aux_outputs

        return self.ln(self.layers(self.dropout(input)))
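
A minimal sketch of the layer-tapping contract this adds (ViT-Large-like sizes chosen purely for illustration; assumes `_Encoder` is importable from this module):

```python
import torch
from minerva.models.nets.vit import _Encoder

enc = _Encoder(
    seq_length=197,  # 14 x 14 patches + 1 class token
    num_layers=24,
    num_heads=16,
    hidden_dim=1024,
    mlp_dim=4096,
    dropout=0.0,
    attention_dropout=0.0,
    aux_output=True,
    aux_output_layers=[9, 14, 19],
)
tokens = torch.randn(2, 197, 1024)  # (batch, seq_length, hidden_dim)
out, aux = enc(tokens)
print(out.shape)  # torch.Size([2, 197, 1024])
print(len(aux))   # 3 -- one tensor per tapped layer
```

Note that each auxiliary tensor is taken after `self.dropout` and `self.ln`, so its shape always matches the final output.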


class _VisionTransformerBackbone(nn.Module):
    """Vision Transformer as per https://arxiv.org/abs/2010.11929."""

@@ -27,6 +85,8 @@ def __init__(
        dropout: float = 0.0,
        attention_dropout: float = 0.0,
        num_classes: int = 1000,
        aux_output: bool = False,
        aux_output_layers: List[int] | None = None,
        norm_layer: Callable[..., torch.nn.Module] = partial(nn.LayerNorm, eps=1e-6),
        conv_stem_configs: Optional[List[ConvStemConfig]] = None,
    ):
@@ -65,6 +125,12 @@ def __init__(
        super().__init__()
        _log_api_usage_once(self)

        if aux_output:
            assert aux_output_layers is not None
            assert all(
                0 <= i < num_layers for i in aux_output_layers
            ), "Invalid layer index in aux_output_layers"

        if isinstance(image_size, int):
            torch._assert(
                image_size % patch_size == 0, "Input shape indivisible by patch size!"
@@ -83,6 +149,8 @@ def __init__(
        self.dropout = dropout
        self.num_classes = num_classes
        self.norm_layer = norm_layer
        self.aux_output = aux_output
        self.aux_output_layers = aux_output_layers

        if conv_stem_configs is not None:
            # As per https://arxiv.org/abs/2106.14881
@@ -125,7 +193,7 @@ def __init__(
        self.class_token = nn.Parameter(torch.zeros(1, 1, hidden_dim))
        seq_length += 1

-        self.encoder = Encoder(
+        self.encoder = _Encoder(
            seq_length=seq_length,
            num_layers=num_layers,
            num_heads=num_heads,
@@ -134,6 +202,8 @@ def __init__(
            dropout=dropout,
            attention_dropout=attention_dropout,
            norm_layer=norm_layer,
            aux_output=aux_output,
            aux_output_layers=aux_output_layers,
        )
        self.seq_length = seq_length

@@ -196,6 +266,7 @@ def _process_input(self, x: torch.Tensor) -> tuple[torch.Tensor, int, int]:
        n_w = w // p

        # (n, c, h, w) -> (n, hidden_dim, n_h, n_w)
        x = x.to(torch.float32)
        x = self.conv_proj(x)
        # (n, hidden_dim, n_h, n_w) -> (n, hidden_dim, (n_h * n_w))
        x = x.reshape(n, self.hidden_dim, n_h * n_w)
@@ -225,6 +296,22 @@ def forward(self, x: torch.Tensor):
        batch_class_token = self.class_token.expand(n, -1, -1)
        x = torch.cat([batch_class_token, x], dim=1)

        if self.aux_output:
            x, aux_outputs = self.encoder(x)
            x = x[:, 1:]
            B, _, C = x.shape
            x = x.reshape(B, n_h, n_w, C).permute(0, 3, 1, 2).contiguous()
            for i, aux_output in enumerate(aux_outputs):
                aux_outputs[i] = aux_output[:, 1:]
                B, _, C = aux_outputs[i].shape
                aux_outputs[i] = (
                    aux_outputs[i]
                    .reshape(B, n_h, n_w, C)
                    .permute(0, 3, 1, 2)
                    .contiguous()
                )
            return x, aux_outputs

        x = self.encoder(x)

        # Classifier "token" as used by standard language architectures
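
For reference, a shape sketch of the new auxiliary path through the backbone (constructor arguments are inferred from the diff context and should be treated as assumptions, not the verified signature):

```python
import torch
from minerva.models.nets.vit import _VisionTransformerBackbone

backbone = _VisionTransformerBackbone(
    image_size=512,
    patch_size=16,
    num_layers=24,
    num_heads=16,
    hidden_dim=1024,
    mlp_dim=4096,
    aux_output=True,
    aux_output_layers=[9, 14, 19],
)
x = torch.randn(2, 3, 512, 512)
feat, aux = backbone(x)  # class token stripped, tokens reshaped to a spatial map
print(feat.shape)                     # expected: torch.Size([2, 1024, 32, 32]), since 512/16 = 32
print([tuple(a.shape) for a in aux])  # three maps with the same shape as feat
```

This is what lets the decoder and the three auxiliary heads in `setr.py` consume 2D feature maps directly.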
