# dnn_models.py
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary
import torch.optim as optim
import pandas as pd
import numpy as np
from torchvision.models import resnet34
class xvecTDNN(nn.Module):
    """
    TDNN as defined by https://www.danielpovey.com/files/2015_interspeech_multisplice.pdf
    Context size and dilation determine the frames selected, that is:
    context size 5 and dilation 1 is equivalent to [-2,-1,0,1,2]
    context size 3 and dilation 2 is equivalent to [-2, 0, 2]
    context size 1 and dilation 1 is equivalent to [0]
    A usage sketch with example input shapes follows this class definition.
    """
    def __init__(self, input_dim, n_classes, p_dropout=0):
        super(xvecTDNN, self).__init__()
        self.tdnn1 = nn.Conv1d(in_channels=input_dim, out_channels=512, kernel_size=5, dilation=1)
        self.bn_tdnn1 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_tdnn1 = nn.Dropout(p=p_dropout)
        self.tdnn2 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=5, dilation=2)
        self.bn_tdnn2 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_tdnn2 = nn.Dropout(p=p_dropout)
        self.tdnn3 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=7, dilation=3)
        self.bn_tdnn3 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_tdnn3 = nn.Dropout(p=p_dropout)
        self.tdnn4 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=1, dilation=1)
        self.bn_tdnn4 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_tdnn4 = nn.Dropout(p=p_dropout)
        self.tdnn5 = nn.Conv1d(in_channels=512, out_channels=1500, kernel_size=1, dilation=1)
        self.bn_tdnn5 = nn.BatchNorm1d(1500, momentum=0.1, affine=False)
        self.dropout_tdnn5 = nn.Dropout(p=p_dropout)
        self.fc1 = nn.Linear(3000, 512)
        self.bn_fc1 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_fc1 = nn.Dropout(p=p_dropout)
        self.fc2 = nn.Linear(512, 512)
        self.bn_fc2 = nn.BatchNorm1d(512, momentum=0.1, affine=False)
        self.dropout_fc2 = nn.Dropout(p=p_dropout)
        self.fc3 = nn.Linear(512, n_classes)
    def forward(self, x, eps):
        # Note: x must be (batch_size, feat_dim, chunk_len)
        x = self.dropout_tdnn1(self.bn_tdnn1(F.relu(self.tdnn1(x))))
        x = self.dropout_tdnn2(self.bn_tdnn2(F.relu(self.tdnn2(x))))
        x = self.dropout_tdnn3(self.bn_tdnn3(F.relu(self.tdnn3(x))))
        x = self.dropout_tdnn4(self.bn_tdnn4(F.relu(self.tdnn4(x))))
        x = self.dropout_tdnn5(self.bn_tdnn5(F.relu(self.tdnn5(x))))
        if self.training:
            # Inject Gaussian noise scaled by eps during training.
            # Note: torch.cuda.FloatTensor assumes the model is running on a CUDA device.
            shape = x.size()
            noise = torch.cuda.FloatTensor(shape)
            torch.randn(shape, out=noise)
            x += noise * eps
        # Statistics pooling: concatenate the per-channel mean and std over the time dimension.
        stats = torch.cat((x.mean(dim=2), x.std(dim=2)), dim=1)
        x = self.dropout_fc1(self.bn_fc1(F.relu(self.fc1(stats))))
        x = self.dropout_fc2(self.bn_fc2(F.relu(self.fc2(x))))
        x = self.fc3(x)
        return x
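
# Usage sketch for xvecTDNN (not part of the original model definitions): a minimal
# forward pass assuming 30-dimensional input features and 200-frame chunks; both
# values are illustrative only. The chunk must span at least the total TDNN context
# (31 frames) for the dilated convolutions above to produce a non-empty output.
def _xvec_usage_example():
    model = xvecTDNN(input_dim=30, n_classes=8, p_dropout=0.2)
    model.eval()  # eval mode skips the CUDA-only noise injection in forward()
    features = torch.randn(4, 30, 200)  # (batch_size, feat_dim, chunk_len)
    with torch.no_grad():
        logits = model(features, eps=0.0)
    return logits  # shape: (4, 8)
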
# https://nbviewer.jupyter.org/github/IliaZenkov/transformer_cnn_parallel_audio_classification/blob/main/notebooks/Parallel_is_All_You_Want.ipynb#Conclusion
class TransformerPrime(nn.Module):
    def __init__(self, input_dim, net_output, p_dropout):
        super().__init__()
        ################ TRANSFORMER BLOCK #############################
        # maxpool the input feature map/tensor to the transformer
        # a rectangular kernel worked better here for the rectangular input spectrogram feature map/tensor
        self.transformer_maxpool = nn.MaxPool2d(kernel_size=[1, 4], stride=[1, 4])
        # define a single transformer encoder layer:
        # self-attention + feedforward network from the "Attention Is All You Need" paper,
        # with 4 attention heads and an input_dim-->512-->input_dim feedforward network
        transformer_layer = nn.TransformerEncoderLayer(
            d_model=input_dim,    # input feature (frequency) dim after max-pooling, e.g. 40*282 -> 40*70 (MFCC*time)
            nhead=4,              # 4 attention heads in each multi-head self-attention layer
            dim_feedforward=512,  # 2 linear layers in each encoder layer's feedforward network: dim input_dim-->512-->input_dim
            dropout=p_dropout,
            activation='relu'     # ReLU: avoid saturation/tame gradient/reduce compute time
        )
        # Use 4 instead of the 6 identical stacked encoder layers of the Attention Is All You Need paper:
        # the complete transformer block contains 4 full transformer encoder layers
        # (each w/ multi-head self-attention + feedforward)
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=4)
        ############### 1ST PARALLEL 2D CONVOLUTION BLOCK ############
        # 3 sequential conv2D layers: (1,40,282) --> (16, 20, 141) -> (32, 5, 35) -> (64, 1, 8)
        self.conv2Dblock1 = nn.Sequential(
            # 1st 2D convolution layer
            nn.Conv2d(
                in_channels=1,    # input volume depth == input channel dim == 1
                out_channels=16,  # expand output feature map volume's depth to 16
                kernel_size=3,    # typical 3*3 stride 1 kernel
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(16),  # batch normalize the output feature map before activation
            nn.ReLU(),           # feature map --> activation map
            nn.MaxPool2d(kernel_size=2, stride=2),  # typical maxpool kernel size
            nn.Dropout(p=0.3),   # randomly zero 30% of 1st layer's output feature map in training
            # 2nd 2D convolution layer identical to last except output dim, maxpool kernel
            nn.Conv2d(
                in_channels=16,
                out_channels=32,  # expand output feature map volume's depth to 32
                kernel_size=3,
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),  # increase maxpool kernel for subsequent filters
            nn.Dropout(p=0.3),
            # 3rd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels=32,
                out_channels=64,  # expand output feature map volume's depth to 64
                kernel_size=3,
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3),
        )
        ############### 2ND PARALLEL 2D CONVOLUTION BLOCK ############
        # 3 sequential conv2D layers: (1,40,282) --> (16, 20, 141) -> (32, 5, 35) -> (64, 1, 8)
        self.conv2Dblock2 = nn.Sequential(
            # 1st 2D convolution layer
            nn.Conv2d(
                in_channels=1,    # input volume depth == input channel dim == 1
                out_channels=16,  # expand output feature map volume's depth to 16
                kernel_size=3,    # typical 3*3 stride 1 kernel
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(16),  # batch normalize the output feature map before activation
            nn.ReLU(),           # feature map --> activation map
            nn.MaxPool2d(kernel_size=2, stride=2),  # typical maxpool kernel size
            nn.Dropout(p=0.3),   # randomly zero 30% of 1st layer's output feature map in training
            # 2nd 2D convolution layer identical to last except output dim, maxpool kernel
            nn.Conv2d(
                in_channels=16,
                out_channels=32,  # expand output feature map volume's depth to 32
                kernel_size=3,
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),  # increase maxpool kernel for subsequent filters
            nn.Dropout(p=0.3),
            # 3rd 2D convolution layer identical to last except output dim
            nn.Conv2d(
                in_channels=32,
                out_channels=64,  # expand output feature map volume's depth to 64
                kernel_size=3,
                stride=1,
                padding=1
            ),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3),
        )
        ################# FINAL LINEAR BLOCK ####################
        # Final linear layer takes the concatenated embedding tensor from the parallel
        # 2D convolutional and transformer blocks and outputs net_output logits.
        # In the original notebook, each convolution block outputs a 64*1*8 embedding flattened to a dim-512 1D array,
        # and the transformer block outputs a 40*70 feature map that is time-averaged to a dim-40 1D array,
        # giving 512*2+40 == 1064 input features; the hard-coded 424 here corresponds to this model's
        # (shorter) input, i.e. the two flattened conv embeddings plus the input_dim-sized transformer embedding
        # (e.g. 2*192 + 40 == 424 for 40-dim features).
        # self.fc1_linear = nn.Linear(512 * 2 + 40, num_classes)
        self.fc1_linear = nn.Linear(424, net_output)
        ### Sigmoid activation on the output logits from the final FC linear layer
        # (the original notebook used a softmax over the output classes instead)
        # self.softmax_out = nn.Softmax(dim=1)  # dim==1 is the freq embedding
        self.last_out = nn.Sigmoid()
    # define one complete parallel fwd pass of the input feature tensor thru 2*conv + 1*transformer blocks
    def forward(self, x):
        # 1st parallel Conv2D block: 3 convolutional layers ############################
        # create the final feature embedding from the 1st convolutional block:
        # input features passed through 3 sequential 2D convolutional layers
        conv2d_embedding1 = self.conv2Dblock1(x)  # x == N/batch * channel * freq * time
        # flatten the final 64*1*time feature map from the convolutional layers to a 1D array,
        # skipping the 1st (N/batch) dimension when flattening
        conv2d_embedding1 = torch.flatten(conv2d_embedding1, start_dim=1)
        # 2nd parallel Conv2D block: 3 convolutional layers #############################
        # create the final feature embedding from the 2nd convolutional block:
        # input features passed through 3 sequential 2D convolutional layers
        conv2d_embedding2 = self.conv2Dblock2(x)  # x == N/batch * channel * freq * time
        # flatten the final 64*1*time feature map from the convolutional layers to a 1D array,
        # skipping the 1st (N/batch) dimension when flattening
        conv2d_embedding2 = torch.flatten(conv2d_embedding2, start_dim=1)
        # 4-encoder-layer Transformer block w/ input_dim-->512-->input_dim feedfwd network ##############
        # maxpool the input feature map over time with a 1*4 kernel, e.g. 1*40*282 --> 1*40*70
        x_maxpool = self.transformer_maxpool(x)
        # remove the channel dim: 1*40*70 --> 40*70
        x_maxpool_reduced = torch.squeeze(x_maxpool, 1)
        # convert the maxpooled feature map format: batch * freq * time ---> time * batch * freq,
        # because the transformer encoder layer requires a tensor in format: time * batch * embedding (freq)
        x = x_maxpool_reduced.permute(2, 0, 1)
        # finally, pass the reduced input feature map x into the transformer encoder layers
        transformer_output = self.transformer_encoder(x)
        # create the final feature embedding from the transformer block by taking the mean over the time dimension (now dim 0):
        # the transformer outputs a time * batch * freq tensor, so the time average gives one freq-dim embedding per example
        transformer_embedding = torch.mean(transformer_output, dim=0)  # e.g. 70*40 --> 40 per example
        # concatenate freq embeddings from the convolutional and transformer blocks ######
        # concatenate the embedding tensors output by the parallel 2*conv and 1*transformer blocks
        # print(conv2d_embedding1.shape)
        # print(conv2d_embedding2.shape)
        # print(transformer_embedding.shape)
        complete_embedding = torch.cat([conv2d_embedding1, conv2d_embedding2, transformer_embedding], dim=1)
        # print(complete_embedding.shape)
        # final FC linear layer, need logits for the loss #########################
        output_logits = self.fc1_linear(complete_embedding)
        # Final sigmoid layer: map the logits from the FC linear layer to per-class probabilities ######
        # output_softmax = self.softmax_out(output_logits)
        output = self.last_out(output_logits)
        # return the raw logits to compute the loss and the sigmoid probabilities to predict the class(es)
        return output_logits, output
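
# Smoke test sketch for TransformerPrime (not part of the original model definitions),
# assuming 40-dimensional spectrogram features and 100 time frames; these dims are
# chosen so the concatenated embedding (2 * 64*1*3 conv features + 40 transformer
# features) matches the hard-coded 424 input of fc1_linear. The feature shape used
# to train this model may differ.
if __name__ == "__main__":
    model = TransformerPrime(input_dim=40, net_output=8, p_dropout=0.1)
    model.eval()  # eval mode so BatchNorm/Dropout behave deterministically
    spec = torch.randn(2, 1, 40, 100)  # (batch, channel, freq, time)
    with torch.no_grad():
        logits, probs = model(spec)
    print(logits.shape, probs.shape)  # expected: torch.Size([2, 8]) twice
    # The xvecTDNN usage sketch defined above can be exercised the same way:
    print(_xvec_usage_example().shape)  # expected: torch.Size([4, 8])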