# model.py

# Copyright 2018 Antoine Miech All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch as th
from loupe import NetVLAD
import numpy as np
from torch.autograd import Function

class Net(nn.Module):
    """Text/video similarity network: NetVLAD pooling in front of an MEE head.

    Audio descriptors and text descriptors are each passed through their own
    ``NetVLAD`` module; the ``face``, ``motion`` and ``visual`` inputs are
    forwarded to the ``MEE`` head unchanged.

    Args:
        video_modality_dim: mapping handed to ``MEE``. Only
            ``video_modality_dim['audio'][1]`` is read here, as the feature
            size of the audio ``NetVLAD``.
        text_dim: feature size of the text ``NetVLAD``.
        audio_cluster: cluster count of the audio ``NetVLAD``.
        text_cluster: cluster count of the text ``NetVLAD``.
    """

    def __init__(self, video_modality_dim, text_dim, audio_cluster=8, text_cluster=32):
        super(Net, self).__init__()

        audio_feature_size = video_modality_dim['audio'][1]
        self.audio_pooling = NetVLAD(feature_size=audio_feature_size,
                                     cluster_size=audio_cluster)
        self.text_pooling = NetVLAD(feature_size=text_dim,
                                    cluster_size=text_cluster)

        # The MEE head consumes the pooled text, so it is sized by the
        # pooling module's reported output dimension.
        pooled_text_dim = self.text_pooling.out_dim
        self.mee = MEE(video_modality_dim, pooled_text_dim)

    def forward(self, text, video, ind, conf=True):
        """Pool audio and text, then return whatever the MEE head returns.

        Args:
            text: input of the text pooling module.
            video: mapping with at least the keys ``'audio'``, ``'face'``,
                ``'motion'`` and ``'visual'``; any other key is ignored.
            ind: forwarded untouched to the MEE head.
            conf: forwarded untouched to the MEE head.
        """
        # A fresh dict is built so the caller's mapping is left alone. The
        # insertion order matters: the MEE head walks this dict in order.
        aggregated_video = {
            'audio': self.audio_pooling(video['audio']),
            'face': video['face'],
            'motion': video['motion'],
            'visual': video['visual'],
        }

        pooled_text = self.text_pooling(text)

        return self.mee(pooled_text, aggregated_video, ind, conf)

    def get_moe_scores(self, text):
        """Return the MEE head's mixture scores for the pooled ``text``."""
        return self.mee.get_moe_scores(self.text_pooling(text))

class Net2(nn.Module):
    """Single joint-embedding baseline for text/video similarity.

    Text goes through a ``NetVLAD`` pooling module and a
    ``Gated_Embedding_Unit``; the video side concatenates the L2-normalised
    ``visual`` and ``motion`` features with the L2-normalised max over dim 1
    of the ``audio`` features and embeds the result with a second
    ``Gated_Embedding_Unit``.

    Args:
        embd_dim: output size of both embedding units.
        video_modality_dim: input size of the video embedding unit.
        text_dim: feature size of the text ``NetVLAD``.
        gating: passed to both ``Gated_Embedding_Unit`` instances.
        text_cluster: cluster count of the text ``NetVLAD``.
    """

    def __init__(self, embd_dim,  video_modality_dim, text_dim, gating=True, text_cluster=32):
        super(Net2, self).__init__()

        self.text_pooling = NetVLAD(feature_size=text_dim, cluster_size=text_cluster)
        pooled_text_dim = self.text_pooling.out_dim
        self.embd_text = Gated_Embedding_Unit(pooled_text_dim, embd_dim, gating=gating)
        self.embd_video = Gated_Embedding_Unit(video_modality_dim, embd_dim, gating=gating)
        # Not used by forward() (audio is max-pooled there); it is still
        # registered, so its parameters are part of the module.
        self.audio_pooling = NetVLAD(feature_size=128, cluster_size=16)

    def forward(self, text, video, conf=True):
        """Score texts against videos.

        Args:
            text: input of the text pooling module.
            video: mapping with the keys ``'visual'``, ``'motion'`` and
                ``'audio'``.
            conf: when truthy, return the matrix product of the text
                embeddings with the transposed video embeddings; otherwise
                return the row-wise dot product of the two embeddings.
        """
        visual_part = F.normalize(video['visual'])
        motion_part = F.normalize(video['motion'])
        # th.max(..., dim=1) returns (values, indices); only values are used.
        audio_part = F.normalize(th.max(video['audio'], dim=1)[0])
        video_features = th.cat((visual_part, motion_part, audio_part), dim=1)

        text_embd = self.embd_text(self.text_pooling(text))
        video_embd = self.embd_video(video_features)

        if not conf:
            return (text_embd * video_embd).sum(dim=-1)
        return th.matmul(text_embd, video_embd.transpose(0, 1))


class MEE(nn.Module):
    """Mixture of embedding experts scoring texts against videos.

    One ``Gated_Embedding_Unit`` pair (video side / text side) is created per
    modality, and a linear layer followed by a softmax turns the text into
    one mixture weight per modality. The final similarity is the weighted sum
    of the per-modality dot products, with the weights masked by ``ind`` and
    re-normalised so they sum to one over the modalities.

    Args:
        video_modality_dim: mapping ``modality -> (input_dim, output_dim)``.
            ``input_dim`` sizes the video-side unit; ``output_dim`` is the
            embedding size shared by the video-side and text-side units.
        text_dim: input size of the text-side units and of the mixture layer.
    """

    def __init__(self, video_modality_dim, text_dim):
        super(MEE, self).__init__()

        modalities = list(video_modality_dim.keys())

        # Order of the experts inside video_GU / text_GU.
        self.m = modalities

        self.video_GU = nn.ModuleList([
            Gated_Embedding_Unit(video_modality_dim[name][0],
                                 video_modality_dim[name][1])
            for name in modalities])

        self.text_GU = nn.ModuleList([
            Gated_Embedding_Unit(text_dim, video_modality_dim[name][1])
            for name in modalities])

        self.moe_fc = nn.Linear(text_dim, len(video_modality_dim))

    def get_moe_scores(self, text):
        """Return the softmax (over dim 1) of the mixture layer for ``text``."""
        return F.softmax(self.moe_fc(text), dim=1)

    def forward(self, text, video, ind, conf=True):
        """Compute mixture-weighted text/video similarities.

        Args:
            text: 2-D tensor, one row per text.
            video: mapping ``modality -> tensor`` with one row per video. It
                is NOT modified. Column ``i`` of the mixture weights is
                paired with the ``i``-th key of ``video`` in iteration order
                (not with ``self.m[i]``), so callers must keep that order
                stable between training and evaluation.
            ind: mapping ``modality -> per-video indicator`` (tensor, array,
                list or scalar) that multiplies that modality's weight;
                presumably 1 where the modality is available and 0 otherwise
                -- confirm against the data loader.
            conf: when truthy, return the ``(n_text, n_video)`` similarity
                matrix; otherwise return the ``n`` scores of the aligned
                pairs ``(text[k], video[k])``.

        Returns:
            A tensor on the same device and with the same dtype as the
            mixture weights. A video whose indicators are all zero yields
            NaN scores (division by a zero weight sum).
        """
        # Shallow copy: preserves the caller's key order without overwriting
        # the caller's entries with embeddings.
        video_embd = dict(video)
        for name, unit in zip(self.m, self.video_GU):
            video_embd[name] = unit(video[name])

        text_embd = {}
        for name, unit in zip(self.m, self.text_GU):
            text_embd[name] = unit(text)

        # MOE weights computation + normalization ------------
        moe_weights = self.get_moe_scores(text)  # (n_text, n_modalities)

        names = list(video_embd)
        n_video = len(video_embd[names[0]]) if names else len(text)

        # Built next to moe_weights (same device / dtype) instead of going
        # through numpy and an unconditional .cuda().
        available_m = moe_weights.new_zeros((n_video, moe_weights.size(1)))
        for i, name in enumerate(names):
            available_m[:, i] = th.as_tensor(ind[name],
                                             dtype=moe_weights.dtype,
                                             device=moe_weights.device)

        if conf:
            # (1, n_video, M) * (n_text, 1, M) -> (n_text, n_video, M)
            pair_weights = available_m[None, :, :] * moe_weights[:, None, :]
            pair_weights = pair_weights / pair_weights.sum(dim=2, keepdim=True)
            # MOE weights computation + normalization ------ DONE

            conf_matrix = moe_weights.new_zeros((len(text), n_video))
            for i, name in enumerate(names):
                similarity = th.matmul(text_embd[name],
                                       video_embd[name].transpose(0, 1))
                conf_matrix = conf_matrix + pair_weights[:, :, i] * similarity

            return conf_matrix

        # Aligned pairs only need the diagonal of the weight tensor, so it is
        # computed directly: (n, M) * (n, M), normalised over the modalities.
        pair_weights = available_m * moe_weights
        pair_weights = pair_weights / pair_weights.sum(dim=1, keepdim=True)
        # MOE weights computation + normalization ------ DONE

        scores = moe_weights.new_zeros(len(text))
        for i, name in enumerate(names):
            weighted = pair_weights[:, i, None] * text_embd[name] * video_embd[name]
            scores = scores + th.sum(weighted, dim=-1)

        return scores

class Gated_Embedding_Unit(nn.Module):
    """Linear projection, optional context gating, then L2 normalisation.

    Args:
        input_dimension: input size of the linear layer.
        output_dimension: output size of the linear layer and size of the
            ``Context_Gating`` module.
        gating: when falsy, the ``Context_Gating`` module is skipped in
            ``forward`` (it is still constructed and registered).
    """

    def __init__(self, input_dimension, output_dimension, gating=True):
        super(Gated_Embedding_Unit, self).__init__()

        self.fc = nn.Linear(input_dimension, output_dimension)
        self.cg = Context_Gating(output_dimension)
        self.gating = gating

    def forward(self, x):
        """Return the normalised (and optionally gated) projection of ``x``."""
        projected = self.fc(x)
        gated = self.cg(projected) if self.gating else projected
        # F.normalize is called with its defaults, as in the rest of the file.
        return F.normalize(gated)


class Context_Gating(nn.Module):
    """Gate a feature vector with a learned function of itself.

    ``forward`` concatenates the input with a linear (optionally
    batch-normalised) transform of the input along dim 1 and applies
    ``F.glu`` over that dim.

    Args:
        dimension: size of dim 1 of the input; the linear layer is square.
        add_batch_norm: when truthy, the linear output goes through
            ``BatchNorm1d`` before gating. The batch-norm module is created
            either way.
    """

    def __init__(self, dimension, add_batch_norm=True):
        super(Context_Gating, self).__init__()
        self.fc = nn.Linear(dimension, dimension)
        self.add_batch_norm = add_batch_norm
        self.batch_norm = nn.BatchNorm1d(dimension)

    def forward(self, x):
        """Return the gated version of ``x`` (same shape as ``x``)."""
        gate_input = self.fc(x)
        if self.add_batch_norm:
            gate_input = self.batch_norm(gate_input)

        # GLU splits its input in two halves along dim 1: the first half (x)
        # is multiplied by the sigmoid of the second half (gate_input).
        stacked = th.cat((x, gate_input), 1)
        return F.glu(stacked, 1)