Skip to content

Commit

Permalink
Merge pull request #14 from goel42/master
Browse files Browse the repository at this point in the history
Migrated codebase to Python 3. Works correctly and produces results similar to the Python 2 version. All checks passed.
  • Loading branch information
antoine77340 authored Jul 21, 2020
2 parents d3c0c9c + 84c3b2b commit a53979f
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 195 deletions.
125 changes: 61 additions & 64 deletions LSMDC.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,44 +13,46 @@
# limitations under the License.



import torch as th
from torch.utils.data import Dataset
import numpy as np
import os
import math
import numpy as np
import os
import math
import random


class LSMDC(Dataset):
"""LSMDC dataset."""

def __init__(self, clip_path, text_features, audio_features, flow_path, face_path, coco_visual_path='../X_train2014_resnet152.npy' ,coco_text_path='../w2v_coco_train2014_1.npy', coco=True, max_words=30, video_features_size=2048, text_features_size=300, audio_features_size=128, face_features_size=128, flow_features_size=1024,verbose=False):
def __init__(self, clip_path, text_features, audio_features, flow_path, face_path,
coco_visual_path='../X_train2014_resnet152.npy', coco_text_path='../w2v_coco_train2014_1.npy',
coco=True, max_words=30, video_features_size=2048, text_features_size=300, audio_features_size=128,
face_features_size=128, flow_features_size=1024, verbose=False):
"""
Args:
"""

self.visual_features = np.load(clip_path)
self.flow_features = np.load(flow_path)
self.face_features = np.load(face_path)
self.audio_features = np.load(audio_features)
self.text_features = np.load(text_features)

self.audio_features = np.load(audio_features, encoding='latin1')
self.text_features = np.load(text_features, encoding='latin1')

audio_sizes = map(len,self.audio_features)
audio_sizes = list(map(len, self.audio_features))
self.audio_sizes = np.array(audio_sizes)

self.video_features_size = video_features_size
self.text_features_size = text_features_size
self.audio_features_size = audio_features_size
self.flow_features_size = flow_features_size
self.face_features_size = face_features_size

self.max_len_text = max_words
text_sizes = map(len,self.text_features)

text_sizes = list(map(len, self.text_features))
self.text_sizes = np.array(text_sizes)
self.text_sizes = self.text_sizes.astype(int)

mask = self.text_sizes > 0

self.text_features = self.text_features[mask]
Expand All @@ -60,49 +62,45 @@ def __init__(self, clip_path, text_features, audio_features, flow_path, face_pat
self.face_features = self.face_features[mask]
self.audio_features = self.audio_features[mask]
self.audio_sizes = self.audio_sizes[mask]
self.audio_sizes.astype(int)
self.audio_sizes.astype(int)

self.max_len_audio = max(self.audio_sizes)

audio_tensors = np.zeros((len(self.audio_features),
max(self.audio_sizes), self.audio_features[0].shape[1]))
max(self.audio_sizes), self.audio_features[0].shape[1]))

for j in range(len(self.audio_features)):
audio_tensors[j,0:self.audio_sizes[j],:] = self.audio_features[j]

audio_tensors[j, 0:self.audio_sizes[j], :] = self.audio_features[j]

if coco:
# adding coco data
coco_visual = np.load(coco_visual_path)
coco_text = np.load(coco_text_path)


self.n_lsmdc = len(self.visual_features)
self.n_coco = len(coco_visual)

self.visual_features = np.concatenate((self.visual_features, coco_visual), axis=0)
self.text_features = np.concatenate((self.text_features, coco_text), axis=0)

text_sizes = map(len,self.text_features)
text_sizes = list(map(len, self.text_features))
self.text_sizes = np.array(text_sizes)
self.text_sizes = self.text_sizes.astype(int)
self.coco_ind = np.zeros((self.n_lsmdc+self.n_coco))
self.coco_ind = np.zeros((self.n_lsmdc + self.n_coco))
self.coco_ind[self.n_lsmdc:] = 1
else:
self.n_lsmdc = len(self.visual_features)
self.coco_ind = np.zeros((self.n_lsmdc))


text_tensors = np.zeros((len(self.text_features),
max_words, self.text_features[0].shape[1]))

max_words, self.text_features[0].shape[1]))

for j in range(len(self.text_features)):
if self.text_sizes[j] > max_words:
text_tensors[j] = self.text_features[j][0:max_words,:]
if self.text_sizes[j] > max_words:
text_tensors[j] = self.text_features[j][0:max_words, :]
else:
text_tensors[j,0:self.text_sizes[j],:] = self.text_features[j]
text_tensors[j, 0:self.text_sizes[j], :] = self.text_features[j]

self.text_features = th.from_numpy(text_tensors)
self.text_features = self.text_features.float()

Expand All @@ -117,7 +115,7 @@ def __init__(self, clip_path, text_features, audio_features, flow_path, face_pat

self.face_features = th.from_numpy(self.face_features)
self.face_features = self.face_features.float()

def __len__(self):
return len(self.text_features)

Expand All @@ -128,7 +126,7 @@ def __getitem__(self, idx):
if idx >= self.n_lsmdc:
flow = th.zeros(self.flow_features_size)
face = th.zeros(self.face_features_size)
audio = th.zeros(self.audio_features.size()[1],self.audio_features_size)
audio = th.zeros(self.audio_features.size()[1], self.audio_features_size)
audio_size = 1
face_ind = 0
else:
Expand All @@ -139,7 +137,7 @@ def __getitem__(self, idx):

if th.sum(face) == 0:
face_ind = 0
return {'video': self.visual_features[idx],
return {'video': self.visual_features[idx],
'flow': flow,
'face': face,
'text': self.text_features[idx],
Expand All @@ -150,27 +148,32 @@ def __getitem__(self, idx):
'text_size': self.text_sizes[idx]
}


def getVideoFeatureSize(self):
return self.video_features_size

def getTextFeatureSize(self):
return self.text_features_size

def getAudioFeatureSize(self):
return self.audio_features_size

def getFlowFeatureSize(self):
return self.flow_features_size

def getText(self):
return self.text_features


def shorteningTextTensor(self,text_features, text_sizes):
def shorteningTextTensor(self, text_features, text_sizes):
m = int(max(text_sizes))
return text_features[:,0:m,:]
return text_features[:, 0:m, :]


class LSMDC_qcm(Dataset):
"""LSMDC dataset."""

def __init__(self, clip_path, text_features, audio_features, flow_path, face_path, max_words=30, video_features_size=2048, text_features_size=300, audio_features_size=128, face_features_size=128, flow_features_size=1024):
def __init__(self, clip_path, text_features, audio_features, flow_path, face_path, max_words=30,
video_features_size=2048, text_features_size=300, audio_features_size=128, face_features_size=128,
flow_features_size=1024):
"""
Args:
"""
Expand All @@ -179,43 +182,40 @@ def __init__(self, clip_path, text_features, audio_features, flow_path, face_pat
self.face_features = np.load(face_path)
self.audio_features = np.load(audio_features)
self.text_features = np.load(text_features)
print 'features loaded'
print('features loaded')

audio_sizes = map(len,self.audio_features)
audio_sizes = list(map(len, self.audio_features))
self.audio_sizes = np.array(audio_sizes)

self.video_features_size = video_features_size
self.text_features_size = text_features_size
self.audio_features_size = audio_features_size
self.flow_features_size = flow_features_size
self.face_features_size = face_features_size

self.max_len_text = max_words
text_sizes = map(len,self.text_features)

text_sizes = list(map(len, self.text_features))
self.text_sizes = np.array(text_sizes)
self.text_sizes = self.text_sizes.astype(int)



self.max_len_audio = max(self.audio_sizes)


audio_tensors = np.zeros((len(self.audio_features),
max(self.audio_sizes), self.audio_features[0].shape[1]))
max(self.audio_sizes), self.audio_features[0].shape[1]))

for j in range(len(self.audio_features)):
audio_tensors[j,0:self.audio_sizes[j],:] = self.audio_features[j]
audio_tensors[j, 0:self.audio_sizes[j], :] = self.audio_features[j]

text_tensors = np.zeros((len(self.text_features),
max_words, self.text_features[0].shape[1]))

max_words, self.text_features[0].shape[1]))

for j in range(len(self.text_features)):
if self.text_sizes[j] > max_words:
text_tensors[j] = self.text_features[j][0:max_words,:]
if self.text_sizes[j] > max_words:
text_tensors[j] = self.text_features[j][0:max_words, :]
else:
text_tensors[j,0:self.text_sizes[j],:] = self.text_features[j]
text_tensors[j, 0:self.text_sizes[j], :] = self.text_features[j]

self.text_features = th.from_numpy(text_tensors)
self.text_features = self.text_features.float()

Expand All @@ -231,14 +231,11 @@ def __init__(self, clip_path, text_features, audio_features, flow_path, face_pat
self.face_features = th.from_numpy(self.face_features)
self.face_features = self.face_features.float()


def __len__(self):
return len(self.visual_features)



def __getitem__(self, tidx):

idx, idx2 = tidx

face_ind = 1
Expand All @@ -251,7 +248,7 @@ def __getitem__(self, tidx):
if th.sum(face) == 0:
face_ind = 0

return {'video': self.visual_features[idx],
return {'video': self.visual_features[idx],
'flow': flow,
'face': face,
'text': self.text_features[idx2],
Expand All @@ -261,18 +258,18 @@ def __getitem__(self, tidx):
'text_size': self.text_sizes[idx2]
}


def getVideoFeatureSize(self):
return self.video_features_size

def getTextFeatureSize(self):
return self.text_features_size

def getAudioFeatureSize(self):
return self.audio_features_size

def getFlowFeatureSize(self):
return self.flow_features_size


def shorteningTextTensor(self,text_features, text_sizes):
def shorteningTextTensor(self, text_features, text_sizes):
m = int(max(text_sizes))
return text_features[:,0:m,:]

return text_features[:, 0:m, :]
20 changes: 10 additions & 10 deletions MSRVTT.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ class MSRVTT(Dataset):
"""LSMDC dataset."""

def __init__(self, visual_features, flow_features, text_features, audio_features,
face_features, train_list, test_list, coco_visual_path='data/X_train2014_resnet152.npy',
coco_text_path='data/w2v_coco_train2014_1.npy',coco=True, max_words=30,verbose=False):
face_features, train_list, test_list, coco_visual_path='data/data/X_train2014_resnet152.npy',
coco_text_path='data/data/w2v_coco_train2014_1.npy',coco=True, max_words=30,verbose=False):
"""
Args:
"""
self.max_words = max_words
print 'loading data ...'
print ('loading data ...')

with open(train_list) as f:
self.train_list = f.readlines()
Expand All @@ -46,26 +46,26 @@ def __init__(self, visual_features, flow_features, text_features, audio_features


pickle_in = open(visual_features,'rb')
self.visual_features = pickle.load(pickle_in)
self.visual_features = pickle.load(pickle_in, encoding='latin1')

pickle_in = open(flow_features,'rb')
self.flow_features = pickle.load(pickle_in)
self.flow_features = pickle.load(pickle_in, encoding='latin1')

pickle_in = open(audio_features,'rb')
self.audio_features = pickle.load(pickle_in)
self.audio_features = pickle.load(pickle_in, encoding='latin1')

pickle_in = open(text_features,'rb')
self.text_features = pickle.load(pickle_in)
self.text_features = pickle.load(pickle_in, encoding='latin1')

pickle_in = open(face_features,'rb')
self.face_features = pickle.load(pickle_in)
self.face_features = pickle.load(pickle_in, encoding='latin1')

self.coco = coco

if coco:
# adding coco data
self.coco_visual = np.load(coco_visual_path)
self.coco_text = np.load(coco_text_path)
self.coco_text = np.load(coco_text_path, encoding='latin1')

self.n_MSR = len(self.train_list)
self.n_coco = len(self.coco_visual)
Expand Down Expand Up @@ -111,7 +111,7 @@ def __init__(self, visual_features, flow_features, text_features, audio_features
self.face_retrieval = th.from_numpy(self.face_retrieval).float()
self.text_retrieval = th.from_numpy(self.text_retrieval).float()

print 'done'
print ('done')

def collate_data(self, data):
video_tensor = np.zeros((len(data), 2048))
Expand Down
2 changes: 1 addition & 1 deletion loupe.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def forward(self,x):
vlad = F.normalize(vlad)

# flattening + L2 norm
vlad = vlad.view(-1, self.cluster_size*self.feature_size)
vlad = vlad.reshape(-1, self.cluster_size*self.feature_size)
vlad = F.normalize(vlad)

return vlad
Expand Down
2 changes: 1 addition & 1 deletion model.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ class MEE(nn.Module):
def __init__(self, video_modality_dim, text_dim):
super(MEE, self).__init__()

m = video_modality_dim.keys()
m = list(video_modality_dim.keys())

self.m = m

Expand Down
Loading

0 comments on commit a53979f

Please sign in to comment.