From 9ee29cdcdbc2683cdd03ea33a40e9bec88781b3a Mon Sep 17 00:00:00 2001
From: Anujraaj Goyal
Date: Mon, 10 Feb 2020 15:40:38 +0800
Subject: [PATCH 1/2] [AG] Migration to Py3 and pytorch compatibility issues
 fixed

---
 LSMDC.py  | 125 ++++++++++++++---------------
 MSRVTT.py |   4 +-
 loupe.py  |   2 +-
 model.py  |   2 +-
 train.py  | 232 ++++++++++++++++++++++++++----------------------
 5 files changed, 178 insertions(+), 187 deletions(-)

diff --git a/LSMDC.py b/LSMDC.py
index 2523ffa..e518abf 100644
--- a/LSMDC.py
+++ b/LSMDC.py
@@ -13,18 +13,21 @@
 # limitations under the License.
 
-
 import torch as th
 from torch.utils.data import Dataset
-import numpy as np
-import os
-import math
+import numpy as np
+import os
+import math
 import random
 
+
 class LSMDC(Dataset):
     """LSMDC dataset."""
 
-    def __init__(self, clip_path, text_features, audio_features, flow_path, face_path, coco_visual_path='../X_train2014_resnet152.npy' ,coco_text_path='../w2v_coco_train2014_1.npy', coco=True, max_words=30, video_features_size=2048, text_features_size=300, audio_features_size=128, face_features_size=128, flow_features_size=1024,verbose=False):
+    def __init__(self, clip_path, text_features, audio_features, flow_path, face_path,
+                 coco_visual_path='../X_train2014_resnet152.npy', coco_text_path='../w2v_coco_train2014_1.npy',
+                 coco=True, max_words=30, video_features_size=2048, text_features_size=300, audio_features_size=128,
+                 face_features_size=128, flow_features_size=1024, verbose=False):
         """
         Args:
         """
@@ -32,11 +35,10 @@ def __init__(self, clip_path, text_features, audio_features, flow_path, face_pat
         self.visual_features = np.load(clip_path)
         self.flow_features = np.load(flow_path)
         self.face_features = np.load(face_path)
-        self.audio_features = np.load(audio_features)
-        self.text_features = np.load(text_features)
-
+        self.audio_features = np.load(audio_features, encoding='latin1')
+        self.text_features = np.load(text_features, encoding='latin1')
 
-        audio_sizes = map(len,self.audio_features)
+        audio_sizes = list(map(len, self.audio_features))
         self.audio_sizes = np.array(audio_sizes)
 
         self.video_features_size = video_features_size
@@ -44,13 +46,13 @@ def __init__(self, clip_path, text_features, audio_features, flow_path, face_pat
         self.audio_features_size = audio_features_size
         self.flow_features_size = flow_features_size
         self.face_features_size = face_features_size
-        
+
         self.max_len_text = max_words
-        
-        text_sizes = map(len,self.text_features)
+
+        text_sizes = list(map(len, self.text_features))
         self.text_sizes = np.array(text_sizes)
         self.text_sizes = self.text_sizes.astype(int)
-        
+
         mask = self.text_sizes > 0
 
         self.text_features = self.text_features[mask]
@@ -60,49 +62,45 @@ def __init__(self, clip_path, text_features, audio_features, flow_path, face_pat
         self.face_features = self.face_features[mask]
         self.audio_features = self.audio_features[mask]
         self.audio_sizes = self.audio_sizes[mask]
-        self.audio_sizes.astype(int)
-        
+        self.audio_sizes.astype(int)
+
         self.max_len_audio = max(self.audio_sizes)
-        
+
         audio_tensors = np.zeros((len(self.audio_features),
-            max(self.audio_sizes), self.audio_features[0].shape[1]))
+                                  max(self.audio_sizes), self.audio_features[0].shape[1]))
 
         for j in range(len(self.audio_features)):
-            audio_tensors[j,0:self.audio_sizes[j],:] = self.audio_features[j]
-
+            audio_tensors[j, 0:self.audio_sizes[j], :] = self.audio_features[j]
 
         if coco:
             # adding coco data
             coco_visual = np.load(coco_visual_path)
             coco_text = np.load(coco_text_path)
-
             self.n_lsmdc = len(self.visual_features)
             self.n_coco = len(coco_visual)
-            
+
             self.visual_features = np.concatenate((self.visual_features, coco_visual), axis=0)
             self.text_features = np.concatenate((self.text_features, coco_text), axis=0)
 
-            text_sizes = map(len,self.text_features)
+            text_sizes = list(map(len, self.text_features))
             self.text_sizes = np.array(text_sizes)
             self.text_sizes = self.text_sizes.astype(int)
 
-            self.coco_ind = np.zeros((self.n_lsmdc+self.n_coco))
+            self.coco_ind = np.zeros((self.n_lsmdc + self.n_coco))
             self.coco_ind[self.n_lsmdc:] = 1
         else:
             self.n_lsmdc = len(self.visual_features)
             self.coco_ind = np.zeros((self.n_lsmdc))
 
         text_tensors = np.zeros((len(self.text_features),
-            max_words, self.text_features[0].shape[1]))
-
+                                 max_words, self.text_features[0].shape[1]))
 
         for j in range(len(self.text_features)):
-            if self.text_sizes[j] > max_words:
-                text_tensors[j] = self.text_features[j][0:max_words,:]
+            if self.text_sizes[j] > max_words:
+                text_tensors[j] = self.text_features[j][0:max_words, :]
             else:
-                text_tensors[j,0:self.text_sizes[j],:] = self.text_features[j]
-
+                text_tensors[j, 0:self.text_sizes[j], :] = self.text_features[j]
+
         self.text_features = th.from_numpy(text_tensors)
         self.text_features = self.text_features.float()
@@ -117,7 +115,7 @@ def __init__(self, clip_path, text_features, audio_features, flow_path, face_pat
 
         self.face_features = th.from_numpy(self.face_features)
         self.face_features = self.face_features.float()
-        
+
     def __len__(self):
         return len(self.text_features)
 
@@ -128,7 +126,7 @@ def __getitem__(self, idx):
         if idx >= self.n_lsmdc:
             flow = th.zeros(self.flow_features_size)
             face = th.zeros(self.face_features_size)
-            audio = th.zeros(self.audio_features.size()[1],self.audio_features_size)
+            audio = th.zeros(self.audio_features.size()[1], self.audio_features_size)
             audio_size = 1
             face_ind = 0
         else:
@@ -139,7 +137,7 @@ def __getitem__(self, idx):
         if th.sum(face) == 0:
             face_ind = 0
 
-        return {'video': self.visual_features[idx],
+        return {'video': self.visual_features[idx],
                 'flow': flow,
                 'face': face,
                 'text': self.text_features[idx],
@@ -150,27 +148,32 @@ def __getitem__(self, idx):
                 'text_size': self.text_sizes[idx]
                 }
 
-
     def getVideoFeatureSize(self):
         return self.video_features_size
 
+
     def getTextFeatureSize(self):
         return self.text_features_size
 
+
     def getAudioFeatureSize(self):
         return self.audio_features_size
 
+
     def getFlowFeatureSize(self):
         return self.flow_features_size
 
+
     def getText(self):
         return self.text_features
 
-
-    def shorteningTextTensor(self,text_features, text_sizes):
+    def shorteningTextTensor(self, text_features, text_sizes):
         m = int(max(text_sizes))
-        return text_features[:,0:m,:]
+        return text_features[:, 0:m, :]
+
 
 class LSMDC_qcm(Dataset):
     """LSMDC dataset."""
 
-    def __init__(self, clip_path, text_features, audio_features, flow_path, face_path, max_words=30, video_features_size=2048, text_features_size=300, audio_features_size=128, face_features_size=128, flow_features_size=1024):
+    def __init__(self, clip_path, text_features, audio_features, flow_path, face_path, max_words=30,
+                 video_features_size=2048, text_features_size=300, audio_features_size=128, face_features_size=128,
+                 flow_features_size=1024):
         """
         Args:
         """
@@ -179,9 +182,9 @@ def __init__(self, clip_path, text_features, audio_features, flow_path, face_pat
         self.face_features = np.load(face_path)
         self.audio_features = np.load(audio_features)
         self.text_features = np.load(text_features)
-        print 'features loaded'
+        print('features loaded')
 
-        audio_sizes = map(len,self.audio_features)
+        audio_sizes = list(map(len, self.audio_features))
         self.audio_sizes = np.array(audio_sizes)
 
         self.video_features_size = video_features_size
@@ -189,33 +192,30 @@ def __init__(self, clip_path, text_features, audio_features, flow_path, face_pat
         self.audio_features_size = audio_features_size
         self.flow_features_size = flow_features_size
         self.face_features_size = face_features_size
-        
+
         self.max_len_text = max_words
-        
-        text_sizes = map(len,self.text_features)
+
+        text_sizes = list(map(len, self.text_features))
         self.text_sizes = np.array(text_sizes)
         self.text_sizes = self.text_sizes.astype(int)
-        
-        
+
         self.max_len_audio = max(self.audio_sizes)
-
         audio_tensors = np.zeros((len(self.audio_features),
-            max(self.audio_sizes), self.audio_features[0].shape[1]))
+                                  max(self.audio_sizes), self.audio_features[0].shape[1]))
 
         for j in range(len(self.audio_features)):
-            audio_tensors[j,0:self.audio_sizes[j],:] = self.audio_features[j]
+            audio_tensors[j, 0:self.audio_sizes[j], :] = self.audio_features[j]
 
         text_tensors = np.zeros((len(self.text_features),
-            max_words, self.text_features[0].shape[1]))
-
+                                 max_words, self.text_features[0].shape[1]))
 
         for j in range(len(self.text_features)):
-            if self.text_sizes[j] > max_words:
-                text_tensors[j] = self.text_features[j][0:max_words,:]
+            if self.text_sizes[j] > max_words:
+                text_tensors[j] = self.text_features[j][0:max_words, :]
             else:
-                text_tensors[j,0:self.text_sizes[j],:] = self.text_features[j]
-
+                text_tensors[j, 0:self.text_sizes[j], :] = self.text_features[j]
+
         self.text_features = th.from_numpy(text_tensors)
         self.text_features = self.text_features.float()
@@ -231,14 +231,11 @@ def __init__(self, clip_path, text_features, audio_features, flow_path, face_pat
 
         self.face_features = th.from_numpy(self.face_features)
         self.face_features = self.face_features.float()
 
-
     def __len__(self):
         return len(self.visual_features)
 
-
-
     def __getitem__(self, tidx):
-        
+
         idx, idx2 = tidx
         face_ind = 1
@@ -251,7 +248,7 @@ def __getitem__(self, tidx):
         if th.sum(face) == 0:
             face_ind = 0
 
-        return {'video': self.visual_features[idx],
+        return {'video': self.visual_features[idx],
                 'flow': flow,
                 'face': face,
                 'text': self.text_features[idx2],
@@ -261,18 +258,18 @@ def __getitem__(self, tidx):
                 'text_size': self.text_sizes[idx2]
                 }
 
-
     def getVideoFeatureSize(self):
         return self.video_features_size
 
+
     def getTextFeatureSize(self):
         return self.text_features_size
 
+
     def getAudioFeatureSize(self):
         return self.audio_features_size
 
+
     def getFlowFeatureSize(self):
         return self.flow_features_size
 
-
-    def shorteningTextTensor(self,text_features, text_sizes):
+    def shorteningTextTensor(self, text_features, text_sizes):
         m = int(max(text_sizes))
-        return text_features[:,0:m,:]
-
+        return text_features[:, 0:m, :]
diff --git a/MSRVTT.py b/MSRVTT.py
index 8163d47..b328840 100644
--- a/MSRVTT.py
+++ b/MSRVTT.py
@@ -32,7 +32,7 @@ def __init__(self, visual_features, flow_features, text_features, audio_features
         Args:
         """
         self.max_words = max_words
-        print 'loading data ...'
+        print('loading data ...')
 
         with open(train_list) as f:
             self.train_list = f.readlines()
@@ -111,7 +111,7 @@ def __init__(self, visual_features, flow_features, text_features, audio_features
         self.face_retrieval = th.from_numpy(self.face_retrieval).float()
         self.text_retrieval = th.from_numpy(self.text_retrieval).float()
 
-        print 'done'
+        print('done')
 
     def collate_data(self, data):
         video_tensor = np.zeros((len(data), 2048))
diff --git a/loupe.py b/loupe.py
index 54bd2d8..466aeb7 100644
--- a/loupe.py
+++ b/loupe.py
@@ -60,7 +60,7 @@ def forward(self,x):
         vlad = F.normalize(vlad)
 
         # flattening + L2 norm
-        vlad = vlad.view(-1, self.cluster_size*self.feature_size)
+        vlad = vlad.reshape(-1, self.cluster_size*self.feature_size)
         vlad = F.normalize(vlad)
 
         return vlad
diff --git a/model.py b/model.py
index 5a645d8..7d2a25c 100644
--- a/model.py
+++ b/model.py
@@ -77,7 +77,7 @@ class MEE(nn.Module):
     def __init__(self, video_modality_dim, text_dim):
         super(MEE, self).__init__()
 
-        m = video_modality_dim.keys()
+        m = list(video_modality_dim.keys())
         self.m = m
diff --git a/train.py b/train.py
index 8ee8f31..d90b504 100644
--- a/train.py
+++ b/train.py
@@ -27,159 +27,159 @@
 from qcm_sampler import QCMSampler
 from MSR_sampler import MSRSampler
 
-
-
 parser = argparse.ArgumentParser(description='LSMDC2017')
 parser.add_argument('--coco', type=bool, default=False,
-                        help='add coco dataset')
+                    help='add coco dataset')
 parser.add_argument('--lr', type=float, default=0.0001,
-                        help='initial learning rate')
+                    help='initial learning rate')
 parser.add_argument('--epochs', type=int, default=50,
-                        help='upper epoch limit')
+                    help='upper epoch limit')
 parser.add_argument('--batch_size', type=int, default=128,
-                        help='batch size')
+                    help='batch size')
 parser.add_argument('--text_cluster_size', type=int, default=32,
-                        help='Text cluster size')
+                    help='Text cluster size')
 parser.add_argument('--margin', type=float, default=0.2,
-                        help='MaxMargin margin value')
+                    help='MaxMargin margin value')
 parser.add_argument('--lr_decay', type=float, default=0.95,
-                        help='Learning rate exp epoch decay')
+                    help='Learning rate exp epoch decay')
 parser.add_argument('--n_display', type=int, default=100,
-                        help='Information display frequence')
+                    help='Information display frequency')
 parser.add_argument('--GPU', type=bool, default=True,
-                        help='Use of GPU')
+                    help='Use of GPU')
 parser.add_argument('--n_cpu', type=int, default=1,
-                        help='Number of CPU')
+                    help='Number of CPU')
 parser.add_argument('--model_name', type=str, default='test',
-                        help='Model name')
+                    help='Model name')
 parser.add_argument('--seed', type=int, default=1,
-                        help='Initial Random Seed')
+                    help='Initial Random Seed')
 parser.add_argument('--optimizer', type=str, default='adam',
-                        help='optimizer')
+                    help='optimizer')
 parser.add_argument('--momentum', type=float, default=0.9,
-                        help='Nesterov Momentum for SGD')
-
+                    help='Nesterov Momentum for SGD')
 parser.add_argument('--eval_qcm', type=bool, default=False,
-                        help='Eval or not QCM')
+                    help='Eval or not QCM')
 parser.add_argument('--MSRVTT', type=bool, default=False,
-                        help='MSRVTT')
+                    help='MSRVTT')
 parser.add_argument('--coco_sampling_rate', type=float, default=1.0,
-                        help='coco sampling rate')
-
+                    help='coco sampling rate')
 
 args = parser.parse_args()
-print args
+print(args)
 
-root_feat = 'data'
+root_feat = os.path.join('data', 'data')
+
+mp_visual_path = os.path.join(root_feat, 'X_resnet.npy')
+mp_flow_path = os.path.join(root_feat, 'X_flow.npy')
+mp_face_path = os.path.join(root_feat, 'X_face.npy')
 
-mp_visual_path = os.path.join(root_feat,'X_resnet.npy')
-mp_flow_path = os.path.join(root_feat,'X_flow.npy')
-mp_face_path = os.path.join(root_feat,'X_face.npy')
 
 def verbose(epoch, status, metrics, name='TEST'):
-    print(name+' - epoch: %d, epoch status: %.2f, r@1: %.3f, r@5: %.3f, r@10: %.3f, mr: %d' %
-          (epoch + 1, status,
-           metrics['R1'], metrics['R5'], metrics['R10'],
-           metrics['MR']))
+    print(name + ' - epoch: %d, epoch status: %.2f, r@1: %.3f, r@5: %.3f, r@10: %.3f, mr: %d' %
+          (epoch + 1, status,
+           metrics['R1'], metrics['R5'], metrics['R10'],
+           metrics['MR']))
 
 
 def compute_metric(x):
     sx = np.sort(-x, axis=1)
     d = np.diag(-x)
-    d = d[:,np.newaxis]
+    d = d[:, np.newaxis]
     ind = sx - d
     ind = np.where(ind == 0)
     ind = ind[1]
     metrics = {}
-    metrics['R1'] = float(np.sum(ind == 0))/len(ind)
-    metrics['R5'] = float(np.sum(ind < 5))/len(ind)
-    metrics['R10'] = float(np.sum(ind < 10))/len(ind)
+    metrics['R1'] = float(np.sum(ind == 0)) / len(ind)
+    metrics['R5'] = float(np.sum(ind < 5)) / len(ind)
+    metrics['R10'] = float(np.sum(ind < 10)) / len(ind)
     metrics['MR'] = np.median(ind) + 1
     return metrics
 
+
 def make_tensor(l, max_len):
-    tensor = np.zeros((len(l),max_len,l[0].shape[-1]))
+    tensor = np.zeros((len(l), max_len, l[0].shape[-1]))
     for i in range(len(l)):
         if len(l[i]):
-            tensor[i,:min(max_len,l[i].shape[0]),:] = l[i][:min(max_len,l[i].shape[0])]
+            tensor[i, :min(max_len, l[i].shape[0]), :] = l[i][:min(max_len, l[i].shape[0])]
 
     return th.from_numpy(tensor).float()
 
+
 # predefining random initial seeds
 th.manual_seed(args.seed)
 np.random.seed(args.seed)
 random.seed(args.seed)
 
-if args.eval_qcm and not(args.MSRVTT):
-    qcm_dataset = LD2.LSMDC_qcm(os.path.join(root_feat,'resnet-qcm.npy'),
-        os.path.join(root_feat,'w2v_LSMDC_qcm.npy'), os.path.join(root_feat,'X_audio_test.npy'),
-        os.path.join(root_feat,'flow-qcm.npy'),
-        os.path.join(root_feat,'face-qcm.npy'))
-
+if args.eval_qcm and not (args.MSRVTT):
+    qcm_dataset = LD2.LSMDC_qcm(os.path.join(root_feat, 'resnet-qcm.npy'),
+                                os.path.join(root_feat, 'w2v_LSMDC_qcm.npy'),
+                                os.path.join(root_feat, 'X_audio_test.npy'),
+                                os.path.join(root_feat, 'flow-qcm.npy'),
+                                os.path.join(root_feat, 'face-qcm.npy'))
+
     qcm_sampler = QCMSampler(len(qcm_dataset))
     qcm_dataloader = DataLoader(qcm_dataset, batch_size=500,
                                 sampler=qcm_sampler, num_workers=1)
 
-    qcm_gt_fn = os.path.join(root_feat,'multiple_choice_gt.txt')
+    qcm_gt_fn = os.path.join(root_feat, 'multiple_choice_gt.txt')
     qcm_gt = [line.rstrip('\n') for line in open(qcm_gt_fn)]
-    qcm_gt = np.array(map(int,qcm_gt))
+    qcm_gt = np.array(list(map(int, qcm_gt)))
 
-print 'Pre-loading features ... This may takes several minutes ...'
+print('Pre-loading features ... This may take several minutes ...')
 
 if args.MSRVTT:
-    visual_feat_path = os.path.join(root_feat,'resnet_features.pickle')
-    flow_feat_path = os.path.join(root_feat,'flow_features.pickle')
-    text_feat_path = os.path.join(root_feat,'w2v_MSRVTT.pickle')
-    audio_feat_path = os.path.join(root_feat,'audio_features.pickle')
-    face_feat_path = os.path.join(root_feat,'face_features.pickle')
-    train_list_path = os.path.join(root_feat,'train_list.txt')
-    test_list_path = os.path.join(root_feat,'test_list.txt')
+    visual_feat_path = os.path.join(root_feat, 'resnet_features.pickle')
+    flow_feat_path = os.path.join(root_feat, 'flow_features.pickle')
+    text_feat_path = os.path.join(root_feat, 'w2v_MSRVTT.pickle')
+    audio_feat_path = os.path.join(root_feat, 'audio_features.pickle')
+    face_feat_path = os.path.join(root_feat, 'face_features.pickle')
+    train_list_path = os.path.join(root_feat, 'train_list.txt')
+    test_list_path = os.path.join(root_feat, 'test_list.txt')
 
     dataset = MSR.MSRVTT(visual_feat_path, flow_feat_path, text_feat_path,
-            audio_feat_path, face_feat_path, train_list_path,test_list_path, coco=args.coco)
+                         audio_feat_path, face_feat_path, train_list_path, test_list_path, coco=args.coco)
 
     msr_sampler = MSRSampler(dataset.n_MSR, dataset.n_coco, args.coco_sampling_rate)
-    
+
     if args.coco:
         dataloader = DataLoader(dataset, batch_size=args.batch_size,
-                sampler=msr_sampler, num_workers=1,collate_fn=dataset.collate_data, drop_last=True)
+                                sampler=msr_sampler, num_workers=1, collate_fn=dataset.collate_data, drop_last=True)
     else:
         dataloader = DataLoader(dataset, batch_size=args.batch_size,
-                shuffle=True, num_workers=1,collate_fn=dataset.collate_data, drop_last=True)
+                                shuffle=True, num_workers=1, collate_fn=dataset.collate_data, drop_last=True)
 else:
-    path_to_text = os.path.join(root_feat,'w2v_LSMDC.npy')
-    path_to_audio = os.path.join(root_feat,'X_audio_train.npy')
+    path_to_text = os.path.join(root_feat, 'w2v_LSMDC.npy')
+    path_to_audio = os.path.join(root_feat, 'X_audio_train.npy')
 
     dataset = LD2.LSMDC(mp_visual_path, path_to_text,
-            path_to_audio, mp_flow_path, mp_face_path, coco=args.coco)
+                        path_to_audio, mp_flow_path, mp_face_path, coco=args.coco)
 
     dataloader = DataLoader(dataset, batch_size=args.batch_size,
-            shuffle=True, num_workers=1, drop_last=True)
-    print 'Done.'
+                            shuffle=True, num_workers=1, drop_last=True)
+    print('Done.')
 
-    print 'Reading test data ...'
-    resnet_features_path = os.path.join(root_feat,'resnet152-retrieval.npy.tensor.npy')
-    flow_features_path = os.path.join(root_feat,'flow-retrieval.npy.tensor.npy')
-    face_features_path = os.path.join(root_feat,'face-retrieval.npy.tensor.npy')
-    text_features_path = os.path.join(root_feat,'w2v_LSMDC_retrieval.npy')
-    audio_features_path = os.path.join(root_feat,'X_audio_retrieval.npy.tensor.npy')
+    print('Reading test data ...')
+    resnet_features_path = os.path.join(root_feat, 'resnet152-retrieval.npy.tensor.npy')
+    flow_features_path = os.path.join(root_feat, 'flow-retrieval.npy.tensor.npy')
+    face_features_path = os.path.join(root_feat, 'face-retrieval.npy.tensor.npy')
+    text_features_path = os.path.join(root_feat, 'w2v_LSMDC_retrieval.npy')
+    audio_features_path = os.path.join(root_feat, 'X_audio_retrieval.npy.tensor.npy')
 
     vid_retrieval = np.load(resnet_features_path)
     flow_retrieval = np.load(flow_features_path)
     face_retrieval = np.load(face_features_path)
-    text_retrieval = np.load(text_features_path)
+    text_retrieval = np.load(text_features_path, encoding='latin1')
     audio_retrieval = np.load(audio_features_path)
 
-    mm = max(map(len,text_retrieval))
+    mm = max(map(len, text_retrieval))
 
-    text_retrieval = make_tensor(text_retrieval,mm)
+    text_retrieval = make_tensor(text_retrieval, mm)
 
     vid_retrieval = th.from_numpy(vid_retrieval).float()
     flow_retrieval = th.from_numpy(flow_retrieval).float()
@@ -192,24 +192,22 @@ def make_tensor(l, max_len):
     face_retrieval_val = face_retrieval
     audio_retrieval_val = audio_retrieval
 
-
-face_ind_test = np.load(os.path.join(root_feat,'no_face_ind_retrieval.npy'))
+face_ind_test = np.load(os.path.join(root_feat, 'no_face_ind_retrieval.npy'))
 face_ind_test = 1 - face_ind_test
 
-print 'Done.'
+print('Done.')
 
 # Model
-video_modality_dim = {'face': (128,128), 'audio': (128*16,128),
-'visual': (2048,2048), 'motion': (1024,1024)}
-net = Net(video_modality_dim,300,
-        audio_cluster=16,text_cluster=args.text_cluster_size)
+video_modality_dim = {'face': (128, 128), 'audio': (128 * 16, 128),
+                      'visual': (2048, 2048), 'motion': (1024, 1024)}
+net = Net(video_modality_dim, 300,
+          audio_cluster=16, text_cluster=args.text_cluster_size)
 net.train()
 if args.GPU:
     net.cuda()
 
 # Optimizers + Loss
-max_margin = MaxMarginRankingLoss(margin=args.margin)
-
+max_margin = MaxMarginRankingLoss(margin=args.margin)
 
 if args.optimizer == 'adam':
     optimizer = optim.Adam(net.parameters(), lr=args.lr)
@@ -223,11 +221,11 @@ def make_tensor(l, max_len):
 dataset_size = len(dataset)
 lr_decay = args.lr_decay
 
-print 'Starting training loop ...'
+print('Starting training loop ...')
 for epoch in range(args.epochs):
     running_loss = 0.0
-    print 'epoch: %d'%epoch
+    print('epoch: %d' % epoch)
 
     for i_batch, sample_batched in enumerate(dataloader):
 
@@ -236,11 +234,10 @@ def make_tensor(l, max_len):
             audio = sample_batched['audio']
         else:
             captions = dataset.shorteningTextTensor(sample_batched['text'],
-                sample_batched['text_size'])
-
+                                                    sample_batched['text_size'])
+
             audio = dataset.shorteningTextTensor(sample_batched['audio'],
-                sample_batched['audio_size'])
-
+                                                 sample_batched['audio_size'])
 
         face = sample_batched['face']
         video = sample_batched['video']
@@ -251,31 +248,32 @@ def make_tensor(l, max_len):
         ind = {}
         ind['face'] = face_ind
         ind['visual'] = np.ones((len(face_ind)))
-        ind['motion'] = 1 - coco_ind
+        ind['motion'] = 1 - coco_ind
         ind['audio'] = 1 - coco_ind
 
         if args.GPU:
             captions, video = Variable(captions.cuda()), Variable(video.cuda())
-            audio, flow = Variable(audio.cuda()), Variable(flow.cuda())
+            audio, flow = Variable(audio.cuda()), Variable(flow.cuda())
             face = Variable(face.cuda())
 
-
         optimizer.zero_grad()
 
         confusion_matrix = net(captions,
-                {'face': face, 'audio': audio, 'visual': video, 'motion': flow}, ind, True)
+                               {'face': face, 'audio': audio, 'visual': video, 'motion': flow}, ind, True)
         loss = max_margin(confusion_matrix)
         loss.backward()
         optimizer.step()
 
-        running_loss += loss.data[0]
-
-        if (i_batch+1) % n_display == 0:
-            print 'Epoch %d, Epoch status: %.2f, Training loss: %.4f'%(epoch + 1,
-                args.batch_size*float(i_batch)/dataset_size,running_loss/n_display)
+        running_loss += loss.item()
+
+        if (i_batch + 1) % n_display == 0:
+            print('Epoch %d, Epoch status: %.2f, Training loss: %.4f'
+                  % (epoch + 1,
+                     args.batch_size * float(i_batch) / dataset_size,
+                     running_loss / n_display))
             running_loss = 0.0
 
-    print 'evaluating epoch %d ...'%(epoch+1)
-    net.eval()
+    print('evaluating epoch %d ...' % (epoch + 1))
+    net.eval()
 
     if args.MSRVTT:
         retrieval_samples = dataset.getRetrievalSamples()
@@ -294,10 +292,10 @@ def make_tensor(l, max_len):
         ind['audio'] = np.ones((len(face_ind)))
 
         conf = net(captions,
-                {'face': face, 'audio': audio, 'visual': video, 'motion': flow}, ind, True)
+                   {'face': face, 'audio': audio, 'visual': video, 'motion': flow}, ind, True)
         confusion_matrix = conf.data.cpu().float().numpy()
         metrics = compute_metric(confusion_matrix)
-        verbose(epoch, args.batch_size*float(i_batch)/dataset_size, metrics, name='MSRVTT')
+        verbose(epoch, args.batch_size * float(i_batch) / dataset_size, metrics, name='MSRVTT')
 
     else:
         video = Variable(vid_retrieval_val.cuda(), volatile=True)
@@ -305,7 +303,7 @@ def make_tensor(l, max_len):
         audio = Variable(audio_retrieval_val.cuda(), volatile=True)
         flow = Variable(flow_retrieval_val.cuda(), volatile=True)
         face = Variable(face_retrieval_val.cuda(), volatile=True)
-        
+
         ind = {}
         ind['face'] = face_ind_test
         ind['visual'] = np.ones((len(face_ind_test)))
@@ -313,24 +311,24 @@ def make_tensor(l, max_len):
         ind['audio'] = np.ones((len(face_ind_test)))
 
         conf = net(captions,
-                {'face': face, 'audio': audio, 'visual': video, 'motion': flow}, ind, True)
+                   {'face': face, 'audio': audio, 'visual': video, 'motion': flow}, ind, True)
         confusion_matrix = conf.data.cpu().float().numpy()
         metrics = compute_metric(confusion_matrix)
-        verbose(epoch, args.batch_size*float(i_batch)/dataset_size, metrics, name='MPII')
-
+        verbose(epoch, args.batch_size * float(i_batch) / dataset_size, metrics, name='MPII')
+
     net.train()
 
-    if args.eval_qcm and not(args.MSRVTT):
-        print 'LSMDC Multiple-Choice evaluation computation'
+    if args.eval_qcm and not (args.MSRVTT):
+        print('LSMDC Multiple-Choice evaluation computation')
         net.eval()
         scores = []
         for i_batch, sample_batched in enumerate(qcm_dataloader):
 
             captions = sample_batched['text']
-            
+
             audio = qcm_dataset.shorteningTextTensor(sample_batched['audio'],
-                sample_batched['audio_size'])
-            
+                                                     sample_batched['audio_size'])
+
             video = sample_batched['video']
             flow = sample_batched['flow']
             face = sample_batched['face']
@@ -344,7 +343,7 @@ def make_tensor(l, max_len):
 
             if args.GPU:
                 captions, video = Variable(captions.cuda(), volatile=True), Variable(video.cuda(), volatile=True)
-                audio, flow = Variable(audio.cuda(), volatile=True), Variable(flow.cuda(), volatile=True)
+                audio, flow = Variable(audio.cuda(), volatile=True), Variable(flow.cuda(), volatile=True)
                 face = Variable(face.cuda(), volatile=True)
 
             s = net(captions, {'face': face, 'audio': audio, 'visual': video, 'motion': flow}, ind, False)
@@ -352,18 +351,13 @@ def make_tensor(l, max_len):
             scores.extend(s)
 
         scores = np.array(scores)
-        scores = np.reshape(scores, (len(qcm_dataset),5))
-        pred = np.argmax(scores, axis=1)+1
-
-        accuracy_qcm = sum(pred == qcm_gt)/float(len(pred))
-        print 'Accuracy Multiple-Choice: %.3f'%accuracy_qcm
-
+        scores = np.reshape(scores, (len(qcm_dataset), 5))
+        pred = np.argmax(scores, axis=1) + 1
+
+        accuracy_qcm = sum(pred == qcm_gt) / float(len(pred))
+        print('Accuracy Multiple-Choice: %.3f' % accuracy_qcm)
+
         net.train()
 
     for param_group in optimizer.param_groups:
         param_group['lr'] *= lr_decay
-
-
-
-
-

From 84c3b2b200834c0d1ec36bc64254008fe03c5e99 Mon Sep 17 00:00:00 2001
From: Anujraaj Goyal
Date: Tue, 11 Feb 2020 19:26:45 +0800
Subject: [PATCH 2/2] Migration to Py3 for MSRVTT dataset

---
 MSRVTT.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/MSRVTT.py b/MSRVTT.py
index b328840..be6a45c 100644
--- a/MSRVTT.py
+++ b/MSRVTT.py
@@ -26,8 +26,8 @@ class MSRVTT(Dataset):
     """LSMDC dataset."""
 
     def __init__(self, visual_features, flow_features, text_features, audio_features,
-                 face_features, train_list, test_list, coco_visual_path='data/X_train2014_resnet152.npy',
-                 coco_text_path='data/w2v_coco_train2014_1.npy',coco=True, max_words=30,verbose=False):
+                 face_features, train_list, test_list, coco_visual_path='data/data/X_train2014_resnet152.npy',
+                 coco_text_path='data/data/w2v_coco_train2014_1.npy',coco=True, max_words=30,verbose=False):
         """
         Args:
         """
@@ -46,26 +46,26 @@ def __init__(self, visual_features, flow_features, text_features, audio_features
 
         pickle_in = open(visual_features,'rb')
-        self.visual_features = pickle.load(pickle_in)
+        self.visual_features = pickle.load(pickle_in, encoding='latin1')
 
         pickle_in = open(flow_features,'rb')
-        self.flow_features = pickle.load(pickle_in)
+        self.flow_features = pickle.load(pickle_in, encoding='latin1')
 
         pickle_in = open(audio_features,'rb')
-        self.audio_features = pickle.load(pickle_in)
+        self.audio_features = pickle.load(pickle_in, encoding='latin1')
 
         pickle_in = open(text_features,'rb')
-        self.text_features = pickle.load(pickle_in)
+        self.text_features = pickle.load(pickle_in, encoding='latin1')
 
         pickle_in = open(face_features,'rb')
-        self.face_features = pickle.load(pickle_in)
+        self.face_features = pickle.load(pickle_in, encoding='latin1')
 
         self.coco = coco
         if coco:
             # adding coco data
             self.coco_visual = np.load(coco_visual_path)
-            self.coco_text = np.load(coco_text_path)
+            self.coco_text = np.load(coco_text_path, encoding='latin1')
 
             self.n_MSR = len(self.train_list)
             self.n_coco = len(self.coco_visual)
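
Both patches turn on the same deserialization detail: the .npy and .pickle
artifacts under data/ were written by Python 2, whose str was a byte string,
so Python 3 must be told how to decode them. Hence the encoding='latin1'
arguments threaded through LSMDC.py, MSRVTT.py and train.py, and the
list(map(...)) wrappers wherever Python 3's lazy map() would otherwise reach
NumPy. A minimal sketch of the pattern, with placeholder file names (note
that on NumPy >= 1.16.3 object arrays additionally need allow_pickle=True,
which these patches do not pass):

    import pickle

    import numpy as np

    # A Py2-era .npy holding an object array (one word-vector matrix per
    # caption): encoding='latin1' decodes the embedded Py2 byte strings, and
    # allow_pickle=True is required on newer NumPy because object arrays are
    # round-tripped through pickle.
    text_features = np.load('w2v_LSMDC.npy', encoding='latin1',
                            allow_pickle=True)

    # Same idea for the raw Py2 pickles read in MSRVTT.py.
    with open('resnet_features.pickle', 'rb') as f:
        visual_features = pickle.load(f, encoding='latin1')

    # Python 3's map() is lazy; materialise it before NumPy sees it, exactly
    # as the patches do with list(map(len, ...)).
    text_sizes = np.array(list(map(len, text_features)))

The same keyword would apply to the np.load calls that LSMDC_qcm still makes
without it.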
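
On the PyTorch side, the 0.3-era scalar access loss.data[0] fails once losses
become 0-dimensional tensors; the >= 0.4 idiom used in the training loop above
is loss.item(), which returns a plain Python float detached from the graph. A
toy sketch of the accumulation idiom, assuming PyTorch >= 0.4:

    import torch as th

    x = th.randn(4, 3, requires_grad=True)
    loss = (x ** 2).mean()       # a 0-dim tensor, like max_margin(...) here

    running_loss = 0.0
    running_loss += loss.item()  # plain float; keeps no graph or GPU memory
    # running_loss += loss.data  # also runs, but accumulates a 0-dim tensor

For the same reason, the Variable(..., volatile=True) wrappers kept by these
patches are deprecated from PyTorch 0.4 onwards (volatile is ignored with a
warning); the modern equivalent is to run the evaluation blocks under
with th.no_grad():.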