From b342e5477fc5bbf82c349461bfa9a3288098ba0a Mon Sep 17 00:00:00 2001 From: "Antoine Miech \"WILLOW" Date: Wed, 4 Apr 2018 17:27:50 +0200 Subject: [PATCH] first code commit --- LICENSE | 202 ++++++++++++++++++++++++++ LSMDC.py | 278 ++++++++++++++++++++++++++++++++++++ MSRVTT.py | 190 +++++++++++++++++++++++++ MSR_sampler.py | 41 ++++++ loss.py | 63 +++++++++ loupe.py | 105 ++++++++++++++ model.py | 151 ++++++++++++++++++++ qcm_sampler.py | 36 +++++ train.py | 376 +++++++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 1442 insertions(+) create mode 100644 LICENSE create mode 100644 LSMDC.py create mode 100644 MSRVTT.py create mode 100644 MSR_sampler.py create mode 100644 loss.py create mode 100644 loupe.py create mode 100644 model.py create mode 100644 qcm_sampler.py create mode 100644 train.py diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..2325b40 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2018 Antoine Miech + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/LSMDC.py b/LSMDC.py new file mode 100644 index 0000000..2523ffa --- /dev/null +++ b/LSMDC.py @@ -0,0 +1,278 @@ +# Copyright 2018 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
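Before the LSMDC dataset class that follows, a minimal usage sketch. The feature paths mirror the defaults that train.py later points at under data/; they are not shipped with this commit, so treat them as placeholders.

from torch.utils.data import DataLoader
import LSMDC as LD2

# Hedged usage sketch for the LSMDC Dataset defined below; the .npy paths
# follow train.py's defaults and are assumed to exist on disk.
dataset = LD2.LSMDC(clip_path='data/X_resnet.npy',
                    text_features='data/w2v_LSMDC.npy',
                    audio_features='data/X_audio_train.npy',
                    flow_path='data/X_flow.npy',
                    face_path='data/X_face.npy',
                    coco=False)               # skip the COCO photo-caption augmentation

loader = DataLoader(dataset, batch_size=128, shuffle=True, drop_last=True)
batch = next(iter(loader))
# batch is a dict with 'video', 'flow', 'face', 'audio', 'text', 'audio_size',
# 'text_size', 'coco_ind' and 'face_ind' entries, one row per clip.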
+ + + +import torch as th +from torch.utils.data import Dataset +import numpy as np +import os +import math +import random + +class LSMDC(Dataset): + """LSMDC dataset.""" + + def __init__(self, clip_path, text_features, audio_features, flow_path, face_path, coco_visual_path='../X_train2014_resnet152.npy' ,coco_text_path='../w2v_coco_train2014_1.npy', coco=True, max_words=30, video_features_size=2048, text_features_size=300, audio_features_size=128, face_features_size=128, flow_features_size=1024,verbose=False): + """ + Args: + """ + + self.visual_features = np.load(clip_path) + self.flow_features = np.load(flow_path) + self.face_features = np.load(face_path) + self.audio_features = np.load(audio_features) + self.text_features = np.load(text_features) + + + audio_sizes = map(len,self.audio_features) + self.audio_sizes = np.array(audio_sizes) + + self.video_features_size = video_features_size + self.text_features_size = text_features_size + self.audio_features_size = audio_features_size + self.flow_features_size = flow_features_size + self.face_features_size = face_features_size + + self.max_len_text = max_words + + text_sizes = map(len,self.text_features) + self.text_sizes = np.array(text_sizes) + self.text_sizes = self.text_sizes.astype(int) + + mask = self.text_sizes > 0 + + self.text_features = self.text_features[mask] + self.text_sizes = self.text_sizes[mask] + self.visual_features = self.visual_features[mask] + self.flow_features = self.flow_features[mask] + self.face_features = self.face_features[mask] + self.audio_features = self.audio_features[mask] + self.audio_sizes = self.audio_sizes[mask] + self.audio_sizes.astype(int) + + self.max_len_audio = max(self.audio_sizes) + + audio_tensors = np.zeros((len(self.audio_features), + max(self.audio_sizes), self.audio_features[0].shape[1])) + + for j in range(len(self.audio_features)): + audio_tensors[j,0:self.audio_sizes[j],:] = self.audio_features[j] + + + if coco: + # adding coco data + coco_visual = np.load(coco_visual_path) + coco_text = np.load(coco_text_path) + + + self.n_lsmdc = len(self.visual_features) + self.n_coco = len(coco_visual) + + self.visual_features = np.concatenate((self.visual_features, coco_visual), axis=0) + self.text_features = np.concatenate((self.text_features, coco_text), axis=0) + + text_sizes = map(len,self.text_features) + self.text_sizes = np.array(text_sizes) + self.text_sizes = self.text_sizes.astype(int) + self.coco_ind = np.zeros((self.n_lsmdc+self.n_coco)) + self.coco_ind[self.n_lsmdc:] = 1 + else: + self.n_lsmdc = len(self.visual_features) + self.coco_ind = np.zeros((self.n_lsmdc)) + + + text_tensors = np.zeros((len(self.text_features), + max_words, self.text_features[0].shape[1])) + + + for j in range(len(self.text_features)): + if self.text_sizes[j] > max_words: + text_tensors[j] = self.text_features[j][0:max_words,:] + else: + text_tensors[j,0:self.text_sizes[j],:] = self.text_features[j] + + self.text_features = th.from_numpy(text_tensors) + self.text_features = self.text_features.float() + + self.audio_features = th.from_numpy(audio_tensors) + self.audio_features = self.audio_features.float() + + self.flow_features = th.from_numpy(self.flow_features) + self.flow_features = self.flow_features.float() + + self.visual_features = th.from_numpy(self.visual_features) + self.visual_features = self.visual_features.float() + + self.face_features = th.from_numpy(self.face_features) + self.face_features = self.face_features.float() + + def __len__(self): + return len(self.text_features) + + def 
__getitem__(self, idx): + + face_ind = 1 + + if idx >= self.n_lsmdc: + flow = th.zeros(self.flow_features_size) + face = th.zeros(self.face_features_size) + audio = th.zeros(self.audio_features.size()[1],self.audio_features_size) + audio_size = 1 + face_ind = 0 + else: + flow = self.flow_features[idx] + face = self.face_features[idx] + audio = self.audio_features[idx] + audio_size = self.audio_sizes[idx] + + if th.sum(face) == 0: + face_ind = 0 + return {'video': self.visual_features[idx], + 'flow': flow, + 'face': face, + 'text': self.text_features[idx], + 'audio': audio, + 'audio_size': audio_size, + 'coco_ind': self.coco_ind[idx], + 'face_ind': face_ind, + 'text_size': self.text_sizes[idx] + } + + + def getVideoFeatureSize(self): + return self.video_features_size + def getTextFeatureSize(self): + return self.text_features_size + def getAudioFeatureSize(self): + return self.audio_features_size + def getFlowFeatureSize(self): + return self.flow_features_size + def getText(self): + return self.text_features + + + def shorteningTextTensor(self,text_features, text_sizes): + m = int(max(text_sizes)) + return text_features[:,0:m,:] + +class LSMDC_qcm(Dataset): + """LSMDC dataset.""" + + def __init__(self, clip_path, text_features, audio_features, flow_path, face_path, max_words=30, video_features_size=2048, text_features_size=300, audio_features_size=128, face_features_size=128, flow_features_size=1024): + """ + Args: + """ + self.visual_features = np.load(clip_path) + self.flow_features = np.load(flow_path) + self.face_features = np.load(face_path) + self.audio_features = np.load(audio_features) + self.text_features = np.load(text_features) + print 'features loaded' + + audio_sizes = map(len,self.audio_features) + self.audio_sizes = np.array(audio_sizes) + + self.video_features_size = video_features_size + self.text_features_size = text_features_size + self.audio_features_size = audio_features_size + self.flow_features_size = flow_features_size + self.face_features_size = face_features_size + + self.max_len_text = max_words + + text_sizes = map(len,self.text_features) + self.text_sizes = np.array(text_sizes) + self.text_sizes = self.text_sizes.astype(int) + + + self.max_len_audio = max(self.audio_sizes) + + + audio_tensors = np.zeros((len(self.audio_features), + max(self.audio_sizes), self.audio_features[0].shape[1])) + + for j in range(len(self.audio_features)): + audio_tensors[j,0:self.audio_sizes[j],:] = self.audio_features[j] + + text_tensors = np.zeros((len(self.text_features), + max_words, self.text_features[0].shape[1])) + + + for j in range(len(self.text_features)): + if self.text_sizes[j] > max_words: + text_tensors[j] = self.text_features[j][0:max_words,:] + else: + text_tensors[j,0:self.text_sizes[j],:] = self.text_features[j] + + self.text_features = th.from_numpy(text_tensors) + self.text_features = self.text_features.float() + + self.audio_features = th.from_numpy(audio_tensors) + self.audio_features = self.audio_features.float() + + self.flow_features = th.from_numpy(self.flow_features) + self.flow_features = self.flow_features.float() + + self.visual_features = th.from_numpy(self.visual_features) + self.visual_features = self.visual_features.float() + + self.face_features = th.from_numpy(self.face_features) + self.face_features = self.face_features.float() + + + def __len__(self): + return len(self.visual_features) + + + + def __getitem__(self, tidx): + + idx, idx2 = tidx + + face_ind = 1 + + flow = self.flow_features[idx] + face = self.face_features[idx] + audio = 
self.audio_features[idx] + audio_size = self.audio_sizes[idx] + + if th.sum(face) == 0: + face_ind = 0 + + return {'video': self.visual_features[idx], + 'flow': flow, + 'face': face, + 'text': self.text_features[idx2], + 'audio': audio, + 'face_ind': face_ind, + 'audio_size': audio_size, + 'text_size': self.text_sizes[idx2] + } + + + def getVideoFeatureSize(self): + return self.video_features_size + def getTextFeatureSize(self): + return self.text_features_size + def getAudioFeatureSize(self): + return self.audio_features_size + def getFlowFeatureSize(self): + return self.flow_features_size + + + def shorteningTextTensor(self,text_features, text_sizes): + m = int(max(text_sizes)) + return text_features[:,0:m,:] + diff --git a/MSRVTT.py b/MSRVTT.py new file mode 100644 index 0000000..c51abfd --- /dev/null +++ b/MSRVTT.py @@ -0,0 +1,190 @@ +# Copyright 2018 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + +import torch as th +from torch.utils.data import Dataset +import numpy as np +import os +import math +import random +import pickle + +class MSRVTT(Dataset): + """LSMDC dataset.""" + + def __init__(self, visual_features, flow_features, text_features, audio_features, face_features, train_list, test_list, coco_visual_path='../X_train2014_resnet152.npy' ,coco_text_path='../w2v_coco_train2014_1.npy',coco=True, max_words=30,verbose=False): + """ + Args: + """ + self.max_words = max_words + print 'loading data ...' 
+ + with open(train_list) as f: + self.train_list = f.readlines() + + self.train_list = [x.strip() for x in self.train_list] + + with open(test_list) as f: + self.test_list = f.readlines() + + self.test_list = [x.strip() for x in self.test_list] + + + pickle_in = open(visual_features,'rb') + self.visual_features = pickle.load(pickle_in) + + pickle_in = open(flow_features,'rb') + self.flow_features = pickle.load(pickle_in) + + pickle_in = open(audio_features,'rb') + self.audio_features = pickle.load(pickle_in) + + pickle_in = open(text_features,'rb') + self.text_features = pickle.load(pickle_in) + + pickle_in = open(face_features,'rb') + self.face_features = pickle.load(pickle_in) + + self.coco = coco + + if coco: + # adding coco data + self.coco_visual = np.load(coco_visual_path) + self.coco_text = np.load(coco_text_path) + + self.n_MSR = len(self.train_list) + self.n_coco = len(self.coco_visual) + + self.coco_ind = np.zeros((self.n_MSR+self.n_coco)) + self.coco_ind[self.n_MSR:] = 1 + + else: + self.n_MSR = len(self.train_list) + self.coco_ind = np.zeros((self.n_MSR)) + + + # computing retrieval + + self.video_retrieval = np.zeros((len(self.test_list),2048)) + self.flow_retrieval = np.zeros((len(self.test_list),1024)) + self.audio_retrieval = np.zeros((len(self.test_list), max_words, 128)) + self.face_retrieval = np.zeros((len(self.test_list), 128)) + self.text_retrieval = np.zeros((len(self.test_list), max_words, 300)) + self.face_ind_retrieval = np.ones((len(self.test_list))) + + for i in range(len(self.test_list)): + self.video_retrieval[i] = self.visual_features[self.test_list[i]] + self.flow_retrieval[i] = self.flow_features[self.test_list[i]] + + if len(self.face_features[self.test_list[i]]) > 0: + self.face_retrieval[i] = self.face_features[self.test_list[i]] + + if np.sum(self.face_retrieval[i]) == 0: + self.face_ind_retrieval[i] = 0 + + la = len(self.audio_features[self.test_list[i]]) + self.audio_retrieval[i,:min(max_words,la),:] = self.audio_features[self.test_list[i]][:min(max_words,la)] + + lt = len(self.text_features[self.test_list[i]][0]) + self.text_retrieval[i,:min(max_words,lt),:] = self.text_features[self.test_list[i]][0][:min(max_words,lt)] + + + self.video_retrieval = th.from_numpy(self.video_retrieval).float() + self.flow_retrieval = th.from_numpy(self.flow_retrieval).float() + self.audio_retrieval = th.from_numpy(self.audio_retrieval).float() + self.face_retrieval = th.from_numpy(self.face_retrieval).float() + self.text_retrieval = th.from_numpy(self.text_retrieval).float() + + print 'done' + + def collate_data(self, data): + video_tensor = np.zeros((len(data), 2048)) + flow_tensor = np.zeros((len(data), 1024)) + face_tensor = np.zeros((len(data), 128)) + audio_tensor = np.zeros((len(data), self.max_words,128)) + text_tensor = np.zeros((len(data), self.max_words, 300)) + coco_ind = np.zeros((len(data))) + face_ind = np.zeros((len(data))) + + for i in range(len(data)): + + coco_ind[i] = data[i]['coco_ind'] + face_ind[i] = data[i]['face_ind'] + video_tensor[i] = data[i]['video'] + flow_tensor[i] = data[i]['flow'] + + if len(data[i]['face']) > 0: + face_tensor[i] = data[i]['face'] + + la = len(data[i]['audio']) + audio_tensor[i,:min(la,self.max_words), :] = data[i]['audio'][:min(self.max_words,la)] + + lt = len(data[i]['text']) + text_tensor[i,:min(lt,self.max_words), :] = data[i]['text'][:min(self.max_words,lt)] + + + return {'video': th.from_numpy(video_tensor).float(), + 'flow': th.from_numpy(flow_tensor).float(), + 'face': th.from_numpy(face_tensor).float(), + 
'coco_ind': coco_ind, + 'face_ind': face_ind, + 'text': th.from_numpy(text_tensor).float(), + 'audio': th.from_numpy(audio_tensor).float()} + + + def __len__(self): + return len(self.coco_ind) + + def __getitem__(self, idx): + + face_ind = 1 + if idx < self.n_MSR: + vid = self.train_list[idx] + text = self.text_features[vid] + r = random.randint(0, len(text)-1) + text = text[r] + flow = self.flow_features[vid] + audio = self.audio_features[vid] + video = self.visual_features[vid] + face = self.face_features[vid] + + if np.sum(face) == 0: + face_ind = 0 + elif self.coco: + video = self.coco_visual[idx-self.n_MSR] + text = self.coco_text[idx-self.n_MSR] + audio = th.zeros(1,128) + flow = th.zeros(1024) + face = th.zeros(128) + face_ind = 0 + + return {'video': video, + 'flow': flow, + 'face': face, + 'text': text, + 'coco_ind': self.coco_ind[idx], + 'face_ind': face_ind, + 'audio': audio + } + + def getRetrievalSamples(self): + return {'video': self.video_retrieval, + 'flow': self.flow_retrieval, + 'text': self.text_retrieval, + 'face': self.face_retrieval, + 'face_ind': self.face_ind_retrieval, + 'audio': self.audio_retrieval} + diff --git a/MSR_sampler.py b/MSR_sampler.py new file mode 100644 index 0000000..eb6c285 --- /dev/null +++ b/MSR_sampler.py @@ -0,0 +1,41 @@ +# Copyright 2018 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + +import torch as th +from torch.utils.data.sampler import Sampler +import numpy as np + +class MSRSampler(Sampler): + + def __init__(self, n_MSR, n_COCO, sampling_rate): + self.n_MSR = n_MSR + self.n_COCO = n_COCO + self.sampling_rate = sampling_rate + + def __iter__(self): + idx_MSR = np.arange(self.n_MSR) + idx_coco = np.arange(self.n_MSR,self.n_MSR+self.n_COCO) + + np.random.shuffle(idx_coco) + idx_coco = idx_coco[:min(self.n_COCO,int(self.sampling_rate*self.n_MSR))] + + idx = np.concatenate((idx_MSR,idx_coco), axis=0) + np.random.shuffle(idx) + + return iter(idx) + + def __len__(self): + return self.n_MSR+int(self.sampling_rate*self.n_COCO) diff --git a/loss.py b/loss.py new file mode 100644 index 0000000..2189a0a --- /dev/null +++ b/loss.py @@ -0,0 +1,63 @@ +# Copyright 2018 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
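Before the ranking loss defined below, a toy numeric sketch of what it computes: the diagonal of the text-video similarity matrix holds the true pairs, and every mismatched pair sharing a row or a column is pushed at least margin below it. The similarity values here are made up.

import torch as th

# Mirrors MaxMarginRankingLoss (below, with margin=0.2) on a hand-made 3x3 matrix.
margin = 0.2
x = th.FloatTensor([[0.9, 0.1, 0.2],
                    [0.0, 0.8, 0.3],
                    [0.1, 0.4, 0.7]])           # x[i, j] = sim(text_i, video_j)

d = th.diag(x).view(-1, 1)                      # similarity of each true pair
row_loss = (margin - (d - x)).clamp(min=0)      # rank the true video above the others
col_loss = (margin - (d - x.t())).clamp(min=0)  # rank the true text above the others
loss = th.cat((row_loss, col_loss), 0).mean()   # the quantity MaxMarginRankingLoss returns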
+ + + +from torch.autograd import Variable +import torch.nn as nn +import torch.nn.functional as F +import torch as th + +class MaxMarginRankingLoss(nn.Module): + def __init__(self, margin=1): + super(MaxMarginRankingLoss, self).__init__() + self.loss = th.nn.MarginRankingLoss(margin) + self.margin = margin + + def forward(self,x): + n = x.size()[0] + #y = Variable(th.ones(2*n**2,1).cuda()) + + + x1 = th.diag(x) + x1 = x1.unsqueeze(1) + x1 = x1.expand(n, n) + x1 = x1.contiguous().view(-1,1) + x1 = th.cat((x1,x1),0) + + x2 = x.view(-1,1) + x3 = x.transpose(0,1).contiguous().view(-1,1) + + x2 = th.cat((x2,x3),0) + + max_margin = F.relu(self.margin - (x1 - x2)) + return max_margin.mean() + +class MaxMarginRankingLoss2(nn.Module): + def __init__(self, margin=1): + super(MaxMarginRankingLoss2, self).__init__() + self.margin = margin + + #x : BatchxL + def forward(self,x): + x1 = x[:,0] + x1 = x1.unsqueeze(1) + x1 = x1.expand(x.size()[0],x.size()[-1]-1) + + x2 = x[:,1:] + + max_margin = F.relu(self.margin - (x1 - x2)) + return max_margin.mean() + + diff --git a/loupe.py b/loupe.py new file mode 100644 index 0000000..54bd2d8 --- /dev/null +++ b/loupe.py @@ -0,0 +1,105 @@ +# Copyright 2018 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
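Before the pooling modules defined below, a shape-level sketch of NetVLAD: a variable-length set of D-dimensional descriptors is soft-assigned to K clusters, per-cluster residuals are aggregated, and the result is flattened into a K*D vector. Batch, frame and cluster sizes here are arbitrary illustration values.

import torch as th
from torch.autograd import Variable
from loupe import NetVLAD   # the module defined just below

pooling = NetVLAD(cluster_size=16, feature_size=128)   # K=16 clusters, D=128 dims
frames = Variable(th.randn(4, 10, 128))                # 4 clips, 10 descriptors each
pooled = pooling(frames)                               # shape (4, 16 * 128) = (4, 2048)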
+ + + +from torch.autograd import Variable +import torch.nn as nn +import torch.nn.functional as F +import torch as th +import math + + +class NetVLAD(nn.Module): + def __init__(self, cluster_size, feature_size, add_batch_norm=True): + super(NetVLAD, self).__init__() + self.feature_size = feature_size + self.cluster_size = cluster_size + self.clusters = nn.Parameter((1/math.sqrt(feature_size)) + *th.randn(feature_size, cluster_size)) + self.clusters2 = nn.Parameter((1/math.sqrt(feature_size)) + *th.randn(1, feature_size, cluster_size)) + + self.add_batch_norm = add_batch_norm + self.batch_norm = nn.BatchNorm1d(cluster_size) + self.out_dim = cluster_size*feature_size + + def forward(self,x): + max_sample = x.size()[1] + x = x.view(-1,self.feature_size) + assignment = th.matmul(x,self.clusters) + + if self.add_batch_norm: + assignment = self.batch_norm(assignment) + + assignment = F.softmax(assignment,dim=1) + assignment = assignment.view(-1, max_sample, self.cluster_size) + + a_sum = th.sum(assignment,-2,keepdim=True) + a = a_sum*self.clusters2 + + assignment = assignment.transpose(1,2) + + x = x.view(-1, max_sample, self.feature_size) + vlad = th.matmul(assignment, x) + vlad = vlad.transpose(1,2) + vlad = vlad - a + + # L2 intra norm + vlad = F.normalize(vlad) + + # flattening + L2 norm + vlad = vlad.view(-1, self.cluster_size*self.feature_size) + vlad = F.normalize(vlad) + + return vlad + +class NetRVLAD(nn.Module): + def __init__(self, cluster_size, feature_size, add_batch_norm=True): + super(NetRVLAD, self).__init__() + self.feature_size = feature_size + self.cluster_size = cluster_size + self.clusters = nn.Parameter((1/math.sqrt(feature_size)) + *th.randn(feature_size, cluster_size)) + + self.add_batch_norm = add_batch_norm + self.batch_norm = nn.BatchNorm1d(cluster_size) + self.out_dim = cluster_size*feature_size + + def forward(self,x): + max_sample = x.size()[1] + x = x.view(-1,self.feature_size) + assignment = th.matmul(x,self.clusters) + + if self.add_batch_norm: + assignment = self.batch_norm(assignment) + + assignment = F.softmax(assignment, dim=1) + assignment = assignment.view(-1, max_sample, self.cluster_size) + + assignment = assignment.transpose(1,2) + + x = x.view(-1, max_sample, self.feature_size) + rvlad = th.matmul(assignment, x) + rvlad = rvlad.transpose(-1,1) + + # L2 intra norm + rvlad = F.normalize(rvlad) + + # flattening + L2 norm + rvlad = rvlad.view(-1, self.cluster_size*self.feature_size) + rvlad = F.normalize(rvlad) + + return rvlad + diff --git a/model.py b/model.py new file mode 100644 index 0000000..d401ee7 --- /dev/null +++ b/model.py @@ -0,0 +1,151 @@ +# Copyright 2018 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
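Before the model code that follows, a hedged end-to-end sketch of Net/MEE: each video modality goes through a gated embedding unit, captions are NetVLAD-pooled, and a softmax over the caption embedding weights the per-modality similarities. Modality dimensions mirror train.py; the batch size and random inputs are illustration only, and a GPU is assumed because MEE moves its availability mask to CUDA internally.

import numpy as np
import torch as th
from torch.autograd import Variable
from model import Net   # defined just below

video_modality_dim = {'face': (128, 128), 'audio': (128 * 16, 128),
                      'visual': (2048, 2048), 'motion': (1024, 1024)}
net = Net(video_modality_dim, 300, audio_cluster=16, text_cluster=32).cuda()

B = 8
text = Variable(th.randn(B, 30, 300).cuda())              # up to 30 word vectors per caption
video = {'visual': Variable(th.randn(B, 2048).cuda()),    # ResNet clip feature
         'motion': Variable(th.randn(B, 1024).cuda()),    # flow feature
         'face':   Variable(th.randn(B, 128).cuda()),     # face descriptor
         'audio':  Variable(th.randn(B, 20, 128).cuda())} # 20 audio frames
ind = {m: np.ones(B) for m in video_modality_dim}         # every modality marked available
conf_matrix = net(text, video, ind, True)                 # (B, B) text-video similarity matrix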
+ + +from torch.autograd import Variable +import torch.nn as nn +import torch.nn.functional as F +import torch as th +from loupe import NetVLAD +import numpy as np +from torch.autograd import Function + +class Net(nn.Module): + def __init__(self, video_modality_dim, text_dim, audio_cluster=8, text_cluster=32): + super(Net, self).__init__() + + self.audio_pooling = NetVLAD(feature_size=video_modality_dim['audio'][1], + cluster_size=audio_cluster) + self.text_pooling = NetVLAD(feature_size=text_dim, + cluster_size=text_cluster) + + self.mee = MEE(video_modality_dim, self.text_pooling.out_dim) + + def forward(self, text, video, ind, conf=True): + + aggregated_video = {} + + aggregated_video['audio'] = self.audio_pooling(video['audio']) + aggregated_video['face'] = video['face'] + aggregated_video['motion'] = video['motion'] + aggregated_video['visual'] = video['visual'] + + text = self.text_pooling(text) + + return self.mee(text, aggregated_video, ind, conf) + + +class MEE(nn.Module): + def __init__(self, video_modality_dim, text_dim): + super(MEE, self).__init__() + + m = video_modality_dim.keys() + + self.m = m + + self.video_GU = nn.ModuleList([Gated_Embedding_Unit(video_modality_dim[m[i]][0], + video_modality_dim[m[i]][1]) for i in range(len(m))]) + + self.text_GU = nn.ModuleList([Gated_Embedding_Unit(text_dim, + video_modality_dim[m[i]][1]) for i in range(len(m))]) + + self.moe_fc = nn.Linear(text_dim, len(video_modality_dim)) + + + def forward(self, text, video, ind, conf=True): + + text_embd = {} + + for i, l in enumerate(self.video_GU): + video[self.m[i]] = l(video[self.m[i]]) + + for i, l in enumerate(self.text_GU): + text_embd[self.m[i]] = l(text) + + + #MOE weights computation + normalization ------------ + moe_weights = self.moe_fc(text) + moe_weights = F.softmax(moe_weights, dim=1) + + available_m = np.zeros(moe_weights.size()) + + i = 0 + for m in video: + available_m[:,i] = ind[m] + i += 1 + + available_m = th.from_numpy(available_m).float() + available_m = Variable(available_m.cuda()) + + moe_weights = available_m*moe_weights + + norm_weights = th.sum(moe_weights, dim=1) + norm_weights = norm_weights.unsqueeze(1) + moe_weights = th.div(moe_weights, norm_weights) + + #MOE weights computation + normalization ------ DONE + + if conf: + conf_matrix = Variable(th.zeros(len(text),len(text)).cuda()) + i = 0 + for m in video: + video[m] = video[m].transpose(0,1) + conf_matrix += moe_weights[:,i:i+1]*th.matmul(text_embd[m], video[m]) + i += 1 + + return conf_matrix + else: + i = 0 + scores = Variable(th.zeros(len(text)).cuda()) + for m in video: + text_embd[m] = moe_weights[:,i:i+1]*text_embd[m]*video[m] + scores += th.sum(text_embd[m], dim=-1) + i += 1 + + return scores + +class Gated_Embedding_Unit(nn.Module): + def __init__(self, input_dimension, output_dimension): + super(Gated_Embedding_Unit, self).__init__() + + self.fc = nn.Linear(input_dimension, output_dimension) + self.cg = Context_Gating(output_dimension) + + def forward(self,x): + + x = self.fc(x) + x = self.cg(x) + x = F.normalize(x) + + return x + + +class Context_Gating(nn.Module): + def __init__(self, dimension, add_batch_norm=True): + super(Context_Gating, self).__init__() + self.fc = nn.Linear(dimension, dimension) + self.add_batch_norm = add_batch_norm + self.batch_norm = nn.BatchNorm1d(dimension) + + def forward(self,x): + x1 = self.fc(x) + + if self.add_batch_norm: + x1 = self.batch_norm(x1) + + x = th.cat((x, x1), 1) + + return F.glu(x,1) + + diff --git a/qcm_sampler.py b/qcm_sampler.py new file mode 100644 index 
0000000..f3b176b --- /dev/null +++ b/qcm_sampler.py @@ -0,0 +1,36 @@ +# Copyright 2018 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + +import torch as th +from torch.utils.data.sampler import Sampler +import numpy as np + +class QCMSampler(Sampler): + + def __init__(self, n): + self.n = n + + def __iter__(self): + idx = np.arange(self.n) + idx = np.repeat(idx,5) + + idx2 = np.arange(self.n*5) + + return iter(zip(idx,idx2)) + + def __len__(self): + return self.n*5 diff --git a/train.py b/train.py new file mode 100644 index 0000000..850b5a5 --- /dev/null +++ b/train.py @@ -0,0 +1,376 @@ +# Copyright 2018 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + +import torch as th +from torch.utils.data import Dataset, DataLoader +import LSMDC as LD2 +import MSRVTT as MSR +import numpy as np +import torch.optim as optim +import argparse +from loss import MaxMarginRankingLoss +from model import Net +from torch.autograd import Variable +import os +import random +from qcm_sampler import QCMSampler +from MSR_sampler import MSRSampler + + + +parser = argparse.ArgumentParser(description='LSMDC2017') + +parser.add_argument('--coco', type=bool, default=False, + help='add coco dataset') + +parser.add_argument('--lr', type=float, default=0.0001, + help='initial learning rate') +parser.add_argument('--epochs', type=int, default=50, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=128, + help='batch size') +parser.add_argument('--text_cluster_size', type=int, default=32, + help='Text cluster size') +parser.add_argument('--margin', type=float, default=0.2, + help='MaxMargin margin value') +parser.add_argument('--lr_decay', type=float, default=0.95, + help='Learning rate exp epoch decay') +parser.add_argument('--n_display', type=int, default=100, + help='Information display frequence') +parser.add_argument('--GPU', type=bool, default=True, + help='Use of GPU') +parser.add_argument('--n_cpu', type=int, default=1, + help='Number of CPU') + +parser.add_argument('--model_name', type=str, default='test', + help='Model name') +parser.add_argument('--seed', type=int, default=1, + help='Initial Random Seed') + +parser.add_argument('--optimizer', type=str, default='adam', + help='optimizer') +parser.add_argument('--momentum', type=float, default=0.9, + help='Nesterov Momentum for SGD') + + +parser.add_argument('--eval_qcm', type=bool, default=False, + help='Eval or not QCM') + +parser.add_argument('--eval_coco', 
type=bool, default=False, + help='Eval or not coco') + +parser.add_argument('--MSRVTT', type=bool, default=False, + help='MSRVTT') + +parser.add_argument('--coco_sampling_rate', type=float, default=1.0, + help='coco sampling rate') + + +args = parser.parse_args() + +print args + +root_feat = 'data' + +mp_visual_path = os.path.join(root_feat,'X_resnet.npy') +mp_flow_path = os.path.join(root_feat,'X_flow.npy') +mp_face_path = os.path.join(root_feat,'X_face.npy') + +def verbose(epoch, status, metrics, name='TEST'): + print(name+' - epoch: %d, epoch status: %.2f, r@1: %.3f, r@5: %.3f, r@10: %.3f, mr: %d' % + (epoch + 1, status, + metrics['R1'], metrics['R5'], metrics['R10'], + metrics['MR'])) + + +def compute_metric(x): + sx = np.sort(-x, axis=1) + d = np.diag(-x) + d = d[:,np.newaxis] + ind = sx - d + ind = np.where(ind == 0) + ind = ind[1] + + metrics = {} + metrics['R1'] = float(np.sum(ind == 0))/len(ind) + metrics['R5'] = float(np.sum(ind < 5))/len(ind) + metrics['R10'] = float(np.sum(ind < 10))/len(ind) + metrics['MR'] = np.median(ind) + 1 + + return metrics + +def make_tensor(l, max_len): + tensor = np.zeros((len(l),max_len,l[0].shape[-1])) + for i in range(len(l)): + if len(l[i]): + tensor[i,:min(max_len,l[i].shape[0]),:] = l[i][:min(max_len,l[i].shape[0])] + + return th.from_numpy(tensor).float() + +# predefining random initial seeds +th.manual_seed(args.seed) +np.random.seed(args.seed) +random.seed(args.seed) + +if args.eval_qcm and not(args.MSRVTT): + qcm_dataset = LD2.LSMDC_qcm(os.path.join(root_feat,'resnet-qcm.npy'), + os.path.join(root_feat,'w2v_LSMDC_qcm.npy'), os.path.join(root_feat,'X_audio_test.npy'), + os.path.join(root_feat,'flow-qcm.npy'), + os.path.join(root_feat,'face-qcm.npy')) + + qcm_sampler = QCMSampler(len(qcm_dataset)) + qcm_dataloader = DataLoader(qcm_dataset, batch_size=500, sampler=qcm_sampler, num_workers=1) + qcm_gt_fn = os.path.join(root_feat,'multiple_choice_gt.txt') + qcm_gt = [line.rstrip('\n') for line in open(qcm_gt_fn)] + qcm_gt = np.array(map(int,qcm_gt)) + +print 'Pre-loading features ... This may takes several minutes ...' + +if args.MSRVTT: + visual_feat_path = os.path.join(root_feat,'resnet_features.pickle') + flow_feat_path = os.path.join(root_feat,'flow_features.pickle') + text_feat_path = os.path.join(root_feat,'w2v_MSRVTT.pickle') + audio_feat_path = os.path.join(root_feat,'audio_features.pickle') + face_feat_path = os.path.join(root_feat,'face_features.pickle') + train_list_path = os.path.join(root_feat,'train_list.txt') + test_list_path = os.path.join(root_feat,'test_list.txt') + + dataset = MSR.MSRVTT(visual_feat_path, flow_feat_path, text_feat_path, + audio_feat_path, face_feat_path, train_list_path,test_list_path, coco=args.coco) + msr_sampler = MSRSampler(dataset.n_MSR,dataset.n_coco,args.coco_sampling_rate) + + if args.coco: + dataloader = DataLoader(dataset, batch_size=args.batch_size, + sampler=msr_sampler, num_workers=1,collate_fn=dataset.collate_data, drop_last=True) + else: + dataloader = DataLoader(dataset, batch_size=args.batch_size, + shuffle=True, num_workers=1,collate_fn=dataset.collate_data, drop_last=True) + +else: + path_to_text = os.path.join(root_feat,'w2v_LSMDC.npy') + path_to_audio = os.path.join(root_feat,'X_audio_train.npy') + + dataset = LD2.LSMDC(mp_visual_path, path_to_text, + path_to_audio, mp_flow_path, mp_face_path, coco=args.coco) + dataloader = DataLoader(dataset, batch_size=args.batch_size, + shuffle=True, num_workers=1, drop_last=True) + print 'Done.' + + print 'Reading test data ...' 
+ resnet_features_path = os.path.join(root_feat,'resnet152-retrieval.npy.tensor.npy') + flow_features_path = os.path.join(root_feat,'flow-retrieval.npy.tensor.npy') + face_features_path = os.path.join(root_feat,'face-retrieval.npy.tensor.npy') + text_features_path = os.path.join(root_feat,'w2v_LSMDC_retrieval.npy') + audio_features_path = os.path.join(root_feat,'X_audio_retrieval.npy.tensor.npy') + + vid_retrieval = np.load(resnet_features_path) + flow_retrieval = np.load(flow_features_path) + face_retrieval = np.load(face_features_path) + text_retrieval = np.load(text_features_path) + audio_retrieval = np.load(audio_features_path) + + mm = max(map(len,text_retrieval)) + + text_retrieval = make_tensor(text_retrieval,mm) + + vid_retrieval = th.from_numpy(vid_retrieval).float() + flow_retrieval = th.from_numpy(flow_retrieval).float() + face_retrieval = th.from_numpy(face_retrieval).float() + audio_retrieval = th.from_numpy(audio_retrieval).float() + + text_retrieval_val = text_retrieval + vid_retrieval_val = vid_retrieval + flow_retrieval_val = flow_retrieval + face_retrieval_val = face_retrieval + audio_retrieval_val = audio_retrieval + + + face_ind_test = np.load(os.path.join(root_feat,'no_face_ind_retrieval.npy')) + face_ind_test = 1 - face_ind_test +print 'Done.' + +# Model +video_modality_dim = {'face': (128,128), 'audio': (128*16,128), +'visual': (2048,2048), 'motion': (1024,1024)} +net = Net(video_modality_dim,300, + audio_cluster=16,text_cluster=args.text_cluster_size) +net.train() + +if args.GPU: + net.cuda() + +# Optimizers + Loss +max_margin = MaxMarginRankingLoss(margin=args.margin) + + +if args.optimizer == 'adam': + optimizer = optim.Adam(net.parameters(), lr=args.lr) +elif args.optimizer == 'sgd': + optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum) + +if args.GPU: + max_margin.cuda() + +n_display = args.n_display +dataset_size = len(dataset) +lr_decay = args.lr_decay + +print 'Starting training loop ...' 
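For reference before the training loop below: the per-sample availability dictionary ind built there tells MEE which modalities actually exist (no face track, or a COCO image with neither motion nor audio), and MEE zeroes and renormalizes the corresponding mixture weights. A made-up illustration of that renormalization:

import numpy as np

# Toy view of the masking done inside MEE (model.py); the weights and the
# (face, visual, motion, audio) ordering are illustrative only.
moe_weights = np.array([[0.4, 0.3, 0.2, 0.1]])   # softmax output for one sample
available   = np.array([[0.0, 1.0, 1.0, 1.0]])   # e.g. no face detected in the clip
masked = moe_weights * available
masked = masked / masked.sum(axis=1, keepdims=True)
# masked -> [[0.0, 0.5, 0.333, 0.167]]: the face expert is ignored and the
# remaining experts are rescaled to sum to one.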
+print 'Parameters: lr: %f, epochs: %d, seed: %d, batch_size: %d'%(args.lr, + args.epochs, th.initial_seed(), args.batch_size) + +for epoch in range(args.epochs): + running_loss = 0.0 + print 'epoch: %d'%epoch + + for i_batch, sample_batched in enumerate(dataloader): + + if args.MSRVTT: + captions = sample_batched['text'] + audio = sample_batched['audio'] + else: + captions = dataset.shorteningTextTensor(sample_batched['text'], + sample_batched['text_size']) + + audio = dataset.shorteningTextTensor(sample_batched['audio'], + sample_batched['audio_size']) + + + face = sample_batched['face'] + video = sample_batched['video'] + flow = sample_batched['flow'] + coco_ind = sample_batched['coco_ind'] + face_ind = sample_batched['face_ind'] + + ind = {} + ind['face'] = face_ind + ind['visual'] = np.ones((len(face_ind))) + ind['motion'] = 1 - coco_ind + ind['audio'] = 1 - coco_ind + + if args.GPU: + captions, video = Variable(captions.cuda()), Variable(video.cuda()) + audio, flow = Variable(audio.cuda()), Variable(flow.cuda()) + face = Variable(face.cuda()) + + + optimizer.zero_grad() + confusion_matrix = net(captions, + {'face': face, 'audio': audio, 'visual': video, 'motion': flow}, ind, True) + loss = max_margin(confusion_matrix) + loss.backward() + + optimizer.step() + running_loss += loss.data[0] + + if (i_batch+1) % n_display == 0: + print 'Epoch %d, Epoch status: %.2f, Training loss: %.4f'%(epoch + 1, + args.batch_size*float(i_batch)/dataset_size,running_loss/n_display) + running_loss = 0.0 + + print 'evaluating epoch %d ...'%(epoch+1) + net.eval() + + if args.MSRVTT: + retrieval_samples = dataset.getRetrievalSamples() + + video = Variable(retrieval_samples['video'].cuda(), volatile=True) + captions = Variable(retrieval_samples['text'].cuda(), volatile=True) + audio = Variable(retrieval_samples['audio'].cuda(), volatile=True) + flow = Variable(retrieval_samples['flow'].cuda(), volatile=True) + face = Variable(retrieval_samples['face'].cuda(), volatile=True) + face_ind = retrieval_samples['face_ind'] + + ind = {} + ind['face'] = face_ind + ind['visual'] = np.ones((len(face_ind))) + ind['motion'] = np.ones((len(face_ind))) + ind['audio'] = np.ones((len(face_ind))) + + conf = net(captions, + {'face': face, 'audio': audio, 'visual': video, 'motion': flow}, ind, True) + confusion_matrix = conf.data.cpu().float().numpy() + metrics = compute_metric(confusion_matrix) + verbose(epoch, args.batch_size*float(i_batch)/dataset_size, metrics, name='MSRVTT') + + else: + video = Variable(vid_retrieval_val.cuda(), volatile=True) + captions = Variable(text_retrieval_val.cuda(), volatile=True) + audio = Variable(audio_retrieval_val.cuda(), volatile=True) + flow = Variable(flow_retrieval_val.cuda(), volatile=True) + face = Variable(face_retrieval_val.cuda(), volatile=True) + + ind = {} + ind['face'] = face_ind_test + ind['visual'] = np.ones((len(face_ind_test))) + ind['motion'] = np.ones((len(face_ind_test))) + ind['audio'] = np.ones((len(face_ind_test))) + + conf = net(captions, + {'face': face, 'audio': audio, 'visual': video, 'motion': flow}, ind, True) + confusion_matrix = conf.data.cpu().float().numpy() + metrics = compute_metric(confusion_matrix) + verbose(epoch, args.batch_size*float(i_batch)/dataset_size, metrics, name='MPII') + + net.train() + + if args.eval_qcm: + print 'LSMDC Multiple-Choice evaluation (accuracy)' + net.eval() + scores = [] + + for i_batch, sample_batched in enumerate(qcm_dataloader): + captions = sample_batched['text'] + + audio = 
qcm_dataset.shorteningTextTensor(sample_batched['audio'], + sample_batched['audio_size']) + + video = sample_batched['video'] + flow = sample_batched['flow'] + face = sample_batched['face'] + face_ind = sample_batched['face_ind'] + + ind = {} + ind['face'] = face_ind + ind['visual'] = np.ones((len(face_ind))) + ind['motion'] = np.ones((len(face_ind))) + ind['audio'] = np.ones((len(face_ind))) + + if args.GPU: + captions, video = Variable(captions.cuda(), volatile=True), Variable(video.cuda(), volatile=True) + audio, flow = Variable(audio.cuda(), volatile=True), Variable(flow.cuda(), volatile=True) + face = Variable(face.cuda(), volatile=True) + + s = net(captions, {'face': face, 'audio': audio, 'visual': video, 'motion': flow}, ind, False) + s = s.data.cpu().float().numpy() + scores.extend(s) + + scores = np.array(scores) + scores = np.reshape(scores, (len(qcm_dataset),5)) + pred = np.argmax(scores, axis=1)+1 + + accuracy_qcm = sum(pred == qcm_gt)/float(len(pred)) + print 'Accuracy Multiple-Choice: %.3f'%accuracy_qcm + + net.train() + + for param_group in optimizer.param_groups: + param_group['lr'] *= lr_decay + + + + +
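To close, a toy sanity check that mirrors compute_metric() from train.py: the diagonal of the similarity matrix holds the true text-video pairs, and the rank of each diagonal entry within its row gives recall@k and the median rank. The matrix below is made up; importing train.py directly would kick off a full training run, so the few relevant lines are restated instead.

import numpy as np

x = np.array([[0.9, 0.2, 0.1],      # hand-made text-video similarities
              [0.3, 0.1, 0.6],
              [0.2, 0.8, 0.4]])

sx = np.sort(-x, axis=1)            # row-wise scores, best first (negated)
d = np.diag(-x)[:, np.newaxis]      # negated score of the correct pair
ranks = np.where(sx - d == 0)[1]    # rank of the correct pair in each row

print('R@1  = %.3f' % (float(np.sum(ranks == 0)) / len(ranks)))   # 0.333
print('R@5  = %.3f' % (float(np.sum(ranks < 5)) / len(ranks)))    # 1.000
print('R@10 = %.3f' % (float(np.sum(ranks < 10)) / len(ranks)))   # 1.000
print('MedR = %d' % (np.median(ranks) + 1))                       # 2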