From b342e5477fc5bbf82c349461bfa9a3288098ba0a Mon Sep 17 00:00:00 2001 From: "Antoine Miech \"WILLOW" Date: Wed, 4 Apr 2018 17:27:50 +0200 Subject: [PATCH] first code commit --- LICENSE | 202 ++++++++++++++++++++++++++ LSMDC.py | 278 ++++++++++++++++++++++++++++++++++++ MSRVTT.py | 190 +++++++++++++++++++++++++ MSR_sampler.py | 41 ++++++ loss.py | 63 +++++++++ loupe.py | 105 ++++++++++++++ model.py | 151 ++++++++++++++++++++ qcm_sampler.py | 36 +++++ train.py | 376 +++++++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 1442 insertions(+) create mode 100644 LICENSE create mode 100644 LSMDC.py create mode 100644 MSRVTT.py create mode 100644 MSR_sampler.py create mode 100644 loss.py create mode 100644 loupe.py create mode 100644 model.py create mode 100644 qcm_sampler.py create mode 100644 train.py diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..2325b40 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2018 Antoine Miech + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/LSMDC.py b/LSMDC.py new file mode 100644 index 0000000..2523ffa --- /dev/null +++ b/LSMDC.py @@ -0,0 +1,278 @@ +# Copyright 2018 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
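Before the LSMDC dataset class that follows, a minimal usage sketch. The feature paths mirror the defaults that train.py later points at under data/; they are not shipped with this commit, so treat them as placeholders.

from torch.utils.data import DataLoader
import LSMDC as LD2

# Hedged usage sketch for the LSMDC Dataset defined below; the .npy paths
# follow train.py's defaults and are assumed to exist on disk.
dataset = LD2.LSMDC(clip_path='data/X_resnet.npy',
                    text_features='data/w2v_LSMDC.npy',
                    audio_features='data/X_audio_train.npy',
                    flow_path='data/X_flow.npy',
                    face_path='data/X_face.npy',
                    coco=False)               # skip the COCO photo-caption augmentation

loader = DataLoader(dataset, batch_size=128, shuffle=True, drop_last=True)
batch = next(iter(loader))
# batch is a dict with 'video', 'flow', 'face', 'audio', 'text', 'audio_size',
# 'text_size', 'coco_ind' and 'face_ind' entries, one row per clip.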
+ + + +import torch as th +from torch.utils.data import Dataset +import numpy as np +import os +import math +import random + +class LSMDC(Dataset): + """LSMDC dataset.""" + + def __init__(self, clip_path, text_features, audio_features, flow_path, face_path, coco_visual_path='../X_train2014_resnet152.npy' ,coco_text_path='../w2v_coco_train2014_1.npy', coco=True, max_words=30, video_features_size=2048, text_features_size=300, audio_features_size=128, face_features_size=128, flow_features_size=1024,verbose=False): + """ + Args: + """ + + self.visual_features = np.load(clip_path) + self.flow_features = np.load(flow_path) + self.face_features = np.load(face_path) + self.audio_features = np.load(audio_features) + self.text_features = np.load(text_features) + + + audio_sizes = map(len,self.audio_features) + self.audio_sizes = np.array(audio_sizes) + + self.video_features_size = video_features_size + self.text_features_size = text_features_size + self.audio_features_size = audio_features_size + self.flow_features_size = flow_features_size + self.face_features_size = face_features_size + + self.max_len_text = max_words + + text_sizes = map(len,self.text_features) + self.text_sizes = np.array(text_sizes) + self.text_sizes = self.text_sizes.astype(int) + + mask = self.text_sizes > 0 + + self.text_features = self.text_features[mask] + self.text_sizes = self.text_sizes[mask] + self.visual_features = self.visual_features[mask] + self.flow_features = self.flow_features[mask] + self.face_features = self.face_features[mask] + self.audio_features = self.audio_features[mask] + self.audio_sizes = self.audio_sizes[mask] + self.audio_sizes.astype(int) + + self.max_len_audio = max(self.audio_sizes) + + audio_tensors = np.zeros((len(self.audio_features), + max(self.audio_sizes), self.audio_features[0].shape[1])) + + for j in range(len(self.audio_features)): + audio_tensors[j,0:self.audio_sizes[j],:] = self.audio_features[j] + + + if coco: + # adding coco data + coco_visual = np.load(coco_visual_path) + coco_text = np.load(coco_text_path) + + + self.n_lsmdc = len(self.visual_features) + self.n_coco = len(coco_visual) + + self.visual_features = np.concatenate((self.visual_features, coco_visual), axis=0) + self.text_features = np.concatenate((self.text_features, coco_text), axis=0) + + text_sizes = map(len,self.text_features) + self.text_sizes = np.array(text_sizes) + self.text_sizes = self.text_sizes.astype(int) + self.coco_ind = np.zeros((self.n_lsmdc+self.n_coco)) + self.coco_ind[self.n_lsmdc:] = 1 + else: + self.n_lsmdc = len(self.visual_features) + self.coco_ind = np.zeros((self.n_lsmdc)) + + + text_tensors = np.zeros((len(self.text_features), + max_words, self.text_features[0].shape[1])) + + + for j in range(len(self.text_features)): + if self.text_sizes[j] > max_words: + text_tensors[j] = self.text_features[j][0:max_words,:] + else: + text_tensors[j,0:self.text_sizes[j],:] = self.text_features[j] + + self.text_features = th.from_numpy(text_tensors) + self.text_features = self.text_features.float() + + self.audio_features = th.from_numpy(audio_tensors) + self.audio_features = self.audio_features.float() + + self.flow_features = th.from_numpy(self.flow_features) + self.flow_features = self.flow_features.float() + + self.visual_features = th.from_numpy(self.visual_features) + self.visual_features = self.visual_features.float() + + self.face_features = th.from_numpy(self.face_features) + self.face_features = self.face_features.float() + + def __len__(self): + return len(self.text_features) + + def 
__getitem__(self, idx): + + face_ind = 1 + + if idx >= self.n_lsmdc: + flow = th.zeros(self.flow_features_size) + face = th.zeros(self.face_features_size) + audio = th.zeros(self.audio_features.size()[1],self.audio_features_size) + audio_size = 1 + face_ind = 0 + else: + flow = self.flow_features[idx] + face = self.face_features[idx] + audio = self.audio_features[idx] + audio_size = self.audio_sizes[idx] + + if th.sum(face) == 0: + face_ind = 0 + return {'video': self.visual_features[idx], + 'flow': flow, + 'face': face, + 'text': self.text_features[idx], + 'audio': audio, + 'audio_size': audio_size, + 'coco_ind': self.coco_ind[idx], + 'face_ind': face_ind, + 'text_size': self.text_sizes[idx] + } + + + def getVideoFeatureSize(self): + return self.video_features_size + def getTextFeatureSize(self): + return self.text_features_size + def getAudioFeatureSize(self): + return self.audio_features_size + def getFlowFeatureSize(self): + return self.flow_features_size + def getText(self): + return self.text_features + + + def shorteningTextTensor(self,text_features, text_sizes): + m = int(max(text_sizes)) + return text_features[:,0:m,:] + +class LSMDC_qcm(Dataset): + """LSMDC dataset.""" + + def __init__(self, clip_path, text_features, audio_features, flow_path, face_path, max_words=30, video_features_size=2048, text_features_size=300, audio_features_size=128, face_features_size=128, flow_features_size=1024): + """ + Args: + """ + self.visual_features = np.load(clip_path) + self.flow_features = np.load(flow_path) + self.face_features = np.load(face_path) + self.audio_features = np.load(audio_features) + self.text_features = np.load(text_features) + print 'features loaded' + + audio_sizes = map(len,self.audio_features) + self.audio_sizes = np.array(audio_sizes) + + self.video_features_size = video_features_size + self.text_features_size = text_features_size + self.audio_features_size = audio_features_size + self.flow_features_size = flow_features_size + self.face_features_size = face_features_size + + self.max_len_text = max_words + + text_sizes = map(len,self.text_features) + self.text_sizes = np.array(text_sizes) + self.text_sizes = self.text_sizes.astype(int) + + + self.max_len_audio = max(self.audio_sizes) + + + audio_tensors = np.zeros((len(self.audio_features), + max(self.audio_sizes), self.audio_features[0].shape[1])) + + for j in range(len(self.audio_features)): + audio_tensors[j,0:self.audio_sizes[j],:] = self.audio_features[j] + + text_tensors = np.zeros((len(self.text_features), + max_words, self.text_features[0].shape[1])) + + + for j in range(len(self.text_features)): + if self.text_sizes[j] > max_words: + text_tensors[j] = self.text_features[j][0:max_words,:] + else: + text_tensors[j,0:self.text_sizes[j],:] = self.text_features[j] + + self.text_features = th.from_numpy(text_tensors) + self.text_features = self.text_features.float() + + self.audio_features = th.from_numpy(audio_tensors) + self.audio_features = self.audio_features.float() + + self.flow_features = th.from_numpy(self.flow_features) + self.flow_features = self.flow_features.float() + + self.visual_features = th.from_numpy(self.visual_features) + self.visual_features = self.visual_features.float() + + self.face_features = th.from_numpy(self.face_features) + self.face_features = self.face_features.float() + + + def __len__(self): + return len(self.visual_features) + + + + def __getitem__(self, tidx): + + idx, idx2 = tidx + + face_ind = 1 + + flow = self.flow_features[idx] + face = self.face_features[idx] + audio = 
self.audio_features[idx] + audio_size = self.audio_sizes[idx] + + if th.sum(face) == 0: + face_ind = 0 + + return {'video': self.visual_features[idx], + 'flow': flow, + 'face': face, + 'text': self.text_features[idx2], + 'audio': audio, + 'face_ind': face_ind, + 'audio_size': audio_size, + 'text_size': self.text_sizes[idx2] + } + + + def getVideoFeatureSize(self): + return self.video_features_size + def getTextFeatureSize(self): + return self.text_features_size + def getAudioFeatureSize(self): + return self.audio_features_size + def getFlowFeatureSize(self): + return self.flow_features_size + + + def shorteningTextTensor(self,text_features, text_sizes): + m = int(max(text_sizes)) + return text_features[:,0:m,:] + diff --git a/MSRVTT.py b/MSRVTT.py new file mode 100644 index 0000000..c51abfd --- /dev/null +++ b/MSRVTT.py @@ -0,0 +1,190 @@ +# Copyright 2018 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + +import torch as th +from torch.utils.data import Dataset +import numpy as np +import os +import math +import random +import pickle + +class MSRVTT(Dataset): + """LSMDC dataset.""" + + def __init__(self, visual_features, flow_features, text_features, audio_features, face_features, train_list, test_list, coco_visual_path='../X_train2014_resnet152.npy' ,coco_text_path='../w2v_coco_train2014_1.npy',coco=True, max_words=30,verbose=False): + """ + Args: + """ + self.max_words = max_words + print 'loading data ...' 
+ + with open(train_list) as f: + self.train_list = f.readlines() + + self.train_list = [x.strip() for x in self.train_list] + + with open(test_list) as f: + self.test_list = f.readlines() + + self.test_list = [x.strip() for x in self.test_list] + + + pickle_in = open(visual_features,'rb') + self.visual_features = pickle.load(pickle_in) + + pickle_in = open(flow_features,'rb') + self.flow_features = pickle.load(pickle_in) + + pickle_in = open(audio_features,'rb') + self.audio_features = pickle.load(pickle_in) + + pickle_in = open(text_features,'rb') + self.text_features = pickle.load(pickle_in) + + pickle_in = open(face_features,'rb') + self.face_features = pickle.load(pickle_in) + + self.coco = coco + + if coco: + # adding coco data + self.coco_visual = np.load(coco_visual_path) + self.coco_text = np.load(coco_text_path) + + self.n_MSR = len(self.train_list) + self.n_coco = len(self.coco_visual) + + self.coco_ind = np.zeros((self.n_MSR+self.n_coco)) + self.coco_ind[self.n_MSR:] = 1 + + else: + self.n_MSR = len(self.train_list) + self.coco_ind = np.zeros((self.n_MSR)) + + + # computing retrieval + + self.video_retrieval = np.zeros((len(self.test_list),2048)) + self.flow_retrieval = np.zeros((len(self.test_list),1024)) + self.audio_retrieval = np.zeros((len(self.test_list), max_words, 128)) + self.face_retrieval = np.zeros((len(self.test_list), 128)) + self.text_retrieval = np.zeros((len(self.test_list), max_words, 300)) + self.face_ind_retrieval = np.ones((len(self.test_list))) + + for i in range(len(self.test_list)): + self.video_retrieval[i] = self.visual_features[self.test_list[i]] + self.flow_retrieval[i] = self.flow_features[self.test_list[i]] + + if len(self.face_features[self.test_list[i]]) > 0: + self.face_retrieval[i] = self.face_features[self.test_list[i]] + + if np.sum(self.face_retrieval[i]) == 0: + self.face_ind_retrieval[i] = 0 + + la = len(self.audio_features[self.test_list[i]]) + self.audio_retrieval[i,:min(max_words,la),:] = self.audio_features[self.test_list[i]][:min(max_words,la)] + + lt = len(self.text_features[self.test_list[i]][0]) + self.text_retrieval[i,:min(max_words,lt),:] = self.text_features[self.test_list[i]][0][:min(max_words,lt)] + + + self.video_retrieval = th.from_numpy(self.video_retrieval).float() + self.flow_retrieval = th.from_numpy(self.flow_retrieval).float() + self.audio_retrieval = th.from_numpy(self.audio_retrieval).float() + self.face_retrieval = th.from_numpy(self.face_retrieval).float() + self.text_retrieval = th.from_numpy(self.text_retrieval).float() + + print 'done' + + def collate_data(self, data): + video_tensor = np.zeros((len(data), 2048)) + flow_tensor = np.zeros((len(data), 1024)) + face_tensor = np.zeros((len(data), 128)) + audio_tensor = np.zeros((len(data), self.max_words,128)) + text_tensor = np.zeros((len(data), self.max_words, 300)) + coco_ind = np.zeros((len(data))) + face_ind = np.zeros((len(data))) + + for i in range(len(data)): + + coco_ind[i] = data[i]['coco_ind'] + face_ind[i] = data[i]['face_ind'] + video_tensor[i] = data[i]['video'] + flow_tensor[i] = data[i]['flow'] + + if len(data[i]['face']) > 0: + face_tensor[i] = data[i]['face'] + + la = len(data[i]['audio']) + audio_tensor[i,:min(la,self.max_words), :] = data[i]['audio'][:min(self.max_words,la)] + + lt = len(data[i]['text']) + text_tensor[i,:min(lt,self.max_words), :] = data[i]['text'][:min(self.max_words,lt)] + + + return {'video': th.from_numpy(video_tensor).float(), + 'flow': th.from_numpy(flow_tensor).float(), + 'face': th.from_numpy(face_tensor).float(), + 
'coco_ind': coco_ind, + 'face_ind': face_ind, + 'text': th.from_numpy(text_tensor).float(), + 'audio': th.from_numpy(audio_tensor).float()} + + + def __len__(self): + return len(self.coco_ind) + + def __getitem__(self, idx): + + face_ind = 1 + if idx < self.n_MSR: + vid = self.train_list[idx] + text = self.text_features[vid] + r = random.randint(0, len(text)-1) + text = text[r] + flow = self.flow_features[vid] + audio = self.audio_features[vid] + video = self.visual_features[vid] + face = self.face_features[vid] + + if np.sum(face) == 0: + face_ind = 0 + elif self.coco: + video = self.coco_visual[idx-self.n_MSR] + text = self.coco_text[idx-self.n_MSR] + audio = th.zeros(1,128) + flow = th.zeros(1024) + face = th.zeros(128) + face_ind = 0 + + return {'video': video, + 'flow': flow, + 'face': face, + 'text': text, + 'coco_ind': self.coco_ind[idx], + 'face_ind': face_ind, + 'audio': audio + } + + def getRetrievalSamples(self): + return {'video': self.video_retrieval, + 'flow': self.flow_retrieval, + 'text': self.text_retrieval, + 'face': self.face_retrieval, + 'face_ind': self.face_ind_retrieval, + 'audio': self.audio_retrieval} + diff --git a/MSR_sampler.py b/MSR_sampler.py new file mode 100644 index 0000000..eb6c285 --- /dev/null +++ b/MSR_sampler.py @@ -0,0 +1,41 @@ +# Copyright 2018 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + +import torch as th +from torch.utils.data.sampler import Sampler +import numpy as np + +class MSRSampler(Sampler): + + def __init__(self, n_MSR, n_COCO, sampling_rate): + self.n_MSR = n_MSR + self.n_COCO = n_COCO + self.sampling_rate = sampling_rate + + def __iter__(self): + idx_MSR = np.arange(self.n_MSR) + idx_coco = np.arange(self.n_MSR,self.n_MSR+self.n_COCO) + + np.random.shuffle(idx_coco) + idx_coco = idx_coco[:min(self.n_COCO,int(self.sampling_rate*self.n_MSR))] + + idx = np.concatenate((idx_MSR,idx_coco), axis=0) + np.random.shuffle(idx) + + return iter(idx) + + def __len__(self): + return self.n_MSR+int(self.sampling_rate*self.n_COCO) diff --git a/loss.py b/loss.py new file mode 100644 index 0000000..2189a0a --- /dev/null +++ b/loss.py @@ -0,0 +1,63 @@ +# Copyright 2018 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
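Before the ranking loss defined below, a toy numeric sketch of what it computes: the diagonal of the text-video similarity matrix holds the true pairs, and every mismatched pair sharing a row or a column is pushed at least margin below it. The similarity values here are made up.

import torch as th

# Mirrors MaxMarginRankingLoss (below, with margin=0.2) on a hand-made 3x3 matrix.
margin = 0.2
x = th.FloatTensor([[0.9, 0.1, 0.2],
                    [0.0, 0.8, 0.3],
                    [0.1, 0.4, 0.7]])           # x[i, j] = sim(text_i, video_j)

d = th.diag(x).view(-1, 1)                      # similarity of each true pair
row_loss = (margin - (d - x)).clamp(min=0)      # rank the true video above the others
col_loss = (margin - (d - x.t())).clamp(min=0)  # rank the true text above the others
loss = th.cat((row_loss, col_loss), 0).mean()   # the quantity MaxMarginRankingLoss returns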
+ + + +from torch.autograd import Variable +import torch.nn as nn +import torch.nn.functional as F +import torch as th + +class MaxMarginRankingLoss(nn.Module): + def __init__(self, margin=1): + super(MaxMarginRankingLoss, self).__init__() + self.loss = th.nn.MarginRankingLoss(margin) + self.margin = margin + + def forward(self,x): + n = x.size()[0] + #y = Variable(th.ones(2*n**2,1).cuda()) + + + x1 = th.diag(x) + x1 = x1.unsqueeze(1) + x1 = x1.expand(n, n) + x1 = x1.contiguous().view(-1,1) + x1 = th.cat((x1,x1),0) + + x2 = x.view(-1,1) + x3 = x.transpose(0,1).contiguous().view(-1,1) + + x2 = th.cat((x2,x3),0) + + max_margin = F.relu(self.margin - (x1 - x2)) + return max_margin.mean() + +class MaxMarginRankingLoss2(nn.Module): + def __init__(self, margin=1): + super(MaxMarginRankingLoss2, self).__init__() + self.margin = margin + + #x : BatchxL + def forward(self,x): + x1 = x[:,0] + x1 = x1.unsqueeze(1) + x1 = x1.expand(x.size()[0],x.size()[-1]-1) + + x2 = x[:,1:] + + max_margin = F.relu(self.margin - (x1 - x2)) + return max_margin.mean() + + diff --git a/loupe.py b/loupe.py new file mode 100644 index 0000000..54bd2d8 --- /dev/null +++ b/loupe.py @@ -0,0 +1,105 @@ +# Copyright 2018 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
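Before the pooling modules defined below, a shape-level sketch of NetVLAD: a variable-length set of D-dimensional descriptors is soft-assigned to K clusters, per-cluster residuals are aggregated, and the result is flattened into a K*D vector. Batch, frame and cluster sizes here are arbitrary illustration values.

import torch as th
from torch.autograd import Variable
from loupe import NetVLAD   # the module defined just below

pooling = NetVLAD(cluster_size=16, feature_size=128)   # K=16 clusters, D=128 dims
frames = Variable(th.randn(4, 10, 128))                # 4 clips, 10 descriptors each
pooled = pooling(frames)                               # shape (4, 16 * 128) = (4, 2048)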
+ + + +from torch.autograd import Variable +import torch.nn as nn +import torch.nn.functional as F +import torch as th +import math + + +class NetVLAD(nn.Module): + def __init__(self, cluster_size, feature_size, add_batch_norm=True): + super(NetVLAD, self).__init__() + self.feature_size = feature_size + self.cluster_size = cluster_size + self.clusters = nn.Parameter((1/math.sqrt(feature_size)) + *th.randn(feature_size, cluster_size)) + self.clusters2 = nn.Parameter((1/math.sqrt(feature_size)) + *th.randn(1, feature_size, cluster_size)) + + self.add_batch_norm = add_batch_norm + self.batch_norm = nn.BatchNorm1d(cluster_size) + self.out_dim = cluster_size*feature_size + + def forward(self,x): + max_sample = x.size()[1] + x = x.view(-1,self.feature_size) + assignment = th.matmul(x,self.clusters) + + if self.add_batch_norm: + assignment = self.batch_norm(assignment) + + assignment = F.softmax(assignment,dim=1) + assignment = assignment.view(-1, max_sample, self.cluster_size) + + a_sum = th.sum(assignment,-2,keepdim=True) + a = a_sum*self.clusters2 + + assignment = assignment.transpose(1,2) + + x = x.view(-1, max_sample, self.feature_size) + vlad = th.matmul(assignment, x) + vlad = vlad.transpose(1,2) + vlad = vlad - a + + # L2 intra norm + vlad = F.normalize(vlad) + + # flattening + L2 norm + vlad = vlad.view(-1, self.cluster_size*self.feature_size) + vlad = F.normalize(vlad) + + return vlad + +class NetRVLAD(nn.Module): + def __init__(self, cluster_size, feature_size, add_batch_norm=True): + super(NetRVLAD, self).__init__() + self.feature_size = feature_size + self.cluster_size = cluster_size + self.clusters = nn.Parameter((1/math.sqrt(feature_size)) + *th.randn(feature_size, cluster_size)) + + self.add_batch_norm = add_batch_norm + self.batch_norm = nn.BatchNorm1d(cluster_size) + self.out_dim = cluster_size*feature_size + + def forward(self,x): + max_sample = x.size()[1] + x = x.view(-1,self.feature_size) + assignment = th.matmul(x,self.clusters) + + if self.add_batch_norm: + assignment = self.batch_norm(assignment) + + assignment = F.softmax(assignment, dim=1) + assignment = assignment.view(-1, max_sample, self.cluster_size) + + assignment = assignment.transpose(1,2) + + x = x.view(-1, max_sample, self.feature_size) + rvlad = th.matmul(assignment, x) + rvlad = rvlad.transpose(-1,1) + + # L2 intra norm + rvlad = F.normalize(rvlad) + + # flattening + L2 norm + rvlad = rvlad.view(-1, self.cluster_size*self.feature_size) + rvlad = F.normalize(rvlad) + + return rvlad + diff --git a/model.py b/model.py new file mode 100644 index 0000000..d401ee7 --- /dev/null +++ b/model.py @@ -0,0 +1,151 @@ +# Copyright 2018 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
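Before the model code that follows, a hedged end-to-end sketch of Net/MEE: each video modality goes through a gated embedding unit, captions are NetVLAD-pooled, and a softmax over the caption embedding weights the per-modality similarities. Modality dimensions mirror train.py; the batch size and random inputs are illustration only, and a GPU is assumed because MEE moves its availability mask to CUDA internally.

import numpy as np
import torch as th
from torch.autograd import Variable
from model import Net   # defined just below

video_modality_dim = {'face': (128, 128), 'audio': (128 * 16, 128),
                      'visual': (2048, 2048), 'motion': (1024, 1024)}
net = Net(video_modality_dim, 300, audio_cluster=16, text_cluster=32).cuda()

B = 8
text = Variable(th.randn(B, 30, 300).cuda())              # up to 30 word vectors per caption
video = {'visual': Variable(th.randn(B, 2048).cuda()),    # ResNet clip feature
         'motion': Variable(th.randn(B, 1024).cuda()),    # flow feature
         'face':   Variable(th.randn(B, 128).cuda()),     # face descriptor
         'audio':  Variable(th.randn(B, 20, 128).cuda())} # 20 audio frames
ind = {m: np.ones(B) for m in video_modality_dim}         # every modality marked available
conf_matrix = net(text, video, ind, True)                 # (B, B) text-video similarity matrix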
+ + +from torch.autograd import Variable +import torch.nn as nn +import torch.nn.functional as F +import torch as th +from loupe import NetVLAD +import numpy as np +from torch.autograd import Function + +class Net(nn.Module): + def __init__(self, video_modality_dim, text_dim, audio_cluster=8, text_cluster=32): + super(Net, self).__init__() + + self.audio_pooling = NetVLAD(feature_size=video_modality_dim['audio'][1], + cluster_size=audio_cluster) + self.text_pooling = NetVLAD(feature_size=text_dim, + cluster_size=text_cluster) + + self.mee = MEE(video_modality_dim, self.text_pooling.out_dim) + + def forward(self, text, video, ind, conf=True): + + aggregated_video = {} + + aggregated_video['audio'] = self.audio_pooling(video['audio']) + aggregated_video['face'] = video['face'] + aggregated_video['motion'] = video['motion'] + aggregated_video['visual'] = video['visual'] + + text = self.text_pooling(text) + + return self.mee(text, aggregated_video, ind, conf) + + +class MEE(nn.Module): + def __init__(self, video_modality_dim, text_dim): + super(MEE, self).__init__() + + m = video_modality_dim.keys() + + self.m = m + + self.video_GU = nn.ModuleList([Gated_Embedding_Unit(video_modality_dim[m[i]][0], + video_modality_dim[m[i]][1]) for i in range(len(m))]) + + self.text_GU = nn.ModuleList([Gated_Embedding_Unit(text_dim, + video_modality_dim[m[i]][1]) for i in range(len(m))]) + + self.moe_fc = nn.Linear(text_dim, len(video_modality_dim)) + + + def forward(self, text, video, ind, conf=True): + + text_embd = {} + + for i, l in enumerate(self.video_GU): + video[self.m[i]] = l(video[self.m[i]]) + + for i, l in enumerate(self.text_GU): + text_embd[self.m[i]] = l(text) + + + #MOE weights computation + normalization ------------ + moe_weights = self.moe_fc(text) + moe_weights = F.softmax(moe_weights, dim=1) + + available_m = np.zeros(moe_weights.size()) + + i = 0 + for m in video: + available_m[:,i] = ind[m] + i += 1 + + available_m = th.from_numpy(available_m).float() + available_m = Variable(available_m.cuda()) + + moe_weights = available_m*moe_weights + + norm_weights = th.sum(moe_weights, dim=1) + norm_weights = norm_weights.unsqueeze(1) + moe_weights = th.div(moe_weights, norm_weights) + + #MOE weights computation + normalization ------ DONE + + if conf: + conf_matrix = Variable(th.zeros(len(text),len(text)).cuda()) + i = 0 + for m in video: + video[m] = video[m].transpose(0,1) + conf_matrix += moe_weights[:,i:i+1]*th.matmul(text_embd[m], video[m]) + i += 1 + + return conf_matrix + else: + i = 0 + scores = Variable(th.zeros(len(text)).cuda()) + for m in video: + text_embd[m] = moe_weights[:,i:i+1]*text_embd[m]*video[m] + scores += th.sum(text_embd[m], dim=-1) + i += 1 + + return scores + +class Gated_Embedding_Unit(nn.Module): + def __init__(self, input_dimension, output_dimension): + super(Gated_Embedding_Unit, self).__init__() + + self.fc = nn.Linear(input_dimension, output_dimension) + self.cg = Context_Gating(output_dimension) + + def forward(self,x): + + x = self.fc(x) + x = self.cg(x) + x = F.normalize(x) + + return x + + +class Context_Gating(nn.Module): + def __init__(self, dimension, add_batch_norm=True): + super(Context_Gating, self).__init__() + self.fc = nn.Linear(dimension, dimension) + self.add_batch_norm = add_batch_norm + self.batch_norm = nn.BatchNorm1d(dimension) + + def forward(self,x): + x1 = self.fc(x) + + if self.add_batch_norm: + x1 = self.batch_norm(x1) + + x = th.cat((x, x1), 1) + + return F.glu(x,1) + + diff --git a/qcm_sampler.py b/qcm_sampler.py new file mode 100644 index 
0000000..f3b176b --- /dev/null +++ b/qcm_sampler.py @@ -0,0 +1,36 @@ +# Copyright 2018 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + + +import torch as th +from torch.utils.data.sampler import Sampler +import numpy as np + +class QCMSampler(Sampler): + + def __init__(self, n): + self.n = n + + def __iter__(self): + idx = np.arange(self.n) + idx = np.repeat(idx,5) + + idx2 = np.arange(self.n*5) + + return iter(zip(idx,idx2)) + + def __len__(self): + return self.n*5 diff --git a/train.py b/train.py new file mode 100644 index 0000000..850b5a5 --- /dev/null +++ b/train.py @@ -0,0 +1,376 @@ +# Copyright 2018 Antoine Miech All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + + +import torch as th +from torch.utils.data import Dataset, DataLoader +import LSMDC as LD2 +import MSRVTT as MSR +import numpy as np +import torch.optim as optim +import argparse +from loss import MaxMarginRankingLoss +from model import Net +from torch.autograd import Variable +import os +import random +from qcm_sampler import QCMSampler +from MSR_sampler import MSRSampler + + + +parser = argparse.ArgumentParser(description='LSMDC2017') + +parser.add_argument('--coco', type=bool, default=False, + help='add coco dataset') + +parser.add_argument('--lr', type=float, default=0.0001, + help='initial learning rate') +parser.add_argument('--epochs', type=int, default=50, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=128, + help='batch size') +parser.add_argument('--text_cluster_size', type=int, default=32, + help='Text cluster size') +parser.add_argument('--margin', type=float, default=0.2, + help='MaxMargin margin value') +parser.add_argument('--lr_decay', type=float, default=0.95, + help='Learning rate exp epoch decay') +parser.add_argument('--n_display', type=int, default=100, + help='Information display frequence') +parser.add_argument('--GPU', type=bool, default=True, + help='Use of GPU') +parser.add_argument('--n_cpu', type=int, default=1, + help='Number of CPU') + +parser.add_argument('--model_name', type=str, default='test', + help='Model name') +parser.add_argument('--seed', type=int, default=1, + help='Initial Random Seed') + +parser.add_argument('--optimizer', type=str, default='adam', + help='optimizer') +parser.add_argument('--momentum', type=float, default=0.9, + help='Nesterov Momentum for SGD') + + +parser.add_argument('--eval_qcm', type=bool, default=False, + help='Eval or not QCM') + +parser.add_argument('--eval_coco', 
type=bool, default=False, + help='Eval or not coco') + +parser.add_argument('--MSRVTT', type=bool, default=False, + help='MSRVTT') + +parser.add_argument('--coco_sampling_rate', type=float, default=1.0, + help='coco sampling rate') + + +args = parser.parse_args() + +print args + +root_feat = 'data' + +mp_visual_path = os.path.join(root_feat,'X_resnet.npy') +mp_flow_path = os.path.join(root_feat,'X_flow.npy') +mp_face_path = os.path.join(root_feat,'X_face.npy') + +def verbose(epoch, status, metrics, name='TEST'): + print(name+' - epoch: %d, epoch status: %.2f, r@1: %.3f, r@5: %.3f, r@10: %.3f, mr: %d' % + (epoch + 1, status, + metrics['R1'], metrics['R5'], metrics['R10'], + metrics['MR'])) + + +def compute_metric(x): + sx = np.sort(-x, axis=1) + d = np.diag(-x) + d = d[:,np.newaxis] + ind = sx - d + ind = np.where(ind == 0) + ind = ind[1] + + metrics = {} + metrics['R1'] = float(np.sum(ind == 0))/len(ind) + metrics['R5'] = float(np.sum(ind < 5))/len(ind) + metrics['R10'] = float(np.sum(ind < 10))/len(ind) + metrics['MR'] = np.median(ind) + 1 + + return metrics + +def make_tensor(l, max_len): + tensor = np.zeros((len(l),max_len,l[0].shape[-1])) + for i in range(len(l)): + if len(l[i]): + tensor[i,:min(max_len,l[i].shape[0]),:] = l[i][:min(max_len,l[i].shape[0])] + + return th.from_numpy(tensor).float() + +# predefining random initial seeds +th.manual_seed(args.seed) +np.random.seed(args.seed) +random.seed(args.seed) + +if args.eval_qcm and not(args.MSRVTT): + qcm_dataset = LD2.LSMDC_qcm(os.path.join(root_feat,'resnet-qcm.npy'), + os.path.join(root_feat,'w2v_LSMDC_qcm.npy'), os.path.join(root_feat,'X_audio_test.npy'), + os.path.join(root_feat,'flow-qcm.npy'), + os.path.join(root_feat,'face-qcm.npy')) + + qcm_sampler = QCMSampler(len(qcm_dataset)) + qcm_dataloader = DataLoader(qcm_dataset, batch_size=500, sampler=qcm_sampler, num_workers=1) + qcm_gt_fn = os.path.join(root_feat,'multiple_choice_gt.txt') + qcm_gt = [line.rstrip('\n') for line in open(qcm_gt_fn)] + qcm_gt = np.array(map(int,qcm_gt)) + +print 'Pre-loading features ... This may takes several minutes ...' + +if args.MSRVTT: + visual_feat_path = os.path.join(root_feat,'resnet_features.pickle') + flow_feat_path = os.path.join(root_feat,'flow_features.pickle') + text_feat_path = os.path.join(root_feat,'w2v_MSRVTT.pickle') + audio_feat_path = os.path.join(root_feat,'audio_features.pickle') + face_feat_path = os.path.join(root_feat,'face_features.pickle') + train_list_path = os.path.join(root_feat,'train_list.txt') + test_list_path = os.path.join(root_feat,'test_list.txt') + + dataset = MSR.MSRVTT(visual_feat_path, flow_feat_path, text_feat_path, + audio_feat_path, face_feat_path, train_list_path,test_list_path, coco=args.coco) + msr_sampler = MSRSampler(dataset.n_MSR,dataset.n_coco,args.coco_sampling_rate) + + if args.coco: + dataloader = DataLoader(dataset, batch_size=args.batch_size, + sampler=msr_sampler, num_workers=1,collate_fn=dataset.collate_data, drop_last=True) + else: + dataloader = DataLoader(dataset, batch_size=args.batch_size, + shuffle=True, num_workers=1,collate_fn=dataset.collate_data, drop_last=True) + +else: + path_to_text = os.path.join(root_feat,'w2v_LSMDC.npy') + path_to_audio = os.path.join(root_feat,'X_audio_train.npy') + + dataset = LD2.LSMDC(mp_visual_path, path_to_text, + path_to_audio, mp_flow_path, mp_face_path, coco=args.coco) + dataloader = DataLoader(dataset, batch_size=args.batch_size, + shuffle=True, num_workers=1, drop_last=True) + print 'Done.' + + print 'Reading test data ...' 
+ resnet_features_path = os.path.join(root_feat,'resnet152-retrieval.npy.tensor.npy') + flow_features_path = os.path.join(root_feat,'flow-retrieval.npy.tensor.npy') + face_features_path = os.path.join(root_feat,'face-retrieval.npy.tensor.npy') + text_features_path = os.path.join(root_feat,'w2v_LSMDC_retrieval.npy') + audio_features_path = os.path.join(root_feat,'X_audio_retrieval.npy.tensor.npy') + + vid_retrieval = np.load(resnet_features_path) + flow_retrieval = np.load(flow_features_path) + face_retrieval = np.load(face_features_path) + text_retrieval = np.load(text_features_path) + audio_retrieval = np.load(audio_features_path) + + mm = max(map(len,text_retrieval)) + + text_retrieval = make_tensor(text_retrieval,mm) + + vid_retrieval = th.from_numpy(vid_retrieval).float() + flow_retrieval = th.from_numpy(flow_retrieval).float() + face_retrieval = th.from_numpy(face_retrieval).float() + audio_retrieval = th.from_numpy(audio_retrieval).float() + + text_retrieval_val = text_retrieval + vid_retrieval_val = vid_retrieval + flow_retrieval_val = flow_retrieval + face_retrieval_val = face_retrieval + audio_retrieval_val = audio_retrieval + + + face_ind_test = np.load(os.path.join(root_feat,'no_face_ind_retrieval.npy')) + face_ind_test = 1 - face_ind_test +print 'Done.' + +# Model +video_modality_dim = {'face': (128,128), 'audio': (128*16,128), +'visual': (2048,2048), 'motion': (1024,1024)} +net = Net(video_modality_dim,300, + audio_cluster=16,text_cluster=args.text_cluster_size) +net.train() + +if args.GPU: + net.cuda() + +# Optimizers + Loss +max_margin = MaxMarginRankingLoss(margin=args.margin) + + +if args.optimizer == 'adam': + optimizer = optim.Adam(net.parameters(), lr=args.lr) +elif args.optimizer == 'sgd': + optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum) + +if args.GPU: + max_margin.cuda() + +n_display = args.n_display +dataset_size = len(dataset) +lr_decay = args.lr_decay + +print 'Starting training loop ...' 
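For reference before the training loop below: the per-sample availability dictionary ind built there tells MEE which modalities actually exist (no face track, or a COCO image with neither motion nor audio), and MEE zeroes and renormalizes the corresponding mixture weights. A made-up illustration of that renormalization:

import numpy as np

# Toy view of the masking done inside MEE (model.py); the weights and the
# (face, visual, motion, audio) ordering are illustrative only.
moe_weights = np.array([[0.4, 0.3, 0.2, 0.1]])   # softmax output for one sample
available   = np.array([[0.0, 1.0, 1.0, 1.0]])   # e.g. no face detected in the clip
masked = moe_weights * available
masked = masked / masked.sum(axis=1, keepdims=True)
# masked -> [[0.0, 0.5, 0.333, 0.167]]: the face expert is ignored and the
# remaining experts are rescaled to sum to one.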
+print 'Parameters: lr: %f, epochs: %d, seed: %d, batch_size: %d'%(args.lr, + args.epochs, th.initial_seed(), args.batch_size) + +for epoch in range(args.epochs): + running_loss = 0.0 + print 'epoch: %d'%epoch + + for i_batch, sample_batched in enumerate(dataloader): + + if args.MSRVTT: + captions = sample_batched['text'] + audio = sample_batched['audio'] + else: + captions = dataset.shorteningTextTensor(sample_batched['text'], + sample_batched['text_size']) + + audio = dataset.shorteningTextTensor(sample_batched['audio'], + sample_batched['audio_size']) + + + face = sample_batched['face'] + video = sample_batched['video'] + flow = sample_batched['flow'] + coco_ind = sample_batched['coco_ind'] + face_ind = sample_batched['face_ind'] + + ind = {} + ind['face'] = face_ind + ind['visual'] = np.ones((len(face_ind))) + ind['motion'] = 1 - coco_ind + ind['audio'] = 1 - coco_ind + + if args.GPU: + captions, video = Variable(captions.cuda()), Variable(video.cuda()) + audio, flow = Variable(audio.cuda()), Variable(flow.cuda()) + face = Variable(face.cuda()) + + + optimizer.zero_grad() + confusion_matrix = net(captions, + {'face': face, 'audio': audio, 'visual': video, 'motion': flow}, ind, True) + loss = max_margin(confusion_matrix) + loss.backward() + + optimizer.step() + running_loss += loss.data[0] + + if (i_batch+1) % n_display == 0: + print 'Epoch %d, Epoch status: %.2f, Training loss: %.4f'%(epoch + 1, + args.batch_size*float(i_batch)/dataset_size,running_loss/n_display) + running_loss = 0.0 + + print 'evaluating epoch %d ...'%(epoch+1) + net.eval() + + if args.MSRVTT: + retrieval_samples = dataset.getRetrievalSamples() + + video = Variable(retrieval_samples['video'].cuda(), volatile=True) + captions = Variable(retrieval_samples['text'].cuda(), volatile=True) + audio = Variable(retrieval_samples['audio'].cuda(), volatile=True) + flow = Variable(retrieval_samples['flow'].cuda(), volatile=True) + face = Variable(retrieval_samples['face'].cuda(), volatile=True) + face_ind = retrieval_samples['face_ind'] + + ind = {} + ind['face'] = face_ind + ind['visual'] = np.ones((len(face_ind))) + ind['motion'] = np.ones((len(face_ind))) + ind['audio'] = np.ones((len(face_ind))) + + conf = net(captions, + {'face': face, 'audio': audio, 'visual': video, 'motion': flow}, ind, True) + confusion_matrix = conf.data.cpu().float().numpy() + metrics = compute_metric(confusion_matrix) + verbose(epoch, args.batch_size*float(i_batch)/dataset_size, metrics, name='MSRVTT') + + else: + video = Variable(vid_retrieval_val.cuda(), volatile=True) + captions = Variable(text_retrieval_val.cuda(), volatile=True) + audio = Variable(audio_retrieval_val.cuda(), volatile=True) + flow = Variable(flow_retrieval_val.cuda(), volatile=True) + face = Variable(face_retrieval_val.cuda(), volatile=True) + + ind = {} + ind['face'] = face_ind_test + ind['visual'] = np.ones((len(face_ind_test))) + ind['motion'] = np.ones((len(face_ind_test))) + ind['audio'] = np.ones((len(face_ind_test))) + + conf = net(captions, + {'face': face, 'audio': audio, 'visual': video, 'motion': flow}, ind, True) + confusion_matrix = conf.data.cpu().float().numpy() + metrics = compute_metric(confusion_matrix) + verbose(epoch, args.batch_size*float(i_batch)/dataset_size, metrics, name='MPII') + + net.train() + + if args.eval_qcm: + print 'LSMDC Multiple-Choice evaluation (accuracy)' + net.eval() + scores = [] + + for i_batch, sample_batched in enumerate(qcm_dataloader): + captions = sample_batched['text'] + + audio = 
qcm_dataset.shorteningTextTensor(sample_batched['audio'], + sample_batched['audio_size']) + + video = sample_batched['video'] + flow = sample_batched['flow'] + face = sample_batched['face'] + face_ind = sample_batched['face_ind'] + + ind = {} + ind['face'] = face_ind + ind['visual'] = np.ones((len(face_ind))) + ind['motion'] = np.ones((len(face_ind))) + ind['audio'] = np.ones((len(face_ind))) + + if args.GPU: + captions, video = Variable(captions.cuda(), volatile=True), Variable(video.cuda(), volatile=True) + audio, flow = Variable(audio.cuda(), volatile=True), Variable(flow.cuda(), volatile=True) + face = Variable(face.cuda(), volatile=True) + + s = net(captions, {'face': face, 'audio': audio, 'visual': video, 'motion': flow}, ind, False) + s = s.data.cpu().float().numpy() + scores.extend(s) + + scores = np.array(scores) + scores = np.reshape(scores, (len(qcm_dataset),5)) + pred = np.argmax(scores, axis=1)+1 + + accuracy_qcm = sum(pred == qcm_gt)/float(len(pred)) + print 'Accuracy Multiple-Choice: %.3f'%accuracy_qcm + + net.train() + + for param_group in optimizer.param_groups: + param_group['lr'] *= lr_decay + + + + +
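To close, a toy sanity check that mirrors compute_metric() from train.py: the diagonal of the similarity matrix holds the true text-video pairs, and the rank of each diagonal entry within its row gives recall@k and the median rank. The matrix below is made up; importing train.py directly would kick off a full training run, so the few relevant lines are restated instead.

import numpy as np

x = np.array([[0.9, 0.2, 0.1],      # hand-made text-video similarities
              [0.3, 0.1, 0.6],
              [0.2, 0.8, 0.4]])

sx = np.sort(-x, axis=1)            # row-wise scores, best first (negated)
d = np.diag(-x)[:, np.newaxis]      # negated score of the correct pair
ranks = np.where(sx - d == 0)[1]    # rank of the correct pair in each row

print('R@1  = %.3f' % (float(np.sum(ranks == 0)) / len(ranks)))   # 0.333
print('R@5  = %.3f' % (float(np.sum(ranks < 5)) / len(ranks)))    # 1.000
print('R@10 = %.3f' % (float(np.sum(ranks < 10)) / len(ranks)))   # 1.000
print('MedR = %d' % (np.median(ranks) + 1))                       # 2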