commit 40b0fa7: 10 changed files with 1,231 additions and 0 deletions.
@@ -0,0 +1,25 @@
'''A wrapper class for a scheduled optimizer.'''


class ScheduledOptim:
    '''A simple wrapper class for learning rate scheduling.'''

    def __init__(self, optimizer):
        self._optimizer = optimizer

    def step_lr(self):
        '''Step with the inner optimizer.'''
        self._optimizer.step()

    def update_lr(self):
        '''Decay the learning rate of the inner optimizer.'''
        self._update_learning_rate()

    def _update_learning_rate(self):
        '''Learning rate scheduling: multiply the lr of every param group by 0.8.'''
        for param_group in self._optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * 0.8
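For orientation, a minimal usage sketch (the model, optimizer, and decay schedule below are illustrative assumptions, not part of this commit): wrap any ``torch.optim`` optimizer, call ``step_lr()`` where ``optimizer.step()`` would normally go, and call ``update_lr()`` whenever the learning rate should decay by the fixed factor of 0.8.
<pre><code>
import torch

# Hypothetical toy model and optimizer, for illustration only.
model = torch.nn.Linear(128, 29)
optimizer = ScheduledOptim(torch.optim.Adam(model.parameters(), lr=3e-4))

for step in range(90):
    x = torch.randn(4, 128)
    loss = model(x).pow(2).mean()     # dummy loss
    optimizer._optimizer.zero_grad()  # the wrapper exposes no zero_grad
    loss.backward()
    optimizer.step_lr()               # delegates to the inner optimizer
    if (step + 1) % 30 == 0:
        optimizer.update_lr()         # lr <- lr * 0.8 for every param group
</code></pre>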
@@ -0,0 +1,59 @@
## PyTorch implementation for the CVPR 2021 paper: **Positive Sample Propagation along the Audio-Visual Event Line**

### Paper link: [https://arxiv.org/abs/2104.00239](https://arxiv.org/abs/2104.00239)

## Audio-Visual Event (AVE) Localization task
---
AVE localization aims to find the video segments that contain an *audio-visual event* and to classify the event's category.
An *audio-visual* event is both audible and visible: the sound source must appear in the visual frames (visible) while the sound it makes is also present in the audio track (audible).

(figure)

## Our Framework
(framework figure)
## Data preparation
---
The AVE dataset and the extracted audio and visual features can be downloaded from [https://github.com/YapengTian/AVE-ECCV18](https://github.com/YapengTian/AVE-ECCV18).
Other preprocessed files used in this repository can be downloaded from [here](https://drive.google.com/file/d/1juKwV813ZibgX79VDjB6X6Pnmq1X7Huz/view?usp=sharing).
All the required data files are listed below and should be placed in the ``data`` folder.
<pre><code>
audio_feature.h5    visual_feature.h5    audio_feature_noisy.h5    visual_feature_noisy.h5
right_label.h5      prob_label.h5        labels_noisy.h5           mil_labels.h5
train_order.h5      val_order.h5         test_order.h5
</code></pre>
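As a quick sanity check after downloading, each ``.h5`` file stores a single array under the ``avadataset`` key (the key and the expected shapes below follow the loading code and comments in ``dataset.py``; treating ``right_label.h5`` as the segment-level label file is our assumption). A minimal sketch:
<pre><code>
import h5py

# Print the array shape stored in each file under the 'avadataset' key,
# matching how dataset.py reads these files.
for name in ['audio_feature.h5', 'visual_feature.h5', 'right_label.h5']:
    with h5py.File('data/' + name, 'r') as hf:
        print(name, hf['avadataset'].shape)
# Expected shapes (from the comments in dataset.py):
#   audio_feature.h5  -> (4143, 10, 128)        4143 videos, 10 segments, 128-d audio
#   visual_feature.h5 -> (4143, 10, 7, 7, 512)  spatial conv feature maps
#   right_label.h5    -> (4143, 10, 29)         per-segment one-hot, 28 events + background
</code></pre>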
## Fully supervised setting
- Train:
> CUDA_VISIBLE_DEVICES=0 python fully_supervised_main.py --model_name PSP --threshold=0.099 --train
- Test:
> CUDA_VISIBLE_DEVICES=0 python fully_supervised_main.py --model_name PSP --threshold=0.099 --trained_model_path ./model/PSP_fully.pt

## Weakly supervised setting
- Train:
> CUDA_VISIBLE_DEVICES=0 python weakly_supervised_main.py --model_name PSP --threshold=0.095 --train
- Test:
> CUDA_VISIBLE_DEVICES=0 python weakly_supervised_main.py --model_name PSP --threshold=0.095 --trained_model_path ./model/PSP_weakly.pt

**Note:** The pre-trained models can be downloaded [here](https://drive.google.com/drive/folders/1YEyEH6e988v1NUwwVYohrwY2DSpVtVoT?usp=sharing) and should be placed in the ``model`` folder. With these models, the AVE localization accuracy reaches 78.0% and 73.9% under the fully and weakly supervised settings, respectively, slightly higher than the numbers reported in the arXiv paper (77.8% and 73.5%). If you train from scratch in either setting, you may adjust the threshold value or the initialization method to further improve performance.
## Citation
----
If our paper is useful for your research, please consider citing it:
<pre><code>@InProceedings{zhou2021positive,
  title={Positive Sample Propagation along the Audio-Visual Event Line},
  author={Zhou, Jinxing and Zheng, Liang and Zhong, Yiran and Hao, Shijie and Wang, Meng},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year={2021},
}
</code></pre>
## Acknowledgements
This code builds on [YapengTian/AVE-ECCV18](https://github.com/YapengTian/AVE-ECCV18); we thank the authors for their great work.
We hope our source code helps people who are interested in our work or in related audio-visual problems.
If you have any questions about the paper or the code, please feel free to open an issue or contact us by email.
@@ -0,0 +1,204 @@
"""AVE dataset""" | ||
import numpy as np | ||
import torch | ||
import h5py | ||
import pickle | ||
import random | ||
from itertools import product | ||
import os | ||
import pdb | ||
|
||
ave_dataset = ['bell', 'Male', 'Bark', 'aircraft', 'car', 'Female', 'Helicopter', | ||
'Violin', 'Flute', 'Ukulele', 'Fry food', 'Truck', 'Shofar', 'Motorcycle', | ||
'guitar', 'Train', 'Clock', 'Banjo', 'Goat', 'Baby', 'Bus', | ||
'Chainsaw', 'Cat', 'Horse', 'Toilet', 'Rodents', 'Accordion', 'Mandolin', 'background'] | ||
STANDARD_AVE_DATASET = ['Church bell', 'Male speech, man speaking', 'Bark', 'Fixed-wing aircraft, airplane', 'Race car, auto racing', \ | ||
'Female speech, woman speaking', 'Helicopter', 'Violin, fiddle', 'Flute', 'Ukulele', 'Frying (food)', 'Truck', 'Shofar', \ | ||
'Motorcycle', 'Acoustic guitar', 'Train horn', 'Clock', 'Banjo', 'Goat', 'Baby cry, infant cry', 'Bus', 'Chainsaw',\ | ||
'Cat', 'Horse', 'Toilet flush', 'Rodents, rats, mice', 'Accordion', 'Mandolin'] | ||
|
||
class AVEDataset(object):
    """Data preparation for the fully supervised setting."""

    def __init__(self, video_dir, audio_dir, label_dir, order_dir, batch_size, status):
        self.video_dir = video_dir
        self.audio_dir = audio_dir
        self.batch_size = batch_size
        self.status = status

        with h5py.File(audio_dir, 'r') as hf:
            self.audio_features = hf['avadataset'][:]  # shape: (4143, 10, 128)
        with h5py.File(label_dir, 'r') as hf:
            self.labels = hf['avadataset'][:]  # shape: (4143, 10, 29)
        with h5py.File(video_dir, 'r') as hf:
            self.video_features = hf['avadataset'][:]  # shape: (4143, 10, 7, 7, 512)
        print('>> visual feature: ', self.video_features.shape)
        print('>> audio feature: ', self.audio_features.shape)

        with h5py.File(order_dir, 'r') as hf:
            order = hf['order'][:]  # length = 3339 for the training split

        self.lis = order.tolist()  # indices of the samples in this split
        self.list_copy = self.lis.copy()

        self.video_batch = np.float32(np.zeros([self.batch_size, 10, 7, 7, 512]))
        self.audio_batch = np.float32(np.zeros([self.batch_size, 10, 128]))
        self.pos_audio_batch = np.float32(np.zeros([self.batch_size, 10, 128]))
        self.label_batch = np.float32(np.zeros([self.batch_size, 10, 29]))
        self.segment_label_batch = np.float32(np.zeros([self.batch_size, 10]))
        self.segment_avps_gt_batch = np.float32(np.zeros([self.batch_size, 10]))
    def get_segment_wise_relation(self, batch_labels):
        # batch_labels: [bs, 10, 29], a one-hot vector per segment over
        # 28 event categories plus background (column 28).
        bs, seg_num, category_num = batch_labels.shape
        for i in range(bs):
            # Columns that are active anywhere in this video; after sorting,
            # the first entry is the event category (background sorts last).
            col_sum = np.sum(batch_labels[i].T, axis=1)
            category_bg_cols = col_sum.nonzero()[0].tolist()
            category_bg_cols.sort()  # [event_category_idx, 28 (background idx, optional)]

            category_col_idx = category_bg_cols[0]
            category_col = batch_labels[i, :, category_col_idx]
            same_category_row_idx = category_col.nonzero()[0].tolist()
            if len(same_category_row_idx) != 0:
                # Spread the AVPS ground truth uniformly over the event segments,
                # e.g. an event covering segments {2, 3, 4} gives each weight 1/3.
                self.segment_avps_gt_batch[i, same_category_row_idx] = 1 / len(same_category_row_idx)

        for i in range(bs):
            # Per-segment class index, used as the segment-level classification target.
            row_idx, col_idx = np.where(batch_labels[i] == 1)
            self.segment_label_batch[i, row_idx] = col_idx
    def __len__(self):
        return len(self.lis)

    def get_batch(self, idx, shuffle_samples=False):
        if shuffle_samples:
            random.shuffle(self.list_copy)
        select_ids = self.list_copy[idx * self.batch_size : (idx + 1) * self.batch_size]

        for i in range(self.batch_size):
            sample_id = select_ids[i]
            self.video_batch[i, :, :, :, :] = self.video_features[sample_id, :, :, :, :]
            self.audio_batch[i, :, :] = self.audio_features[sample_id, :, :]
            self.label_batch[i, :, :] = self.labels[sample_id, :, :]

        self.get_segment_wise_relation(self.label_batch)

        return torch.from_numpy(self.audio_batch).float(), \
               torch.from_numpy(self.video_batch).float(), \
               torch.from_numpy(self.label_batch).float(), \
               torch.from_numpy(self.segment_label_batch).long(), \
               torch.from_numpy(self.segment_avps_gt_batch).float()


class AVE_weak_Dataset(object):
    """Data preparation for the weakly supervised setting."""

    def __init__(self, video_dir, video_dir_bg, audio_dir, audio_dir_bg, label_dir, prob_label_dir,
                 label_dir_bg, label_dir_gt, order_dir, batch_size, status='train'):
        self.video_dir = video_dir
        self.audio_dir = audio_dir
        self.video_dir_bg = video_dir_bg
        self.audio_dir_bg = audio_dir_bg
        self.status = status
        self.batch_size = batch_size
        with h5py.File(order_dir, 'r') as hf:
            train_l = hf['order'][:]  # array, length = 3339
        self.lis = train_l
        self.list_copy = self.lis.tolist()

        with h5py.File(audio_dir, 'r') as hf:
            self.audio_features = hf['avadataset'][:]  # (4143, 10, 128)
        with h5py.File(label_dir, 'r') as hf:
            self.labels = hf['avadataset'][:]  # (4143, 29)
        with h5py.File(prob_label_dir, 'r') as hf:
            self.prob_labels = hf['avadataset'][:]  # (4143, 29)
        with h5py.File(video_dir, 'r') as hf:
            self.video_features = hf['avadataset'][:]  # (4143, 10, 7, 7, 512)
        self.video_features = self.video_features[train_l, :, :]
        print('video_features.shape', self.video_features.shape)

        # Keep only the samples belonging to this split.
        self.audio_features = self.audio_features[train_l, :, :]  # 3339 samples
        self.labels = self.labels[train_l, :]
        self.prob_labels = self.prob_labels[train_l, :]

        if status == "train":
            # Append the extra background (negative) videos to the training set.
            with h5py.File(label_dir_bg, 'r') as hf:
                self.negative_labels = hf['avadataset'][:]  # (178, 29)
            with h5py.File(audio_dir_bg, 'r') as hf:
                self.negative_audio_features = hf['avadataset'][:]  # (178, 10, 128)
            with h5py.File(video_dir_bg, 'r') as hf:
                self.negative_video_features = hf['avadataset'][:]  # (178, 10, 7, 7, 512)
            ng_num = self.negative_audio_features.shape[0]

            size = self.audio_features.shape[0] + ng_num
            audio_train_new = np.zeros((size, self.audio_features.shape[1], self.audio_features.shape[2]))
            audio_train_new[0:self.audio_features.shape[0], :, :] = self.audio_features
            audio_train_new[self.audio_features.shape[0]:size, :, :] = self.negative_audio_features
            self.audio_features = audio_train_new

            video_train_new = np.zeros((size, 10, 7, 7, 512))
            video_train_new[0:self.video_features.shape[0], :, :] = self.video_features
            video_train_new[self.video_features.shape[0]:size, :, :] = self.negative_video_features
            self.video_features = video_train_new

            y_train_new = np.zeros((size, 29))
            y_train_new[0:self.labels.shape[0], :] = self.labels
            y_train_new[self.labels.shape[0]:size, :] = self.negative_labels
            self.labels = y_train_new

            prob_y_train_new = np.zeros((size, 29))
            prob_y_train_new[0:self.prob_labels.shape[0], :] = self.prob_labels
            prob_y_train_new[self.prob_labels.shape[0]:size, :] = self.negative_labels
            self.prob_labels = prob_y_train_new
            # Give the appended background videos synthetic ids (8000, 8001, ...)
            # that cannot collide with real sample ids.
            self.list_copy.extend(list(range(8000, 8000 + ng_num)))
        else:  # testing; the label of each video segment is known
            with h5py.File(label_dir_gt, 'r') as hf:
                self.labels = hf['avadataset'][:]
            self.labels = self.labels[train_l, :, :]

        # Freeze the id -> feature-row mapping now, before any shuffling of
        # self.list_copy; copying it again after a shuffle would corrupt the mapping.
        self.list_copy_copy = self.list_copy.copy()

        self.video_batch = np.float32(np.zeros([self.batch_size, 10, 7, 7, 512]))
        self.audio_batch = np.float32(np.zeros([self.batch_size, 10, 128]))
        if status == "train":
            # Weakly supervised: only the video-level event tag is available.
            self.label_batch = np.float32(np.zeros([self.batch_size, 29]))
            self.prob_label_batch = np.float32(np.zeros([self.batch_size, 29]))
        else:
            # During testing, the segment-level labels must be predicted.
            self.label_batch = np.float32(np.zeros([self.batch_size, 10, 29]))
    def __len__(self):
        return len(self.labels)

    def get_batch(self, idx, shuffle_samples=False):
        if shuffle_samples:
            random.shuffle(self.list_copy)
        select_ids = self.list_copy[idx * self.batch_size : (idx + 1) * self.batch_size]

        for i in range(self.batch_size):
            sample_id = select_ids[i]
            # Map the (possibly shuffled) sample id back to its fixed feature row.
            real_id = self.list_copy_copy.index(sample_id)
            self.video_batch[i, :, :, :, :] = self.video_features[real_id, :, :, :, :]  # [10, 7, 7, 512]
            self.audio_batch[i, :, :] = self.audio_features[real_id, :, :]  # [10, 128]
            if self.status == "train":
                self.label_batch[i, :] = self.labels[real_id, :]  # [29], one-hot video-level tag
                self.prob_label_batch[i, :] = self.prob_labels[real_id, :]  # [29], normalized label
            else:
                self.label_batch[i, :, :] = self.labels[real_id, :, :]

        if self.status == 'train':
            return torch.from_numpy(self.audio_batch).float(), \
                   torch.from_numpy(self.video_batch).float(), \
                   torch.from_numpy(self.label_batch).float(), \
                   torch.from_numpy(self.prob_label_batch).float()
        else:
            return torch.from_numpy(self.audio_batch).float(), \
                   torch.from_numpy(self.video_batch).float(), \
                   torch.from_numpy(self.label_batch).float()
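To show how these loaders are typically driven, here is a minimal sketch for the fully supervised class (the file names follow the README's data list, but passing ``right_label.h5`` as ``label_dir`` and the epoch loop itself are our assumptions, not code from this commit); the weakly supervised loader is used analogously with the ``*_noisy.h5`` and ``mil_labels.h5`` files.
<pre><code>
# Illustrative driver for AVEDataset; paths and loop are assumptions.
train_set = AVEDataset(video_dir='data/visual_feature.h5',
                       audio_dir='data/audio_feature.h5',
                       label_dir='data/right_label.h5',
                       order_dir='data/train_order.h5',
                       batch_size=64, status='train')

n_batches = len(train_set) // 64
for idx in range(n_batches):
    # Reshuffle once at the start of each pass over the data.
    audio, video, labels, seg_labels, avps_gt = \
        train_set.get_batch(idx, shuffle_samples=(idx == 0))
    # audio:   [64, 10, 128]      video:      [64, 10, 7, 7, 512]
    # labels:  [64, 10, 29]       seg_labels: [64, 10] (class indices)
    # avps_gt: [64, 10] (uniform weight over the event segments)
</code></pre>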