init commit
jasongief committed Apr 3, 2021
0 parents commit 40b0fa7
Showing 10 changed files with 1,231 additions and 0 deletions.
25 changes: 25 additions & 0 deletions Optim.py
@@ -0,0 +1,25 @@
'''A wrapper class for a scheduled optimizer.'''

class ScheduledOptim():
    '''A simple wrapper class for learning rate scheduling.'''

    def __init__(self, optimizer):
        self._optimizer = optimizer

    def step_lr(self):
        '''Step with the inner optimizer.'''
        self._optimizer.step()

    def update_lr(self):
        '''Decay the learning rate of the inner optimizer.'''
        self._update_learning_rate()

    def _update_learning_rate(self):
        '''Learning rate scheduling: multiply each group's lr by 0.8.'''
        for param_group in self._optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * 0.8
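For reference, a minimal usage sketch of this wrapper (not part of the commit), assuming a standard `torch.optim` optimizer; the stand-in linear model and the once-per-epoch decay are illustrative assumptions:

<pre><code>import torch
from Optim import ScheduledOptim

model = torch.nn.Linear(128, 29)  # stand-in model, for illustration only
wrapped = ScheduledOptim(torch.optim.Adam(model.parameters(), lr=3e-4))

for epoch in range(3):
    wrapped._optimizer.zero_grad()
    loss = model(torch.randn(4, 128)).sum()
    loss.backward()
    wrapped.step_lr()    # delegates to optimizer.step()
    wrapped.update_lr()  # multiplies every group's lr by 0.8
    print(wrapped._optimizer.param_groups[0]['lr'])
</code></pre>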
59 changes: 59 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
## PyTorch implementation for the CVPR 2021 paper: **Positive Sample Propagation along the Audio-Visual Event Line**

### Paper link: [https://arxiv.org/abs/2104.00239](https://arxiv.org/abs/2104.00239)


## Audio-Visual Event (AVE) Localization task
---
AVE localization aims to find the video segments that contain an *audio-visual event* and to classify the event category.
An *audio-visual* event is both audible and visible: the sound source must appear in the visual frames (visible) while the sound it makes is present in the audio track (audible).

![AVE localization](./figures/figure1_AVE_Localization.png)

## Our Framework
![framework](./figures/figure2_framework.png)

## Data preparation
---
The AVE dataset and the extracted audio and visual features can be downloaded from [https://github.com/YapengTian/AVE-ECCV18](https://github.com/YapengTian/AVE-ECCV18).
Other preprocessed files used in this repository can be downloaded from [here](https://drive.google.com/file/d/1juKwV813ZibgX79VDjB6X6Pnmq1X7Huz/view?usp=sharing).
All the required data are listed below, and these files should be placed in the ``data`` folder.
<pre><code>
audio_feature.h5 visual_feature.h5 audio_feature_noisy.h5 visual_feature_noisy.h5
right_label.h5 prob_label.h5 labels_noisy.h5 mil_labels.h5
train_order.h5 val_order.h5 test_order.h5
</code></pre>
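As a quick sanity check (a sketch, not part of the repo), each feature file stores a single dataset under the key ``avadataset`` and each order file under the key ``order``, matching how ``dataloader.py`` reads them:

<pre><code>import h5py

with h5py.File('./data/audio_feature.h5', 'r') as hf:
    audio = hf['avadataset'][:]  # expected shape: (4143, 10, 128)
with h5py.File('./data/train_order.h5', 'r') as hf:
    order = hf['order'][:]       # indices of the 3339 training videos
print(audio.shape, order.shape)
</code></pre>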


## Fully supervised setting
- Train:
> CUDA_VISIBLE_DEVICES=0 python fully_supervised_main.py --model_name PSP --threshold=0.099 --train
- Test:
> CUDA_VISIBLE_DEVICES=0 python fully_supervised_main.py --model_name PSP --threshold=0.099 --trained_model_path ./model/PSP_fully.pt

## Weakly supervised setting
- Train:
> CUDA_VISIBLE_DEVICES=0 python weakly_supervised_main.py --model_name PSP --threshold=0.095 --train
- Test:
> CUDA_VISIBLE_DEVICES=0 python weakly_supervised_main.py --model_name PSP --threshold=0.095 --trained_model_path ./model/PSP_weakly.pt

**Note:** The pre-trained models can be downloaded [here](https://drive.google.com/drive/folders/1YEyEH6e988v1NUwwVYohrwY2DSpVtVoT?usp=sharing) and should be placed in the ``model`` folder. With the pre-trained models, the AVE localization accuracy reaches 78.0% and 73.9% under the fully and weakly supervised settings, respectively, slightly higher than the results reported in the arXiv paper (77.8% and 73.5%). If you would like to train from scratch under either setting, you may adjust the threshold value or the initialization method to further improve performance.



## Citation
---
If our paper is useful for your research, please consider citing it:
<pre><code>@InProceedings{zhou2021positive,
  title={Positive Sample Propagation along the Audio-Visual Event Line},
  author={Zhou, Jinxing and Zheng, Liang and Zhong, Yiran and Hao, Shijie and Wang, Meng},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year={2021}
}
</code></pre>


## Acknowledgements
This code is built upon [YapengTian/AVE-ECCV18](https://github.com/YapengTian/AVE-ECCV18). Thanks for their great work.
We hope our source code can help people who are interested in our work or in related audio-visual problems.
If you have any questions about our paper or the code, please feel free to open an issue or contact us by email.
204 changes: 204 additions & 0 deletions dataloader.py
@@ -0,0 +1,204 @@
"""AVE dataset"""
import random

import h5py
import numpy as np
import torch

ave_dataset = ['bell', 'Male', 'Bark', 'aircraft', 'car', 'Female', 'Helicopter',
               'Violin', 'Flute', 'Ukulele', 'Fry food', 'Truck', 'Shofar', 'Motorcycle',
               'guitar', 'Train', 'Clock', 'Banjo', 'Goat', 'Baby', 'Bus',
               'Chainsaw', 'Cat', 'Horse', 'Toilet', 'Rodents', 'Accordion', 'Mandolin', 'background']
STANDARD_AVE_DATASET = ['Church bell', 'Male speech, man speaking', 'Bark', 'Fixed-wing aircraft, airplane', 'Race car, auto racing',
                        'Female speech, woman speaking', 'Helicopter', 'Violin, fiddle', 'Flute', 'Ukulele', 'Frying (food)', 'Truck', 'Shofar',
                        'Motorcycle', 'Acoustic guitar', 'Train horn', 'Clock', 'Banjo', 'Goat', 'Baby cry, infant cry', 'Bus', 'Chainsaw',
                        'Cat', 'Horse', 'Toilet flush', 'Rodents, rats, mice', 'Accordion', 'Mandolin']

class AVEDataset(object):
    """Data preparation for the fully supervised setting."""
    def __init__(self, video_dir, audio_dir, label_dir, order_dir, batch_size, status):
        self.video_dir = video_dir
        self.audio_dir = audio_dir
        self.batch_size = batch_size
        self.status = status

        with h5py.File(audio_dir, 'r') as hf:
            self.audio_features = hf['avadataset'][:]  # shape: (4143, 10, 128)
        with h5py.File(label_dir, 'r') as hf:
            self.labels = hf['avadataset'][:]  # shape: (4143, 10, 29)
        with h5py.File(video_dir, 'r') as hf:
            self.video_features = hf['avadataset'][:]  # shape: (4143, 10, 7, 7, 512)
        print('>> visual feature: ', self.video_features.shape)
        print('>> audio feature: ', self.audio_features.shape)

        with h5py.File(order_dir, 'r') as hf:
            order = hf['order'][:]  # array, length 3339 for the training split

        self.lis = order.tolist()  # indices of the samples in this split
        self.list_copy = self.lis.copy()

        self.video_batch = np.float32(np.zeros([self.batch_size, 10, 7, 7, 512]))
        self.audio_batch = np.float32(np.zeros([self.batch_size, 10, 128]))
        self.pos_audio_batch = np.float32(np.zeros([self.batch_size, 10, 128]))
        self.label_batch = np.float32(np.zeros([self.batch_size, 10, 29]))
        self.segment_label_batch = np.float32(np.zeros([self.batch_size, 10]))
        self.segment_avps_gt_batch = np.float32(np.zeros([self.batch_size, 10]))

    def get_segment_wise_relation(self, batch_labels):
        # batch_labels: [bs, 10, 29]
        bs, seg_num, category_num = batch_labels.shape
        for i in range(bs):
            col_sum = np.sum(batch_labels[i].T, axis=1)
            category_bg_cols = col_sum.nonzero()[0].tolist()
            category_bg_cols.sort()  # [category_label_idx, 28 (background idx, optional)]

            category_col_idx = category_bg_cols[0]
            category_col = batch_labels[i, :, category_col_idx]
            same_category_row_idx = category_col.nonzero()[0].tolist()
            if len(same_category_row_idx) != 0:
                # segments sharing the event category split the attention target evenly
                self.segment_avps_gt_batch[i, same_category_row_idx] = 1 / len(same_category_row_idx)

        for i in range(bs):
            row_idx, col_idx = np.where(batch_labels[i] == 1)
            self.segment_label_batch[i, row_idx] = col_idx

    def __len__(self):
        return len(self.lis)

    def get_batch(self, idx, shuffle_samples=False):
        if shuffle_samples:
            random.shuffle(self.list_copy)
        select_ids = self.list_copy[idx * self.batch_size : (idx + 1) * self.batch_size]

        for i in range(self.batch_size):
            sample_id = select_ids[i]
            self.video_batch[i, :, :, :, :] = self.video_features[sample_id, :, :, :, :]
            self.audio_batch[i, :, :] = self.audio_features[sample_id, :, :]
            self.label_batch[i, :, :] = self.labels[sample_id, :, :]

        self.get_segment_wise_relation(self.label_batch)

        return torch.from_numpy(self.audio_batch).float(), \
               torch.from_numpy(self.video_batch).float(), \
               torch.from_numpy(self.label_batch).float(), \
               torch.from_numpy(self.segment_label_batch).long(), \
               torch.from_numpy(self.segment_avps_gt_batch).float()


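To make the segment-level supervision concrete, a toy check (a sketch, not part of the commit): for a 10-segment video whose event of category 3 spans segments 2-5, ``segment_avps_gt_batch`` puts uniform weight 1/4 on those segments and ``segment_label_batch`` stores each segment's class index.

<pre><code>import numpy as np
from dataloader import AVEDataset

labels = np.zeros((1, 10, 29), dtype=np.float32)
labels[0, 2:6, 3] = 1                  # event category 3 on segments 2-5
labels[0, [0, 1, 6, 7, 8, 9], 28] = 1  # background elsewhere

ds = AVEDataset.__new__(AVEDataset)    # bypass file loading for this check
ds.segment_avps_gt_batch = np.zeros((1, 10), dtype=np.float32)
ds.segment_label_batch = np.zeros((1, 10), dtype=np.float32)
ds.get_segment_wise_relation(labels)
print(ds.segment_avps_gt_batch)  # 0.25 on segments 2-5, 0 elsewhere
print(ds.segment_label_batch)    # [28 28 3 3 3 3 28 28 28 28]
</code></pre>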


class AVE_weak_Dataset(object):
    """Data preparation for the weakly supervised setting."""
    def __init__(self, video_dir, video_dir_bg, audio_dir, audio_dir_bg, label_dir,
                 prob_label_dir, label_dir_bg, label_dir_gt, order_dir, batch_size, status='train'):
        self.video_dir = video_dir
        self.audio_dir = audio_dir
        self.video_dir_bg = video_dir_bg
        self.audio_dir_bg = audio_dir_bg
        self.status = status
        self.batch_size = batch_size
        with h5py.File(order_dir, 'r') as hf:
            train_l = hf['order'][:]  # array, length 3339
        self.lis = train_l
        self.list_copy = self.lis.tolist()

        with h5py.File(audio_dir, 'r') as hf:
            self.audio_features = hf['avadataset'][:]  # (4143, 10, 128)
        with h5py.File(label_dir, 'r') as hf:
            self.labels = hf['avadataset'][:]  # (4143, 29)
        with h5py.File(prob_label_dir, 'r') as hf:
            self.prob_labels = hf['avadataset'][:]  # (4143, 29)
        with h5py.File(video_dir, 'r') as hf:
            self.video_features = hf['avadataset'][:]  # (4143, 10, 7, 7, 512)
        self.video_features = self.video_features[train_l]
        print('video_features.shape', self.video_features.shape)

        self.audio_features = self.audio_features[train_l]  # 3339 samples
        self.labels = self.labels[train_l, :]
        self.prob_labels = self.prob_labels[train_l, :]

        if status == "train":
            with h5py.File(label_dir_bg, 'r') as hf:
                self.negative_labels = hf['avadataset'][:]  # negative samples, shape: (178, 29)
            with h5py.File(audio_dir_bg, 'r') as hf:
                self.negative_audio_features = hf['avadataset'][:]  # shape: (178, 10, 128)
            with h5py.File(video_dir_bg, 'r') as hf:
                self.negative_video_features = hf['avadataset'][:]  # shape: (178, 10, 7, 7, 512)
            ng_num = self.negative_audio_features.shape[0]

            # append the negative (background) samples after the positive training samples
            size = self.audio_features.shape[0] + ng_num
            audio_train_new = np.zeros((size, self.audio_features.shape[1], self.audio_features.shape[2]))
            audio_train_new[0:self.audio_features.shape[0], :, :] = self.audio_features
            audio_train_new[self.audio_features.shape[0]:size, :, :] = self.negative_audio_features
            self.audio_features = audio_train_new

            video_train_new = np.zeros((size, 10, 7, 7, 512))
            video_train_new[0:self.video_features.shape[0]] = self.video_features
            video_train_new[self.video_features.shape[0]:size] = self.negative_video_features
            self.video_features = video_train_new

            y_train_new = np.zeros((size, 29))
            y_train_new[0:self.labels.shape[0], :] = self.labels
            y_train_new[self.labels.shape[0]:size, :] = self.negative_labels
            self.labels = y_train_new

            prob_y_train_new = np.zeros((size, 29))
            prob_y_train_new[0:self.prob_labels.shape[0], :] = self.prob_labels
            prob_y_train_new[self.prob_labels.shape[0]:size, :] = self.negative_labels
            self.prob_labels = prob_y_train_new
            # give the appended negative samples ids from 8000 upwards so they cannot
            # collide with the original sample indices
            self.list_copy.extend(list(range(8000, 8000 + ng_num)))
        else:  # testing: the label of each video segment is known
            with h5py.File(label_dir_gt, 'r') as hf:
                self.labels = hf['avadataset'][:]
            self.labels = self.labels[train_l, :, :]

        self.video_batch = np.float32(np.zeros([self.batch_size, 10, 7, 7, 512]))
        self.audio_batch = np.float32(np.zeros([self.batch_size, 10, 128]))
        if status == "train":
            # weakly supervised: only the video-level event tag is available
            self.label_batch = np.float32(np.zeros([self.batch_size, 29]))
            self.prob_label_batch = np.float32(np.zeros([self.batch_size, 29]))
        else:
            # during testing, the label of each segment must be predicted
            self.label_batch = np.float32(np.zeros([self.batch_size, 10, 29]))

    def __len__(self):
        return len(self.labels)

    def get_batch(self, idx, shuffle_samples=False):
        # snapshot the pre-shuffle order; sampled ids are mapped back to rows through it
        self.list_copy_copy = self.list_copy.copy()
        if shuffle_samples:
            random.shuffle(self.list_copy)
        select_ids = self.list_copy[idx * self.batch_size : (idx + 1) * self.batch_size]

        for i in range(self.batch_size):
            sample_id = select_ids[i]
            real_id = self.list_copy_copy.index(sample_id)
            self.video_batch[i, :, :, :, :] = self.video_features[real_id]  # [10, 7, 7, 512]
            self.audio_batch[i, :, :] = self.audio_features[real_id]  # [10, 128]
            if self.status == "train":
                self.label_batch[i, :] = self.labels[real_id, :]  # [29], one-hot
                self.prob_label_batch[i, :] = self.prob_labels[real_id, :]  # [29], normalized label
            else:
                self.label_batch[i, :, :] = self.labels[real_id, :, :]

        if self.status == 'train':
            return torch.from_numpy(self.audio_batch).float(), \
                   torch.from_numpy(self.video_batch).float(), \
                   torch.from_numpy(self.label_batch).float(), \
                   torch.from_numpy(self.prob_label_batch).float()
        else:
            return torch.from_numpy(self.audio_batch).float(), \
                   torch.from_numpy(self.video_batch).float(), \
                   torch.from_numpy(self.label_batch).float()

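A minimal loading sketch (not part of the commit); the mapping of the ``data`` files to constructor arguments and the batch size are assumptions based on the shape annotations above. ``AVE_weak_Dataset`` is driven the same way, with the extra background/noisy feature and label files.

<pre><code>from dataloader import AVEDataset

bs = 64  # assumed batch size
train_set = AVEDataset(video_dir='./data/visual_feature.h5',
                       audio_dir='./data/audio_feature.h5',
                       label_dir='./data/right_label.h5',
                       order_dir='./data/train_order.h5',
                       batch_size=bs, status='train')

for idx in range(len(train_set) // bs):
    audio, video, labels, seg_labels, avps_gt = train_set.get_batch(idx, shuffle_samples=True)
    # audio: [bs, 10, 128], video: [bs, 10, 7, 7, 512], labels: [bs, 10, 29]
</code></pre>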

Binary file added figures/figure1_AVE_Localization.png
Binary file added figures/figure2_framework.png