commit 40b0fa7: 10 changed files with 1,231 additions and 0 deletions.
@@ -0,0 +1,25 @@
'''A wrapper class for a scheduled optimizer.'''


class ScheduledOptim:
    '''A simple wrapper class for learning rate scheduling.'''

    def __init__(self, optimizer):
        self._optimizer = optimizer

    def step_lr(self):
        '''Step with the inner optimizer.'''
        self._optimizer.step()

    def update_lr(self):
        '''Decay the learning rate of the inner optimizer.'''
        self._update_learning_rate()

    def _update_learning_rate(self):
        '''Learning rate scheduling: multiply the lr of every param group by 0.8.'''
        for param_group in self._optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * 0.8
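For orientation, a minimal usage sketch (the model, optimizer, and decay schedule below are illustrative assumptions, not part of this commit): wrap any ``torch.optim`` optimizer, call ``step_lr()`` where ``optimizer.step()`` would normally go, and call ``update_lr()`` whenever the learning rate should decay by the fixed factor of 0.8.
<pre><code>
import torch

# Hypothetical toy model and optimizer, for illustration only.
model = torch.nn.Linear(128, 29)
optimizer = ScheduledOptim(torch.optim.Adam(model.parameters(), lr=3e-4))

for step in range(90):
    x = torch.randn(4, 128)
    loss = model(x).pow(2).mean()     # dummy loss
    optimizer._optimizer.zero_grad()  # the wrapper exposes no zero_grad
    loss.backward()
    optimizer.step_lr()               # delegates to the inner optimizer
    if (step + 1) % 30 == 0:
        optimizer.update_lr()         # lr <- lr * 0.8 for every param group
</code></pre>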
@@ -0,0 +1,59 @@
## PyTorch implementation for the CVPR 2021 paper: **Positive Sample Propagation along the Audio-Visual Event Line**

### Paper link: [https://arxiv.org/abs/2104.00239](https://arxiv.org/abs/2104.00239)

## Audio-Visual Event (AVE) Localization task
---
AVE localization aims to find the video segments that contain an *audio-visual event* and to classify the event's category.
An *audio-visual* event is both audible and visible: the sound source must appear in the visual frames (visible) while the sound it makes is also present in the audio track (audible).

(figure)

## Our Framework
(framework figure)
## Data preparation
---
The AVE dataset and the extracted audio and visual features can be downloaded from [https://github.com/YapengTian/AVE-ECCV18](https://github.com/YapengTian/AVE-ECCV18).
Other preprocessed files used in this repository can be downloaded from [here](https://drive.google.com/file/d/1juKwV813ZibgX79VDjB6X6Pnmq1X7Huz/view?usp=sharing).
All the required data files are listed below and should be placed in the ``data`` folder.
<pre><code>
audio_feature.h5    visual_feature.h5    audio_feature_noisy.h5    visual_feature_noisy.h5
right_label.h5      prob_label.h5        labels_noisy.h5           mil_labels.h5
train_order.h5      val_order.h5         test_order.h5
</code></pre>
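As a quick sanity check after downloading, each ``.h5`` file stores a single array under the ``avadataset`` key (the key and the expected shapes below follow the loading code and comments in ``dataset.py``; treating ``right_label.h5`` as the segment-level label file is our assumption). A minimal sketch:
<pre><code>
import h5py

# Print the array shape stored in each file under the 'avadataset' key,
# matching how dataset.py reads these files.
for name in ['audio_feature.h5', 'visual_feature.h5', 'right_label.h5']:
    with h5py.File('data/' + name, 'r') as hf:
        print(name, hf['avadataset'].shape)
# Expected shapes (from the comments in dataset.py):
#   audio_feature.h5  -> (4143, 10, 128)        4143 videos, 10 segments, 128-d audio
#   visual_feature.h5 -> (4143, 10, 7, 7, 512)  spatial conv feature maps
#   right_label.h5    -> (4143, 10, 29)         per-segment one-hot, 28 events + background
</code></pre>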
## Fully supervised setting
- Train:
> CUDA_VISIBLE_DEVICES=0 python fully_supervised_main.py --model_name PSP --threshold=0.099 --train
- Test:
> CUDA_VISIBLE_DEVICES=0 python fully_supervised_main.py --model_name PSP --threshold=0.099 --trained_model_path ./model/PSP_fully.pt

## Weakly supervised setting
- Train:
> CUDA_VISIBLE_DEVICES=0 python weakly_supervised_main.py --model_name PSP --threshold=0.095 --train
- Test:
> CUDA_VISIBLE_DEVICES=0 python weakly_supervised_main.py --model_name PSP --threshold=0.095 --trained_model_path ./model/PSP_weakly.pt

**Note:** The pre-trained models can be downloaded [here](https://drive.google.com/drive/folders/1YEyEH6e988v1NUwwVYohrwY2DSpVtVoT?usp=sharing) and should be placed in the ``model`` folder. With these models, the AVE localization accuracy reaches 78.0% and 73.9% under the fully and weakly supervised settings, respectively, slightly higher than the numbers reported in the arXiv paper (77.8% and 73.5%). If you train from scratch in either setting, you may adjust the threshold value or the initialization method to further improve performance.
## Citation
----
If our paper is useful for your research, please consider citing it:
<pre><code>@InProceedings{zhou2021positive,
  title={Positive Sample Propagation along the Audio-Visual Event Line},
  author={Zhou, Jinxing and Zheng, Liang and Zhong, Yiran and Hao, Shijie and Wang, Meng},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year={2021},
}
</code></pre>
## Acknowledgements
This code builds on [YapengTian/AVE-ECCV18](https://github.com/YapengTian/AVE-ECCV18); we thank the authors for their great work.
We hope our source code helps people who are interested in our work or in related audio-visual problems.
If you have any questions about the paper or the code, please feel free to open an issue or contact us by email.
@@ -0,0 +1,204 @@
"""AVE dataset""" | ||
import numpy as np | ||
import torch | ||
import h5py | ||
import pickle | ||
import random | ||
from itertools import product | ||
import os | ||
import pdb | ||
|
||
ave_dataset = ['bell', 'Male', 'Bark', 'aircraft', 'car', 'Female', 'Helicopter', | ||
'Violin', 'Flute', 'Ukulele', 'Fry food', 'Truck', 'Shofar', 'Motorcycle', | ||
'guitar', 'Train', 'Clock', 'Banjo', 'Goat', 'Baby', 'Bus', | ||
'Chainsaw', 'Cat', 'Horse', 'Toilet', 'Rodents', 'Accordion', 'Mandolin', 'background'] | ||
STANDARD_AVE_DATASET = ['Church bell', 'Male speech, man speaking', 'Bark', 'Fixed-wing aircraft, airplane', 'Race car, auto racing', \ | ||
'Female speech, woman speaking', 'Helicopter', 'Violin, fiddle', 'Flute', 'Ukulele', 'Frying (food)', 'Truck', 'Shofar', \ | ||
'Motorcycle', 'Acoustic guitar', 'Train horn', 'Clock', 'Banjo', 'Goat', 'Baby cry, infant cry', 'Bus', 'Chainsaw',\ | ||
'Cat', 'Horse', 'Toilet flush', 'Rodents, rats, mice', 'Accordion', 'Mandolin'] | ||
|
||
class AVEDataset(object):
    """Data preparation for the fully supervised setting."""

    def __init__(self, video_dir, audio_dir, label_dir, order_dir, batch_size, status):
        self.video_dir = video_dir
        self.audio_dir = audio_dir
        self.batch_size = batch_size
        self.status = status

        with h5py.File(audio_dir, 'r') as hf:
            self.audio_features = hf['avadataset'][:]  # shape: (4143, 10, 128)
        with h5py.File(label_dir, 'r') as hf:
            self.labels = hf['avadataset'][:]  # shape: (4143, 10, 29)
        with h5py.File(video_dir, 'r') as hf:
            self.video_features = hf['avadataset'][:]  # shape: (4143, 10, 7, 7, 512)
        print('>> visual feature: ', self.video_features.shape)
        print('>> audio feature: ', self.audio_features.shape)

        with h5py.File(order_dir, 'r') as hf:
            order = hf['order'][:]  # length = 3339 for the training split

        self.lis = order.tolist()  # indices of the samples in this split
        self.list_copy = self.lis.copy()

        self.video_batch = np.float32(np.zeros([self.batch_size, 10, 7, 7, 512]))
        self.audio_batch = np.float32(np.zeros([self.batch_size, 10, 128]))
        self.pos_audio_batch = np.float32(np.zeros([self.batch_size, 10, 128]))
        self.label_batch = np.float32(np.zeros([self.batch_size, 10, 29]))
        self.segment_label_batch = np.float32(np.zeros([self.batch_size, 10]))
        self.segment_avps_gt_batch = np.float32(np.zeros([self.batch_size, 10]))
    def get_segment_wise_relation(self, batch_labels):
        # batch_labels: [bs, 10, 29], a one-hot vector per segment over
        # 28 event categories plus background (column 28).
        bs, seg_num, category_num = batch_labels.shape
        for i in range(bs):
            # Columns that are active anywhere in this video; after sorting,
            # the first entry is the event category (background sorts last).
            col_sum = np.sum(batch_labels[i].T, axis=1)
            category_bg_cols = col_sum.nonzero()[0].tolist()
            category_bg_cols.sort()  # [event_category_idx, 28 (background idx, optional)]

            category_col_idx = category_bg_cols[0]
            category_col = batch_labels[i, :, category_col_idx]
            same_category_row_idx = category_col.nonzero()[0].tolist()
            if len(same_category_row_idx) != 0:
                # Spread the AVPS ground truth uniformly over the event segments,
                # e.g. an event covering segments {2, 3, 4} gives each weight 1/3.
                self.segment_avps_gt_batch[i, same_category_row_idx] = 1 / len(same_category_row_idx)

        for i in range(bs):
            # Per-segment class index, used as the segment-level classification target.
            row_idx, col_idx = np.where(batch_labels[i] == 1)
            self.segment_label_batch[i, row_idx] = col_idx
    def __len__(self):
        return len(self.lis)

    def get_batch(self, idx, shuffle_samples=False):
        if shuffle_samples:
            random.shuffle(self.list_copy)
        select_ids = self.list_copy[idx * self.batch_size : (idx + 1) * self.batch_size]

        for i in range(self.batch_size):
            sample_id = select_ids[i]
            self.video_batch[i, :, :, :, :] = self.video_features[sample_id, :, :, :, :]
            self.audio_batch[i, :, :] = self.audio_features[sample_id, :, :]
            self.label_batch[i, :, :] = self.labels[sample_id, :, :]

        self.get_segment_wise_relation(self.label_batch)

        return torch.from_numpy(self.audio_batch).float(), \
               torch.from_numpy(self.video_batch).float(), \
               torch.from_numpy(self.label_batch).float(), \
               torch.from_numpy(self.segment_label_batch).long(), \
               torch.from_numpy(self.segment_avps_gt_batch).float()


class AVE_weak_Dataset(object):
    """Data preparation for the weakly supervised setting."""

    def __init__(self, video_dir, video_dir_bg, audio_dir, audio_dir_bg, label_dir, prob_label_dir,
                 label_dir_bg, label_dir_gt, order_dir, batch_size, status='train'):
        self.video_dir = video_dir
        self.audio_dir = audio_dir
        self.video_dir_bg = video_dir_bg
        self.audio_dir_bg = audio_dir_bg
        self.status = status
        self.batch_size = batch_size
        with h5py.File(order_dir, 'r') as hf:
            train_l = hf['order'][:]  # array, length = 3339
        self.lis = train_l
        self.list_copy = self.lis.tolist()

        with h5py.File(audio_dir, 'r') as hf:
            self.audio_features = hf['avadataset'][:]  # (4143, 10, 128)
        with h5py.File(label_dir, 'r') as hf:
            self.labels = hf['avadataset'][:]  # (4143, 29)
        with h5py.File(prob_label_dir, 'r') as hf:
            self.prob_labels = hf['avadataset'][:]  # (4143, 29)
        with h5py.File(video_dir, 'r') as hf:
            self.video_features = hf['avadataset'][:]  # (4143, 10, 7, 7, 512)
        self.video_features = self.video_features[train_l, :, :]
        print('video_features.shape', self.video_features.shape)

        # Keep only the samples belonging to this split.
        self.audio_features = self.audio_features[train_l, :, :]  # 3339 samples
        self.labels = self.labels[train_l, :]
        self.prob_labels = self.prob_labels[train_l, :]

        if status == "train":
            # Append the extra background (negative) videos to the training set.
            with h5py.File(label_dir_bg, 'r') as hf:
                self.negative_labels = hf['avadataset'][:]  # (178, 29)
            with h5py.File(audio_dir_bg, 'r') as hf:
                self.negative_audio_features = hf['avadataset'][:]  # (178, 10, 128)
            with h5py.File(video_dir_bg, 'r') as hf:
                self.negative_video_features = hf['avadataset'][:]  # (178, 10, 7, 7, 512)
            ng_num = self.negative_audio_features.shape[0]

            size = self.audio_features.shape[0] + ng_num
            audio_train_new = np.zeros((size, self.audio_features.shape[1], self.audio_features.shape[2]))
            audio_train_new[0:self.audio_features.shape[0], :, :] = self.audio_features
            audio_train_new[self.audio_features.shape[0]:size, :, :] = self.negative_audio_features
            self.audio_features = audio_train_new

            video_train_new = np.zeros((size, 10, 7, 7, 512))
            video_train_new[0:self.video_features.shape[0], :, :] = self.video_features
            video_train_new[self.video_features.shape[0]:size, :, :] = self.negative_video_features
            self.video_features = video_train_new

            y_train_new = np.zeros((size, 29))
            y_train_new[0:self.labels.shape[0], :] = self.labels
            y_train_new[self.labels.shape[0]:size, :] = self.negative_labels
            self.labels = y_train_new

            prob_y_train_new = np.zeros((size, 29))
            prob_y_train_new[0:self.prob_labels.shape[0], :] = self.prob_labels
            prob_y_train_new[self.prob_labels.shape[0]:size, :] = self.negative_labels
            self.prob_labels = prob_y_train_new
            # Give the appended background videos synthetic ids (8000, 8001, ...)
            # that cannot collide with real sample ids.
            self.list_copy.extend(list(range(8000, 8000 + ng_num)))
        else:  # testing; the label of each video segment is known
            with h5py.File(label_dir_gt, 'r') as hf:
                self.labels = hf['avadataset'][:]
            self.labels = self.labels[train_l, :, :]

        # Freeze the id -> feature-row mapping now, before any shuffling of
        # self.list_copy; copying it again after a shuffle would corrupt the mapping.
        self.list_copy_copy = self.list_copy.copy()

        self.video_batch = np.float32(np.zeros([self.batch_size, 10, 7, 7, 512]))
        self.audio_batch = np.float32(np.zeros([self.batch_size, 10, 128]))
        if status == "train":
            # Weakly supervised: only the video-level event tag is available.
            self.label_batch = np.float32(np.zeros([self.batch_size, 29]))
            self.prob_label_batch = np.float32(np.zeros([self.batch_size, 29]))
        else:
            # During testing, the segment-level labels must be predicted.
            self.label_batch = np.float32(np.zeros([self.batch_size, 10, 29]))
    def __len__(self):
        return len(self.labels)

    def get_batch(self, idx, shuffle_samples=False):
        if shuffle_samples:
            random.shuffle(self.list_copy)
        select_ids = self.list_copy[idx * self.batch_size : (idx + 1) * self.batch_size]

        for i in range(self.batch_size):
            sample_id = select_ids[i]
            # Map the (possibly shuffled) sample id back to its fixed feature row.
            real_id = self.list_copy_copy.index(sample_id)
            self.video_batch[i, :, :, :, :] = self.video_features[real_id, :, :, :, :]  # [10, 7, 7, 512]
            self.audio_batch[i, :, :] = self.audio_features[real_id, :, :]  # [10, 128]
            if self.status == "train":
                self.label_batch[i, :] = self.labels[real_id, :]  # [29], one-hot video-level tag
                self.prob_label_batch[i, :] = self.prob_labels[real_id, :]  # [29], normalized label
            else:
                self.label_batch[i, :, :] = self.labels[real_id, :, :]

        if self.status == 'train':
            return torch.from_numpy(self.audio_batch).float(), \
                   torch.from_numpy(self.video_batch).float(), \
                   torch.from_numpy(self.label_batch).float(), \
                   torch.from_numpy(self.prob_label_batch).float()
        else:
            return torch.from_numpy(self.audio_batch).float(), \
                   torch.from_numpy(self.video_batch).float(), \
                   torch.from_numpy(self.label_batch).float()
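To show how these loaders are typically driven, here is a minimal sketch for the fully supervised class (the file names follow the README's data list, but passing ``right_label.h5`` as ``label_dir`` and the epoch loop itself are our assumptions, not code from this commit); the weakly supervised loader is used analogously with the ``*_noisy.h5`` and ``mil_labels.h5`` files.
<pre><code>
# Illustrative driver for AVEDataset; paths and loop are assumptions.
train_set = AVEDataset(video_dir='data/visual_feature.h5',
                       audio_dir='data/audio_feature.h5',
                       label_dir='data/right_label.h5',
                       order_dir='data/train_order.h5',
                       batch_size=64, status='train')

n_batches = len(train_set) // 64
for idx in range(n_batches):
    # Reshuffle once at the start of each pass over the data.
    audio, video, labels, seg_labels, avps_gt = \
        train_set.get_batch(idx, shuffle_samples=(idx == 0))
    # audio:   [64, 10, 128]      video:      [64, 10, 7, 7, 512]
    # labels:  [64, 10, 29]       seg_labels: [64, 10] (class indices)
    # avps_gt: [64, 10] (uniform weight over the event segments)
</code></pre>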