-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_dataset_from_youtube.py
99 lines (72 loc) · 3.58 KB
/
get_dataset_from_youtube.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import argparse
import contextlib
import csv
import os
import shlex
import sys
import wave
# Absolute directory that contains this script; every path below is rooted here.
project_path = os.path.dirname(os.path.abspath(__file__))

# Segment lists (CSV), one per dataset split, stored under <project_path>/segments/.
balanced_train_segments = '{}/segments/subset_balanced_train_segments.csv'.format(project_path)
unbalanced_train_segments = '{}/segments/2000max_subset_unbalanced_train_segments.csv'.format(project_path)
eval_segments = '{}/segments/subset_eval_segments.csv'.format(project_path)

# Output root. The trailing slash matters: sub-folder names are appended by
# plain string concatenation.
data_path = '{}/data/'.format(project_path)

# idea from : https://github.com/lccambiaghi/02456-Project---Background-Audio-Classification

# Resume marker: download_data() skips every CSV row whose index is less than
# or equal to this value. Set it to the last row handled by a previous run to
# continue an interrupted download.
last_processed_row = 0
def youtube_downloader(data_path, subfolder, id, start_time, idx):
    """Download a clip of a YouTube video and extract its audio track.

    A shell pipeline asks ``youtube-dl`` for the stream URL(s) of the video,
    rewrites each URL with ``sed`` into ``-ss <start_time> -i <url>`` and hands
    the result to ``ffmpeg``, which is invoked with ``-n -t 00:10 -c copy`` and
    writes ``<data_path><subfolder>video/<idx>_<id>.mp4``.  When that command
    succeeds, a second ``ffmpeg`` call copies the audio stream to
    ``<data_path><subfolder>audio/<idx>_<id>.aac``.

    Every caller-supplied value is passed through ``shlex.quote`` before it is
    spliced into the command line, so ids or paths containing spaces, quotes
    or shell metacharacters can neither break the command nor inject extra
    shell code.

    Args:
        data_path: Output root directory, ending with a path separator.
        subfolder: Dataset sub-folder below ``data_path``, ending with a
            separator.
        id: YouTube video id (string).
        start_time: Clip start offset as a string; it is embedded in the sed
            replacement text, so it is expected to be a plain number.
        idx: Running index of the clip as a string, used as file name prefix.

    Returns:
        The status returned by ``os.system`` for the video download command
        (0 on success).  The status of the audio extraction is not reported.
    """
    base = data_path + subfolder
    clip_name = idx + '_' + id
    video_file = shlex.quote(base + 'video/' + clip_name + '.mp4')
    audio_file = shlex.quote(base + 'audio/' + clip_name + '.aac')
    url = shlex.quote('https://www.youtube.com/watch?v=' + id)
    # Single-quoting the sed script keeps '&' (sed's "matched text") literal
    # and stops the shell from interpreting anything inside start_time.
    sed_script = shlex.quote('s/.*/-ss ' + start_time + ' -i &/')

    ret = os.system('ffmpeg -n $(youtube-dl --get-url ' + url + ' '
                    '-f \'(mp4)[height<480]\' | sed ' + sed_script + ') '
                    '-t 00:10 -c copy ' + video_file)
    # Save the audio only when the video clip was written successfully.
    if ret == 0:
        os.system('ffmpeg -i ' + video_file + ' -vn -acodec copy ' + audio_file)
    return ret
def create_error_file(path, id, idx):
    """Record a failed download in ``<path>errors.txt``.

    One line of the form ``<idx>_<id>`` is appended per call; the file is
    created on first use.

    Args:
        path: Directory prefix (including the trailing separator) in which
            ``errors.txt`` lives.
        id: Video id of the clip that could not be downloaded (string).
        idx: Index of the clip as a string.
    """
    with open(path + "errors.txt", "a") as log:
        log.write("".join((idx, '_', id, "\n")))
def download_data(segments, subfolder):
    """Download every clip listed in a segments CSV into ``data_path + subfolder``.

    The ``video/`` and ``audio/`` output directories are created if missing.
    Rows are numbered from 0 in file order.  Rows with an index less than or
    equal to ``last_processed_row`` and the three header rows (indices 0-2)
    are skipped.  For each remaining row, column 0 is used as the video id and
    column 1 as the start time; the clip index used in file names is
    ``row index - 3``.  Clips whose ``.mp4`` already exists are skipped, and
    clips whose download returns a non-zero status are recorded through
    ``create_error_file``.

    Args:
        segments: Path of the CSV file listing the segments.
        subfolder: Sub-folder below ``data_path`` (with trailing slash) that
            receives the ``video/`` and ``audio/`` directories.

    Raises:
        SystemExit: If the CSV file cannot be parsed (``csv.Error``).
        ValueError: If a start time in column 1 is not a valid number.
    """
    target = data_path + subfolder
    video_dir = target + 'video/'
    audio_dir = target + 'audio/'
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(video_dir, exist_ok=True)
    os.makedirs(audio_dir, exist_ok=True)

    with open(segments, newline='') as f:
        reader = csv.reader(f)
        try:
            for rownum, row in enumerate(reader):
                # Already handled in a previous run (resume support).
                if rownum <= last_processed_row:
                    continue
                # Skip the 3 line header
                if rownum < 3:
                    continue
                print(row)
                # Blank or truncated lines carry no id/start time; skipping
                # them keeps one bad line from aborting the whole run.
                if len(row) < 2:
                    print("malformed row, skipping...")
                    continue
                video_id = row[0]
                idx = str(rownum - 3)
                if os.path.exists(video_dir + idx + '_' + video_id + '.mp4'):
                    print("file exists, skipping...")
                    continue
                ret = youtube_downloader(data_path, subfolder, video_id,
                                         str(float(row[1].lstrip())), idx)
                # If there was an error downloading the file
                # This sometimes happens if videos are blocked or taken down
                if ret != 0:
                    create_error_file(target, video_id, idx)
        except csv.Error as e:
            sys.exit('file {}, line {}: {}'.format(segments, reader.line_num, e))
if __name__ == '__main__':
    # Command-line entry point: each flag selects one dataset split to fetch.
    parser = argparse.ArgumentParser(
        description='Directy download from youtube the videos and audio files of youtube audioset.')
    for flag in ('--train', '--eval', '--unbalanced_train'):
        parser.add_argument(flag, action='store_true')

    # Without any argument there is nothing to do: show the usage and fail.
    if not sys.argv[1:]:
        parser.print_usage()
        sys.exit(1)

    args = parser.parse_args()

    # Only use what you need
    # (selected?, announcement, segments CSV, output sub-folder), in run order.
    jobs = (
        (args.train,
         'Downloading balanced trainig datased defined in',
         balanced_train_segments, "balanced_train/"),
        (args.eval,
         'Downloading evaluation datased defined in',
         eval_segments, "eval/"),
        (args.unbalanced_train,
         'Downloading unbalanced training datased defined in',
         unbalanced_train_segments, "2000unbalanced_train/"),
    )
    for selected, announcement, segments_csv, out_subfolder in jobs:
        if selected:
            print(announcement, segments_csv)
            download_data(segments_csv, out_subfolder)