# config.ini

[preprocess]
# ==================================================================================
# [Related file] preprocess.py, extractor.py
# [Note] Please check and customize configurations before you run the related file.
# ==================================================================================

# [Description] Directory containing the raw YouTube video files.
src_vid_dir = /hdd/Video
# [Description] Directory containing the raw audio files retrieved from the corresponding videos in <src_vid_dir>.
src_aud_dir = /hdd/Audio

# [Description] Directory where the extracted video frames will be saved in npz format.
dst_vid_dir = /hdd/data/video
# [Description] Directory where the extracted spectrograms will be saved in npz format.
dst_aud_dir = /hdd/data/audio

# [Description] Extension of the raw video files in <src_vid_dir>.
# [Note] The leading dot must be included (e.g. ".mp4", not "mp4").
vid_ext = .mp4
# [Description] Extension of the raw audio files in <src_aud_dir>.
# [Note] The leading dot must be included (e.g. ".wav", not "wav").
aud_ext = .wav

# [Description] File name prefix (head) convention of the raw video files in <src_vid_dir>.
# [Note] The final name of a video file must look like "<vid_fname_head><vid_id><vid_ext>"
#        (<vid_ext> already includes the dot), e.g. "video_<vid_id>.mp4" with the values in this file.
#        Here, <vid_id> means the video ID on YouTube, which is taken automatically from the YouTube URL.
# [Note] The naming convention of video files should be set before you download videos from YouTube via youtube_dl.
# [Note] If the video files have no name prefix, leave this argument blank.
# [Warning] Video files which do not follow the naming convention above will be rejected.
vid_fname_head = video_
# [Description] File name prefix (head) convention of the raw audio files in <src_aud_dir>.
# [Note] The final name of an audio file must look like "<aud_fname_head><aud_id><aud_ext>"
#        (<aud_ext> already includes the dot), e.g. "audio_<aud_id>.wav" with the values in this file.
#        Here, <aud_id> means the video ID on YouTube, which is taken automatically from the YouTube URL.
# [Note] The naming convention of audio files should be set before you retrieve audio from the video files via ffmpeg.
# [Note] If the audio files have no name prefix, leave this argument blank.
# [Warning] Audio files which do not follow the naming convention above will be rejected.
aud_fname_head = audio_

# [Description] The number of processes (threads) to run in parallel for extraction.
# [Note] If set to 0, the number of processes will be automatically set to the number of CPU cores.
ncpu = 6
# [Description] Whether to run video frame extraction or not.
# [Note] You can set this to False or false if you already have preprocessed video frames in npz format.
# [Warning] Only True, true, False, false will be accepted; an error will be raised otherwise.
run_vid = True
# [Description] Whether to run audio spectrogram extraction or not.
# [Note] You can set this to False or false if you already have preprocessed audio spectrograms in npz format.
# [Warning] Only True, true, False, false will be accepted; an error will be raised otherwise.
run_aud = True

# [Description] Whether to discard raw video or audio files whose pair does not exist, before the preprocessing steps.
# [Note] The pair check is done by comparing <vid_id> and <aud_id>.
# [Note] If set to True, video and audio files in <src_vid_dir> and <src_aud_dir> whose pair does not exist will be deleted.
# [Warning] Please be aware that setting this to True may delete some unpaired raw video and audio files.
# [Warning] Only True, true, False, false will be accepted; an error will be raised otherwise.
remove_unpaired_raw = True
# [Description] Whether to remove npz files whose corresponding video or audio failed in preprocessing.
# [Note] If set to True, preprocessed npz files in <dst_vid_dir> and <dst_aud_dir> will be deleted
#        when the preprocessing steps of the corresponding video and audio files have finished unsuccessfully.
# [Note] File IDs to be deleted will be read from <failure_fname>.
# [Warning] Only True, true, False, false will be accepted; an error will be raised otherwise.
remove_failure = True
# [Description] NOTE(review): this option was left undocumented. Presumably whether to remove preprocessed
#        npz files in <dst_vid_dir> and <dst_aud_dir> whose video/audio pair does not exist
#        -- TODO confirm against preprocess.py.
remove_unpaired_npz = True
# [Description] Path of the file where the <vid_id> and <aud_id> of files that failed in extraction will be saved.
# [Note] If you do not want to dump the failure list, leave this argument blank.
failure_fname = ./csv/failure.csv

# [Description] Whether to move or copy npz files into 3 different folders: train, val, and test.
make_train_val_test_split = True
# [Description] NOTE(review): undocumented. Presumably the destination directories of the video and audio
#        npz files for the train split -- TODO confirm.
train_vid_dir = ./data/train/video
train_aud_dir = ./data/train/audio
# [Description] NOTE(review): undocumented. Presumably the destination directories of the video and audio
#        npz files for the val split -- TODO confirm.
val_vid_dir = ./data/val/video
val_aud_dir = ./data/val/audio
# [Description] NOTE(review): undocumented. Presumably the destination directories of the video and audio
#        npz files for the test split -- TODO confirm.
test_vid_dir = ./data/test/video
test_aud_dir = ./data/test/audio
# [Description] NOTE(review): undocumented. Presumably the total number of samples distributed over the
#        three splits -- TODO confirm.
total = 60000
# [Description] NOTE(review): undocumented. Presumably the fraction of samples assigned to the val split
#        -- TODO confirm.
val_size = 0.1
# [Description] NOTE(review): undocumented. Presumably the fraction of samples assigned to the test split
#        -- TODO confirm.
test_size = 0.1
# [Description] NOTE(review): undocumented. Presumably the random seed used when splitting -- TODO confirm.
random_seed = 2020
# [Description] NOTE(review): undocumented. Presumably selects whether npz files are copied or moved into
#        the split folders (see <make_train_val_test_split>); accepted values are not shown here -- TODO confirm.
mode = copy

# ==============================================================================================================
# [Note] It is recommended not to change the preprocessing configurations below.
#        Please change them only if you are familiar with the preprocessing steps in this project.
# ==============================================================================================================

# [Description] Time position (in seconds) where the extraction starts.
start_pos = 0.0
# [Description] Time interval (in seconds) used to create one segment.
interval = 1.0
# [Description] Number of segments to extract per file.
# [Note] Please make sure that the video time length satisfies: video_time_length >= <start_pos> + <interval> * <nseg>
# [Note] The time length check will be done automatically in the preprocessing steps.
nseg = 9

# [Description] Whether RandomCrop will be applied for image augmentation.
# [Note] If True, the video frames will be resized to a (height, width) of (256, 256).
# [Note] If False, they will be resized to (224, 224), the expected input shape of ImageConvNet.
# [Warning] Only True, true, False, false will be accepted; an error will be raised otherwise.
randomcrop = True

# [Description] Sampling rate of the audio files.
# [Note] Please refer to https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.spectrogram.html.
sr = 48000
# [Description] Window size to be used in the Fourier transformation.
# [Note] Default value is 480, which is a 0.01s window when the sampling rate is 48kHz.
# [Note] Please refer to https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.spectrogram.html.
winsize = 480
# [Description] Ratio of overlap between consecutive windows, relative to the window size.
# [Note] Default value is 0.5, i.e. half of the window size, which is 480 * 0.5 = 240 when using the default <winsize>.
# [Note] Please refer to https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.spectrogram.html.
overlap = 0.5
# [Description] nfft value to be used for generating the spectrogram.
# [Note] Default value is the nearest power of two to <winsize>, which is 512.
# [Note] Please refer to https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.spectrogram.html.
nfft = 512
# [Description] Whether to convert the spectrogram into log-scale (magnitude to decibel unit).
# [Warning] Only True, true, False, false will be accepted; an error will be raised otherwise.
logscale = True
# [Description] A small value added to the spectrogram before converting it into log-scale, to prevent log of zero.
# [Note] If <logscale> is True, the spectrogram will be converted into log-scale as follows: 10 * log(spectrogram + eps)
eps = 1e-7


[train]
# related file: train.py
# please check and customize configurations before you run the related file