-
Notifications
You must be signed in to change notification settings - Fork 1
/
b_filter_ds.py
61 lines (39 loc) · 1.18 KB
/
b_filter_ds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# %%
import numpy as np
import matplotlib.pyplot as plt
from utils import read_lines_from_file, write_lines_to_file
from utils.letters import numerals, numerals_ar, abc_latin
from tqdm import tqdm
# Character inventories used to screen transcripts.
# The three imported inventories are combined with `+`, so they are assumed to
# be same-typed sequences (e.g. strings or lists of characters) -- TODO confirm.
_all_numerals = numerals + numerals_ar
numerals_all = set(_all_numerals)  # both numeral inventories
num_lat_all = set(_all_numerals + abc_latin)  # numerals plus abc_latin entries
# %%
# Duration bounds in seconds (lower, upper) applied to each segment.
t_min_secs, t_max_secs = 0, 25
# %%
# Field delimiter used when splitting each segment line.
sep = '\t'
# Load the segment list that will be filtered.
segment_lines = read_lines_from_file('./data/all_segments.txt')
# %%
# Walk every segment line, record its duration and the characters of its
# utterance, and sort the line into a keep or remove bucket.
durations = []      # duration in seconds of every parsed segment (kept or not)
utt_g1 = []         # never filled here; retained so the name still exists
letters = set()     # every distinct character seen in any utterance
segments_rm = []    # lines rejected by the duration or character filter
segments_keep = []  # lines that pass both filters
for line in tqdm(segment_lines):
    # Skip blank lines regardless of whether the reader kept the line ending
    # ('\n', '\r\n', '' and whitespace-only lines are all treated as blank).
    if not line.strip():
        continue
    # Fields: wav id, start time, end time, one ignored field, then the text.
    # A line with fewer than four fields still raises ValueError here.
    wavid, t_start, t_end, _, *utterance = line.split(sep)
    dur_secs = float(t_end) - float(t_start)
    durations.append(dur_secs)
    # join() handles zero, one or many trailing fields; indexing [0] used to
    # raise IndexError when a line carried no utterance field at all.
    utterance = ' '.join(utterance)
    letters.update(utterance)
    duration_ok = t_min_secs < dur_secs < t_max_secs  # strict on both ends
    has_banned_char = not num_lat_all.isdisjoint(utterance)
    if duration_ok and not has_banned_char:
        segments_keep.append(line)
    else:
        segments_rm.append(line)
durations_np = np.array(durations)
# %%
# Write out only the segments that passed the filter.
filtered_path = './data/all_segments_filt_nonumlat.txt'
write_lines_to_file(filtered_path, segments_keep)
# %%
# Histogram of the strictly positive segment durations.
positive_durations = durations_np[durations_np > 0]
plt.hist(positive_durations)
# Bare expression: presumably displayed as the cell output when run
# interactively, to inspect the rejected lines.
segments_rm