-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathevaluate_pipeline_en.py
175 lines (147 loc) · 9.63 KB
/
evaluate_pipeline_en.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# Evaluation script for the English forced-alignment pipeline:
# stdlib imports first, then third-party, then project-local helpers.
import argparse
import os
# NOTE(review): hard-codes GPU #3 for every run of this script. This overrides
# whatever the user passes via --gpu / query_gpu() in setup() — confirm this
# leftover from a specific machine setup is intentional.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'
from glob import glob
from itertools import chain
from os import makedirs
from os.path import abspath, splitext, exists, basename, join
import numpy as np
import pandas as pd
from pattern3.metrics import levenshtein_similarity
from pipeline import pipeline, preprocess, vad
from util.corpus_util import get_corpus
from util.lm_util import load_lm, load_vocab
from util.log_util import create_args_str
from util.pipeline_util import query_asr_params, calculate_stats, create_demo_files, query_lm_params, update_index
from util.rnn_util import query_gpu
from util.visualization_util import visualize_pipeline_performance
# CLI definition. Fixes over the previous revision (flags, types, defaults and
# required-ness are unchanged, only the user-facing text is corrected):
#  - typo "legnth" -> "length" in the description
#  - removed misleading "(optional)" from the help of --ds_path and
#    --ds_alpha_path, which are declared required=True
#  - added the missing spaces at implicit string-concatenation boundaries that
#    produced garbled help output ("alignmentinformation", "maycontain", ...)
#  - removed the duplicated phrase "will be aligned will be aligned"
parser = argparse.ArgumentParser(description="""
Evaluate the performance of a pipeline by calculating the following values for each entry in a test set:
- C: length of unaligned text (normalized by dividing by total length of ground truth)
- O: length of overlapping alignments (normalized by dividing by total length of all alignments)
- D: average Levenshtein distance over all alignments of the current entry
The values are averaged through division by the number of entries in the test set.
""")
parser.add_argument('--source_dir', type=str, required=False,
                    help=f'Path to directory containing pairs of audio files and transcripts to evaluate on. '
                         f'The audio format must be either WAV or MP3. The transcript must be a text file. '
                         f'Apart from the file extension, both audio and transcript file must have the same name to '
                         f'be identified as a pair. '
                         f'Either this or the --corpus argument must be set.')
parser.add_argument('--corpus', type=str, required=False,
                    help=f'Corpus path or ID to use for evaluation. If set, this will override the --source_dir '
                         f'argument. The elements from the test set of the respective corpus will be used for '
                         f'evaluation. Either this or the --source_dir argument must be set.')
parser.add_argument('--language', type=str, required=False,
                    help='(optional) language to use. Only considered in conjunction with --corpus')
parser.add_argument('--target_dir', type=str, required=False,
                    help=f'Path to target directory where results will be written. '
                         f'If not set, the source directory will be used.')
parser.add_argument('--keras_path', type=str, required=True,
                    help=f'Path to root directory where Keras model is stored (*.h5 file).')
parser.add_argument('--ds_path', type=str, required=True,
                    help=f'Path to pre-trained DeepSpeech model (*.pbmm file).')
parser.add_argument('--ds_alpha_path', type=str, required=True,
                    help=f'Path to text file containing alphabet of DeepSpeech model.')
parser.add_argument('--ds_trie_path', type=str, required=False,
                    help=f'(optional) Path to binary file containing trie for DeepSpeech model. '
                         f'Required if --ds_path is set')
parser.add_argument('--lm_path', type=str, required=False,
                    help=f'(optional) Path to binary file containing KenLM n-gram Language Model')
parser.add_argument('--vocab_path', type=str, required=False,
                    help=f'(optional) Path to vocabulary for LM')
parser.add_argument('--force_realignment', action='store_true',
                    help='force realignment of partial transcript with original transcript, even if alignment '
                         'information is available from previous runs.')
parser.add_argument('--align_endings', action='store_true',
                    help='align endings of partial transcripts, not just beginnings. If set to True, transcript may '
                         'contain unaligned parts between alignments. If set to False, each alignment ends where the '
                         'next one starts.')
parser.add_argument('--norm_transcript', action='store_true',
                    help='Normalize transcript before alignment. If set to True, the alignments will be more accurate '
                         'because the transcript does not contain any punctuation, annotations and other clutter. '
                         'However, this might not reflect how the pipeline will be used. If set to False, the '
                         'partial transcripts will be aligned with the original transcript as-is, '
                         'resulting in possibly less accurate alignments, but the original transcript will not be '
                         'changed')
parser.add_argument('--gpu', type=str, required=False, default=None,
                    help='(optional) GPU(s) to use for training. If not set, you will be asked at runtime.')
# Parsed at import time; main() and setup() below read this namespace.
args = parser.parse_args()
def main(args):
    """Evaluate the Keras and DeepSpeech pipelines side by side on a set of
    audio/transcript pairs and write per-sample demos plus summary CSVs.

    For each sample, both pipelines are run on the same VAD-segmented audio.
    Per-pipeline stats are collected via calculate_stats(), the average
    Levenshtein similarity between the two pipelines' alignments is added to
    both stat frames, and demo files are written per sample. Finally the
    aggregated stats are saved as CSVs, visualized, and the demo index updated.

    :param args: parsed argparse.Namespace (see the parser defined above)
    """
    print(create_args_str(args))
    demo_files, target_dir, keras_path, ds_path, ds_alpha, ds_trie, lm_path, vocab_path, normalize, gpu = setup(args)
    # NOTE(review): `gpu` is unpacked here but never used in this function;
    # GPU selection appears to happen via CUDA_VISIBLE_DEVICES at import time.
    num_files = len(demo_files)
    print(f'Processing {num_files} audio/transcript samples. All results will be written to {target_dir}')

    lm = load_lm(lm_path) if lm_path else None
    vocab = load_vocab(vocab_path) if vocab_path else None

    stats_keras, stats_ds = [], []
    for i, (audio, transcript) in enumerate(demo_files):
        print('-----------------------------------------------------------------')
        print(f'{i + 1}/{num_files}: Evaluating pipeline on {audio}')
        print('-----------------------------------------------------------------')
        demo_id = splitext(basename(audio))[0]
        target_dir_ds = join(target_dir, demo_id + '_ds')
        target_dir_keras = join(target_dir, demo_id + '_keras')

        # `transcript` is rebound here: path in, normalized text out.
        audio_bytes, sample_rate, transcript, language = preprocess(audio, transcript, 'en', norm_transcript=normalize)
        voiced_segments = vad(audio_bytes, sample_rate)

        # DeepSpeech pipeline run.
        # NOTE(review): `lm_path=lm` passes the *loaded* LM object to a
        # parameter whose name suggests a filesystem path, while the Keras run
        # below passes `lm=lm` — verify against pipeline()'s signature.
        df_alignments_ds = pipeline(voiced_segments=voiced_segments, sample_rate=sample_rate, transcript=transcript,
                                    language='en',
                                    ds_path=ds_path, ds_alpha_path=ds_alpha, ds_trie_path=ds_trie,
                                    lm_path=lm,
                                    force_realignment=args.force_realignment, align_endings=args.align_endings,
                                    target_dir=target_dir_ds)
        df_stats_ds = calculate_stats(df_alignments_ds, ds_path, transcript)

        # Keras pipeline run on the same voiced segments and transcript.
        df_alignments_keras = pipeline(voiced_segments=voiced_segments, sample_rate=sample_rate, transcript=transcript,
                                       language='en',
                                       keras_path=keras_path, lm=lm, vocab=vocab,
                                       force_realignment=args.force_realignment, align_endings=args.align_endings,
                                       target_dir=target_dir_keras)
        df_stats_keras = calculate_stats(df_alignments_keras, keras_path, transcript)

        # average similarity between Keras and DeepSpeech alignments
        av_similarity = np.mean([levenshtein_similarity(al_keras, al_ds) for (al_keras, al_ds) in
                                 zip(df_alignments_keras['alignment'], df_alignments_ds['alignment'])])

        # The cross-pipeline similarity is recorded in both stat frames.
        df_stats_ds['similarity'] = av_similarity
        df_stats_keras['similarity'] = av_similarity
        stats_ds.append(df_stats_ds)
        stats_keras.append(df_stats_keras)

        create_demo_files(target_dir_ds, audio, transcript, df_alignments_ds, df_stats_ds)
        create_demo_files(target_dir_keras, audio, transcript, df_alignments_keras, df_stats_keras)

    # Aggregate per-sample stats and persist one CSV per pipeline.
    df_keras = pd.concat(stats_keras)
    csv_keras = join(target_dir, 'performance_keras.csv')
    df_keras.to_csv(csv_keras)

    df_ds = pd.concat(stats_ds)
    csv_ds = join(target_dir, 'performance_ds.csv')
    df_ds.to_csv(csv_ds)
    print(f'summary saved to {csv_keras}')

    visualize_pipeline_performance(csv_keras, csv_ds, silent=True)
    update_index(target_dir, lang='en', num_aligned=len(demo_files),
                 df_keras=df_keras, keras_path=keras_path,
                 df_ds=df_ds, ds_path=ds_path,
                 lm_path=lm_path, vocab_path=vocab_path)

    print(f'Done! Demos have been saved to {target_dir}')
def setup(args):
    """Validate CLI arguments and assemble everything main() needs.

    Collects the (audio, transcript) evaluation pairs either from a corpus
    test set (--corpus) or by scanning a directory for MP3/WAV files with a
    same-named .txt transcript (--source_dir), resolves the target directory
    (creating it if needed), and queries/validates model, LM and GPU settings.

    :param args: parsed argparse.Namespace (see the parser defined above)
    :return: tuple of (demo_files, target_dir, keras_path, ds_path,
             ds_alpha_path, ds_trie_path, lm_path, vocab_path,
             norm_transcript flag, gpu)
    :raises ValueError: if neither --source_dir nor --corpus is set, or if
             --corpus is set without --target_dir
    """
    if not args.source_dir and not args.corpus:
        raise ValueError('ERROR: Either --source_dir or --corpus must be set!')
    if args.corpus and not args.target_dir:
        raise ValueError('ERROR: If --corpus is set the --target_dir argument must be set!')

    if args.corpus:
        corpus = get_corpus(args.corpus, args.language)
        # Deduplicate via set(): several test samples may reference the same corpus entry.
        demo_files = [(entry.audio_path, entry.transcript_path) for entry in set(s.entry for s in corpus.test_set())]
        target_dir = abspath(args.target_dir)
    else:
        source_dir = abspath(args.source_dir)
        target_dir = abspath(args.target_dir) if args.target_dir else source_dir
        demo_files = []
        # Scan for MP3 and WAV files; keep only those with a same-named .txt transcript.
        for audio_file in chain.from_iterable(glob(e) for e in (f'{source_dir}/*.{ext}' for ext in ('mp3', 'wav'))):
            transcript_file = splitext(audio_file)[0] + '.txt'
            if exists(transcript_file):
                print(f'adding: {basename(audio_file)} / {basename(transcript_file)}')
                demo_files.append((audio_file, transcript_file))

    # exist_ok=True replaces the racy `if not exists(...): makedirs(...)` pattern
    # (the directory could be created between the check and the call).
    makedirs(target_dir, exist_ok=True)

    keras_path, ds_path, ds_alpha_path, ds_trie_path = query_asr_params(args)
    lm_path, vocab_path = query_lm_params(args)
    gpu = args.gpu if args.gpu else query_gpu()

    return demo_files, target_dir, keras_path, ds_path, ds_alpha_path, ds_trie_path, lm_path, vocab_path, args.norm_transcript, gpu
# Script entry point; `args` was parsed at module level above.
if __name__ == '__main__':
    main(args)