This repository has been archived by the owner on Oct 17, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
transcript_generator.py
296 lines (219 loc) · 12.1 KB
/
transcript_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
import os
from pydub import AudioSegment
import numpy as np
import json
import time
import db_manager
from file_names import *
from Azure import azure_file_management
from Azure import azure_transcribe
from Azure import azure_batch_transcribe
# Abbreviation -> full word used to replace it inside a phrase
# (looked up word-by-word in replace_abbreviations).
abbreviations = {
    'Ppal.': 'principal',
    'Dpto.': 'departamento',
    'N.': 'numero',
    'N.º': 'numero',
    'N.º.': 'numero',
}

# Module-level state, all starting out empty.
# NOTE(review): the functions in this module declare ``global transcript_cache``
# (without the "s"), so ``transcripts_cache`` is not the name they assign to --
# confirm whether anything outside this module still reads it.
transcripts_cache = None
previous_transcription = None

# When truthy, print_if_debugging() forwards its argument to print().
show_prints = False
def print_if_debugging(text):
    '''
    Print ``text`` only while the module-level ``show_prints`` flag is truthy.
    '''
    if not show_prints:
        return
    print(text)
def increase_sound_volume(sound, amount):
    '''
    Return the result of ``sound + amount``.

    NOTE(review): presumably ``sound`` is a pydub AudioSegment, where adding a
    number applies a gain -- confirm against the callers.
    '''
    louder_sound = sound + amount
    return louder_sound
def get_audio_duration(audio_url):
    '''
    Return the length of an audio file in seconds.

    ``audio_url`` is treated as a local path. When nothing exists at that
    path the function returns ``False`` instead of raising.
    '''
    if os.path.exists(audio_url):
        # Decode the file and report its duration
        return AudioSegment.from_file(audio_url).duration_seconds
    return False
def replace_abbreviations(phrase):
    '''
    Expand known abbreviations in ``phrase``.

    The phrase is split on single spaces; every piece that is exactly a key of
    the module-level ``abbreviations`` dict is swapped for its value, all other
    pieces are kept untouched. The pieces are re-joined with single spaces.
    '''
    expanded_words = (abbreviations.get(token, token) for token in phrase.split(" "))
    return " ".join(expanded_words)
def same_timeframe_as_previous_question(ta_row, previous_ta_row):
    '''
    Tell whether two rows describe the same (start, duration) timeframe.

    Returns ``False`` when there is no previous row. Otherwise both rows are
    passed through get_first_appeared_and_duration (default offset, no
    neighbouring rows) and the resulting tuples are compared for equality.
    '''
    if previous_ta_row is None:
        return False
    current_timeframe = get_first_appeared_and_duration(ta_row=ta_row)
    previous_timeframe = get_first_appeared_and_duration(ta_row=previous_ta_row)
    return current_timeframe == previous_timeframe
def get_first_appeared_and_duration(ta_row, first_q_offset=0, previous_ta_row=None, next_ta_row=None):
    '''
    Compute when a question starts and how long it lasts.

    ta_row / next_ta_row: mappings with the keys
        'First appeared (seconds into survey)' and 'Total duration (seconds)'.
    first_q_offset: value subtracted from every start time.
    previous_ta_row: accepted but not used by this function.

    Returns a tuple (start, duration + 1). The reported duration is replaced by
    the gap to the next question's start when that gap is shorter, unless both
    questions start at the same moment (grouped questions).
    '''
    start_key = 'First appeared (seconds into survey)'
    q_first_appeared = ta_row[start_key] - first_q_offset
    q_duration = ta_row['Total duration (seconds)']

    if next_ta_row is None:
        return q_first_appeared, q_duration + 1

    # Reported durations can be too long (back and forths between questions),
    # so cap them at the distance to the next question's start.
    next_q_first_appeared = next_ta_row[start_key] - first_q_offset
    gap_to_next = next_q_first_appeared - q_first_appeared
    starts_differ = next_q_first_appeared != q_first_appeared
    if starts_differ and gap_to_next < q_duration:
        q_duration = gap_to_next
    return q_first_appeared, q_duration + 1
def previous_transcript_to_none():
    '''
    Reset the module-level ``previous_transcription`` variable to ``None``.
    '''
    global previous_transcription
    previous_transcription = None
def tasks_are_equal(task1, task2):
    '''
    Return True when both tasks cover the same audio slice.

    Only the 'audio_url', 'offset' and 'duration' entries are compared; any
    other key (e.g. 'status') is ignored.
    '''
    compared_keys = ('audio_url', 'offset', 'duration')
    return not any(task1[key] != task2[key] for key in compared_keys)
def get_equivalent_succeded_question(transcript_tasks_db, project, case_id, q_code, task):
    '''
    Look for another question of the same case whose task matches ``task``.

    A candidate qualifies when it is a different question than ``q_code``, its
    task is equal to ``task`` according to tasks_are_equal, and its status is
    'SUCCEDED'. Returns the first qualifying question code, or None.
    '''
    case_tasks = transcript_tasks_db[project][case_id]
    for candidate_code, candidate_task in case_tasks.items():
        # Never match a question against itself
        if candidate_code == q_code:
            continue
        # Must cover exactly the same audio slice
        if not tasks_are_equal(task, candidate_task):
            continue
        # Only finished tasks have a transcript that can be reused
        if candidate_task['status'] == 'SUCCEDED':
            return candidate_code
    return None
def run_live_transcriptions(language):
    '''
    Transcribe every task whose status is 'PENDING', one after another.

    For each pending task the transcript is either copied from an already
    succeeded question of the same case with an identical task (grouped
    questions share a timeframe), or produced by chopping the audio and sending
    the chop to azure_transcribe.generate_transcript. The transcript is stored
    in the transcripts cache and the task is then marked 'SUCCEDED'; both
    databases are saved after every task.

    language: forwarded as-is to azure_transcribe.generate_transcript.

    Raises ValueError when an equivalent succeeded question exists but no
    transcript is found for it in the cache.
    '''
    # Load transcripts tasks
    global transcript_tasks_db
    transcript_tasks_db = db_manager.load_database(TRANSCRIPT_TASKS_DB_FILE_NAME)
    # Load transcripts cache
    global transcript_cache
    transcript_cache = db_manager.load_database(TRANSCRIPTS_CACHE_FILE_NAME)

    # Collect the pending tasks in one pass. Processing a task only changes the
    # status of that same task, so the set of pending tasks is fixed up front.
    pending_tasks = []
    for project, cases in transcript_tasks_db.items():
        for case_id, questions in cases.items():
            for q_code, candidate in questions.items():
                if candidate['status'] == 'PENDING':
                    pending_tasks.append((project, case_id, q_code))
    total_n_pending_tasks = len(pending_tasks)

    for task_i, (project, case_id, q_code) in enumerate(pending_tasks, start=1):
        print(f'Working task {task_i}/{total_n_pending_tasks}')
        start_time = time.time()
        task = transcript_tasks_db[project][case_id][q_code]
        transcript = None

        # Some tasks are repetitive: grouped questions have the same timeframe.
        # Rather than generating the transcript twice, copy it from the
        # equivalent question.
        equivalent_succeded_q = get_equivalent_succeded_question(transcript_tasks_db, project, case_id, q_code, task)
        if equivalent_succeded_q:
            equivalent_transcript = db_manager.get_element_from_database(transcript_cache, project, case_id, equivalent_succeded_q)
            if equivalent_transcript:
                transcript = equivalent_transcript
                print(f'For {project} {case_id} {q_code}, we will use same transcript as {equivalent_succeded_q}')
            else:
                raise ValueError(f"Succeding task does not have transcript {project} {case_id} {equivalent_succeded_q}")

        # If we did not get a transcript from an equivalent question, calculate it
        if transcript is None:
            # Create audio file for this task
            choped_wav_file_path = create_choped_wav(
                audio_url=task['audio_url'],
                offset=task['offset'],
                duration=task['duration'])
            try:
                # Generate transcript
                transcript = azure_transcribe.generate_transcript(choped_wav_file_path, language, show_debugging_prints=False)
            finally:
                # Remove the audio chop even when the transcription raised,
                # so a failed run does not leave the temporary file behind.
                os.remove(choped_wav_file_path)
            print(f'Transcript for {project} {case_id} {q_code} ready')
            print(transcript)

        # Save transcript
        db_manager.save_to_db(
            database=transcript_cache,
            database_file_name=TRANSCRIPTS_CACHE_FILE_NAME,
            project_name=project,
            case_id=case_id,
            q_code=q_code,
            element_to_save=transcript)

        # Change task status
        transcript_tasks_db[project][case_id][q_code]['status'] = 'SUCCEDED'
        db_manager.save_db(transcript_tasks_db, TRANSCRIPT_TASKS_DB_FILE_NAME)

        elapsed = time.time() - start_time
        print(f'Task took {elapsed} seconds')
        print(f"Audio file duration {task['duration']}")
        # create_choped_wav accepts duration=None (no chopping), and a reused
        # transcript can finish in ~0 s: skip the ratio instead of crashing
        # after the work has already been saved.
        if task['duration'] is not None and elapsed > 0:
            print(f"{task['duration']/elapsed} audio data processes per sec")
def create_choped_wav(audio_url, offset, duration):
    '''
    Export (a slice of) an audio file as "transcript.wav" and return that path.

    audio_url: path handed to AudioSegment.from_file.
    offset, duration: slice start and length in seconds. The slice is only
        applied when both are not None; otherwise the whole audio is exported.

    The output file name is fixed, so every call overwrites the previous chop.
    '''
    wav_path = "transcript.wav"
    sound = AudioSegment.from_file(audio_url)
    if not (offset is None or duration is None):
        # pydub works in milliseconds
        start_ms = offset*1000
        end_ms = (offset+duration)*1000
        sound = sound[start_ms:end_ms]
    # Transform to .wav and release the handle returned by export
    exported = sound.export(wav_path, format="wav")
    exported.close()
    return wav_path
def launch_transcript_tasks(trancript_engine, language):
    '''
    Start a batch transcription for every task in 'DATA_UPLOADED' status.

    The tasks database is always (re)loaded into the module-level
    ``transcript_tasks_db``. Work is only done when ``trancript_engine`` is
    'azure_batch': each uploaded task is launched through
    azure_batch_transcribe.launch_transcription (container 'mycontainer',
    locale = ``language``), its status becomes 'TRANSCRIPTION_IN_PROGRESS', the
    returned id is stored under 'transcription_id', and the database is saved.
    '''
    # Load transcripts tasks
    global transcript_tasks_db
    transcript_tasks_db = db_manager.load_database(TRANSCRIPT_TASKS_DB_FILE_NAME)

    if trancript_engine != 'azure_batch':
        return

    for project, cases in transcript_tasks_db.items():
        for case_id, questions in cases.items():
            for q_code, task in questions.items():
                # Only tasks whose audio is already uploaded can be launched
                if task['status'] != 'DATA_UPLOADED':
                    continue
                transcription_id = azure_batch_transcribe.launch_transcription(locale=language, container_name = 'mycontainer', blob_name = task['blob_name'])
                task['status'] = 'TRANSCRIPTION_IN_PROGRESS'
                task['transcription_id'] = transcription_id
                # Persist after each launch so the id is not lost on a later failure
                db_manager.save_db(transcript_tasks_db, TRANSCRIPT_TASKS_DB_FILE_NAME)
def get_transcription_results(trancript_engine):
    '''
    Collect the results of the batch transcriptions that are in progress.

    The tasks database and the transcripts cache are always (re)loaded into the
    module-level ``transcript_tasks_db`` / ``transcript_cache``. Work is only
    done when ``trancript_engine`` is 'azure_batch'. For every task in
    'TRANSCRIPTION_IN_PROGRESS' status the result is requested by its
    'transcription_id':

    * truthy result: stored in the cache, task marked 'SUCCEDED', both
      databases saved, and the task's blob deleted from 'mycontainer';
    * falsy result: task marked 'FAILED' and the tasks database saved.

    NOTE(review): whether a falsy result can also mean "not finished yet"
    depends on azure_batch_transcribe.get_transcription_result -- confirm.
    '''
    # Load transcripts tasks
    global transcript_tasks_db
    transcript_tasks_db = db_manager.load_database(TRANSCRIPT_TASKS_DB_FILE_NAME)
    # Load transcripts cache
    global transcript_cache
    transcript_cache = db_manager.load_database(TRANSCRIPTS_CACHE_FILE_NAME)

    if trancript_engine != 'azure_batch':
        return

    for project, cases in transcript_tasks_db.items():
        for case_id, questions in cases.items():
            for q_code, task in questions.items():
                if task['status'] != 'TRANSCRIPTION_IN_PROGRESS':
                    continue
                result = azure_batch_transcribe.get_transcription_result(transcription_id=task['transcription_id'])
                if result:
                    # The cache is a separate database and may not have an entry
                    # for this project/case yet: create the levels on demand
                    # instead of failing with KeyError.
                    transcript_cache.setdefault(project, {}).setdefault(case_id, {})[q_code] = result
                    task['status'] = 'SUCCEDED'
                    # Persist the transcript before the status, so an interrupted
                    # run cannot leave a SUCCEDED task without its transcript.
                    db_manager.save_db(transcript_cache, TRANSCRIPTS_CACHE_FILE_NAME)
                    db_manager.save_db(transcript_tasks_db, TRANSCRIPT_TASKS_DB_FILE_NAME)
                    # Now we can delete the blob in azure
                    azure_file_management.delete_blob(container_name = 'mycontainer', blob_name=task['blob_name'])
                else:
                    task['status'] = 'FAILED'
                    db_manager.save_db(transcript_tasks_db, TRANSCRIPT_TASKS_DB_FILE_NAME)
def upload_transcript_tasks_audio_files(trancript_engine):
    '''
    Chop and upload the audio of every task in 'PENDING' status.

    The tasks database is always (re)loaded into the module-level
    ``transcript_tasks_db``. Work is only done when ``trancript_engine`` is
    'azure_batch'. For each pending task the audio slice is exported with
    create_choped_wav and uploaded to container 'mycontainer' under the blob
    name '<project>_<case_id>_<q_code>'. The local chop is always removed.

    A task is marked 'DATA_UPLOADED' (and its 'blob_name' stored and saved)
    only when the upload reported success. On a falsy upload status an error is
    printed and the task stays 'PENDING', so it is retried on the next run
    instead of being launched against a blob that was never uploaded.

    NOTE(review): this relies on azure_file_management.upload_blob returning a
    truthy value on success, as the existing error check implies -- confirm.
    '''
    # Load transcripts tasks
    global transcript_tasks_db
    transcript_tasks_db = db_manager.load_database(TRANSCRIPT_TASKS_DB_FILE_NAME)

    if trancript_engine != 'azure_batch':
        return

    for project, cases in transcript_tasks_db.items():
        for case_id, questions in cases.items():
            for q_code, task in questions.items():
                if task['status'] != 'PENDING':
                    continue
                # Create audio file for this task
                choped_wav_file_path = create_choped_wav(
                    audio_url=task['audio_url'],
                    offset=task['offset'],
                    duration=task['duration'])
                blob_name = f'{project}_{case_id}_{q_code}'
                try:
                    # Upload file to container
                    upload_status = azure_file_management.upload_blob(
                        file_path=choped_wav_file_path,
                        container_name='mycontainer',
                        blob_name=blob_name)
                finally:
                    # Remove the audio chop even when the upload raised
                    os.remove(choped_wav_file_path)

                if not upload_status:
                    print(f'Error when uploading {project} {case_id} {q_code}')
                    continue

                # Change task status
                task['status'] = 'DATA_UPLOADED'
                task['blob_name'] = blob_name
                db_manager.save_db(transcript_tasks_db, TRANSCRIPT_TASKS_DB_FILE_NAME)