-
Notifications
You must be signed in to change notification settings - Fork 1
/
text_module.py
161 lines (122 loc) · 7.05 KB
/
text_module.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import argparse
from glob import glob
from os.path import join, dirname, isfile
from tqdm import tqdm
from text_tools.search_substring_with_threads import get_transcripts, text_cleaning, execute_threads_search_substring_by_char, execute_threads_search_substring_by_word
from text_tools.create_structure_folders import change_structure_folders
from text_tools.insert_punctuation import insert_punctuation_on_substring
from utils.download_dataset import download_language_dataset, download_books_dataset, extract_transcript_files, extract_book_files
from utils.utils import abbrev2language
def _read_previous_search_results(output_file, separator):
    '''
    Index the results of a previous run by file name.

    Returns a dict mapping the stripped first field of each well-formed line
    of output_file to a (text, text_result, similarity) tuple of stripped
    strings. The dict is empty when output_file does not exist. Lines that do
    not have exactly four fields, or whose similarity is not a number, are
    ignored so that the corresponding transcript is simply searched again.
    '''
    previous_results = {}
    if not isfile(output_file):
        return previous_results

    with open(output_file) as f:
        for line in f:
            fields = [field.strip() for field in line.strip().split(separator)]
            if len(fields) != 4:
                continue
            filename, text, text_result, similarity = fields
            try:
                float(similarity)
            except ValueError:
                continue
            # Keep the first occurrence when a file name is repeated.
            previous_results.setdefault(filename, (text, text_result, similarity))
    return previous_results


def search_substring_with_punctuation(language_abbrev, transcript_file, complete_text_file,
                                      search_type, output_file, number_threads,
                                      similarity_threshold=0.9):
    '''
    Perform substring search only for files with low similarity.

    For every transcript, the matching passage is searched in the complete
    book text and one line is written to output_file in the form
    "filename|transcript text|text found|similarity". When output_file
    already exists, a stored result is copied instead of being searched again
    if search_type is 'word' or its similarity is above similarity_threshold.

    Args:
        language_abbrev: language abbreviation, forwarded to the search helpers.
        transcript_file: path of the transcripts file, parsed by get_transcripts.
        complete_text_file: path of the complete book text.
        search_type: 'char' selects the search by char; any other value
            selects the search by word.
        output_file: path of the results file; read if it exists, then rewritten.
        number_threads: number of threads, converted with int().
        similarity_threshold: stored similarities above this value are reused
            when search_type is not 'word'.
    '''
    separator = '|'

    with open(transcript_file) as f:
        transcripts_text = f.readlines()

    # Must be read before output_file is reopened for writing below.
    previous_results = _read_previous_search_results(output_file, separator)

    with open(complete_text_file) as f:
        book_text = f.read()

    # Cleaning complete text
    book_text = text_cleaning(book_text)

    # Create ordered dict from transcripts list
    transcripts_dict = get_transcripts(transcripts_text)

    if search_type == 'char':
        search_substring = execute_threads_search_substring_by_char
    else:
        search_substring = execute_threads_search_substring_by_word

    total_similarity = 0
    processed = 0

    # The output file is only truncated once all inputs were loaded, and it is
    # closed (and flushed) even if a search raises.
    with open(output_file, "w") as output_f:
        # Iterates over each transcription
        for filename, text in tqdm(transcripts_dict.items()):
            print('Processing {}'.format(filename))
            processed += 1

            # Search for sentences already executed. The file name is compared
            # with the whole first field, not as a substring of the line.
            previous = previous_results.get(filename.strip())
            if previous is not None:
                previous_text, previous_result, previous_similarity = previous
                if search_type == 'word' or float(previous_similarity) > similarity_threshold:
                    # Some information
                    print(previous_text)
                    print(previous_result)
                    print(previous_similarity)
                    total_similarity += float(previous_similarity)
                    line = separator.join([filename.strip(), previous_text, previous_result, previous_similarity + '\n'])
                    output_f.write(line)
                    continue

            # The returned start position is not used: every search starts at 0.
            text_result, similarity, _ = search_substring(language_abbrev, text, book_text, start_position=0, similarity_metric='hamming', total_threads=int(number_threads))

            if not text_result:
                text_result = ''

            total_similarity += similarity

            # Some information
            print(text.strip())
            print(text_result.strip())
            print(similarity)

            # Write to file
            line = separator.join([filename.strip(), text.strip(), text_result.strip(), str(similarity) + '\n'])
            output_f.write(line)

    # Mean over the transcripts actually processed; 0.0 when there were none.
    mean_similarity = total_similarity / processed if processed else 0.0
    print('Mean Similarity: {}'.format(mean_similarity))
def execution_text_convertion_pipeline(language_abbrev, input_folder, books_folder, search_type, threads_number):
    '''
    Run the complete text conversion pipeline for one language.

    Steps: download and extract the transcripts, download and extract the
    books, restructure the folders of each transcript file, search each
    transcript in its book and insert the punctuation of the text found.

    Args:
        language_abbrev: language abbreviation; looked up in abbrev2language
            and forwarded to the download, search and punctuation helpers.
        input_folder: not used by this function; kept for compatibility.
        books_folder: the value received is not used; the folder returned by
            extract_book_files replaces it.
        search_type: forwarded to search_substring_with_punctuation.
        threads_number: number of threads, converted with int().

    Returns:
        False when a download returns no file name, None otherwise.
    '''
    # Local import: basename is not among the names imported at file level.
    from os.path import basename

    language = abbrev2language[language_abbrev]

    print('Downloading {} dataset tar.gz file...'.format(language_abbrev))
    transcripts_tar_filename = download_language_dataset(lang=language_abbrev)
    if not transcripts_tar_filename:
        return False

    print('Extracting files {}...'.format(transcripts_tar_filename))
    transcript_files_list = extract_transcript_files(transcripts_tar_filename)

    print('Downloading {} books tar.gz file...'.format(language_abbrev))
    books_tar_filename = download_books_dataset(lang=language_abbrev)
    # Same guard as for the transcripts: nothing can be extracted without a file.
    if not books_tar_filename:
        return False

    print('Extracting files {}...'.format(books_tar_filename))
    books_folder = extract_book_files(books_tar_filename)

    # Each transcript file is processed completely (restructuring, search and
    # punctuation) inside its own 'audio' folder.
    for transcript_file in transcript_files_list:
        print('Executing {} file'.format(transcript_file))

        # Run folder restructuring
        output_folder = join(dirname(transcript_file), 'audio')
        change_structure_folders(transcript_file, output_folder)

        # Run substring search in books
        for book_transcript_file in glob(output_folder + '/**/**/transcripts.txt'):
            # Defining output filepath
            output_filepath = join(dirname(book_transcript_file), 'output_search.txt')
            # The text book is named after the folder that holds transcripts.txt
            book_file = basename(dirname(book_transcript_file))
            complete_text_file = join(books_folder, language, book_file + '.txt')
            # search_substring_with_punctuation reads an existing output file
            # itself, so first and repeated executions are the same call.
            search_substring_with_punctuation(language_abbrev, book_transcript_file, complete_text_file, search_type, output_filepath, int(threads_number))

        # Insert punctuation of the found substring in the transcript text
        for metadata_search in tqdm(glob(output_folder + '/**/**/output_search.txt')):
            # Defining output filepath
            output_filepath = join(dirname(metadata_search), 'output_result.txt')
            # Run insertion
            insert_punctuation_on_substring(language_abbrev, metadata_search, output_filepath)

    print("Finished text conversion.")
def main():
    '''
    Parse the command line arguments and run the text conversion pipeline.

    The input and books folders are joined to --base_dir before being passed
    on. --metric and --sequenced_text are accepted but are not forwarded to
    the pipeline.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--base_dir', default='./')
    parser.add_argument('-l', '--language', default='pt',
                        help='Options: pt (portuguese), pl (polish), it (italian), sp (spanish), fr (french), du (dutch), ge (german), en (english)')
    parser.add_argument('-m', '--metric', default='hamming', help='Options: hamming (low accuracy, low computational cost), levenshtein (high accuracy, high computational cost) or ratcliff (average accuracy, average computational cost)')
    parser.add_argument('-i', '--input_folder', default='./input/dev')
    parser.add_argument('-c', '--books_folder', default='./lv_text/portuguese/')
    # type=int rejects an invalid value here, before anything is downloaded.
    parser.add_argument('-n', '--threads_number', default=4, type=int)
    parser.add_argument('-t', '--search_type', default='word', help='Options: word or char')
    parser.add_argument('-s', '--sequenced_text', action='store_true', default=False)
    args = parser.parse_args()

    input_folder = join(args.base_dir, args.input_folder)
    books_folder = join(args.base_dir, args.books_folder)

    execution_text_convertion_pipeline(args.language, input_folder, books_folder, args.search_type, args.threads_number)


# Run the pipeline only when the file is executed as a script.
if __name__ == "__main__":
    main()