'''
Created in October 2022
This script generates synthetic data based on a character alignment matrix and a corpus.
Make sure that config.json points to the correct files before running.
- Sina Ahmadi (last updated December 2022)
'''
import json
import random
import re

import regex
from klpt.preprocess import Preprocess
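# The layout assumed below for config.json and info.json is illustrative only, reconstructed
# from the keys accessed in this script; adapt the paths to the actual repository layout:
#
#   config.json: a list of objects such as
#     {"source_language": "Sorani", "target_language": "...", "script_map": "data/scripts/....tsv",
#      "corpus": "data/corpora/....txt", "datasets": "data/datasets/..."}
#
#   info.json: per-language flags such as
#     {"Sorani": {"zwnj": ..., "diacritics": ...}, ...}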
def tsv_to_dict(text):
    # convert the tab-separated script map to a dictionary: {source letter: [target letters]}
    text_dict = dict()
    for i in text:
        fields = i.split("\t")
        i_s = fields[0]  # source letter
        if i_s not in text_dict:
            text_dict[i_s] = list()
        for i_t in fields[1:]:  # target letters
            if i_t != "":
                if i_t == "NULL":
                    i_t = ""  # NULL in the map means the source letter is dropped
                if i_t not in text_dict[i_s]:
                    text_dict[i_s].append(i_t)
    return text_dict
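# Illustrative (hypothetical) script-map line: "ب\tب\tٮ\tNULL" would yield
# {"ب": ["ب", "ٮ", ""]}, i.e. the source letter may be kept, mapped to a variant, or dropped.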
def preprocess_corpus(text):
    # clean a corpus and return a dataset
    # unify numerals to Latin
    preprocessor_ckb = Preprocess("Sorani", "Arabic", numeral="Latin")
    text = preprocessor_ckb.unify_numerals(text)
    # clean the corpus by removing dotted abbreviations (patterns like x.x.x.x and x.x.x)
    clean_text = re.sub(r".\..\..\..", "", text)
    clean_text = re.sub(r".\..\..", "", clean_text)
    # split sentences on sentence-final punctuation (the space-like character in the first
    # replacement is assumed to be a non-breaking space)
    clean_text = clean_text.replace("\u00a0", " ").replace("؟ ", "\n").replace("! ", "\n").replace(": ", "\n").replace("* ", " ").replace("۔", "\n").replace(". ", "\n")
    # remove dates
    clean_text = clean_text.replace(" / ", "/").replace(" . ", ".").replace("...", ".")
    clean_text = re.sub(r"([1-9]|0[1-9]|[12][0-9]|3[01])[- /.]([1-9]|0[1-9]|1[012])[- /.]\d\d\d\d", "", clean_text)
    # remove links
    clean_text = re.sub(r'https?:\/\/.*[\r\n]*', '', clean_text, flags=re.MULTILINE)
    return clean_text
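# Illustrative behaviour of preprocess_corpus: sentence-final punctuation followed by a space
# ("؟ ", "! ", ". ") and the full stop "۔" become newlines, so every line of the returned text
# is roughly one sentence; dotted abbreviations, dates and URLs are removed along the way.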
def generate(text, character_map, noise_percentage=100):
    # shuffle the character map with a fixed seed so that runs are reproducible
    keys = list(character_map.keys())
    random.Random(10).shuffle(keys)
    character_map = {key: character_map[key] for key in keys}
    # determine how many unique characters should be mapped to noisy equivalents
    # in order to reach the requested noise level
    text_set = set(text)
    num_replacements = round(len(text_set) * noise_percentage / 100)
    added_noise = 0
    for i in text_set:
        if added_noise > num_replacements:
            break
        if i in character_map:
            # note: this could be refined to take the length of the replacements into
            # account, applying longer replacements first and shorter ones afterwards
            text = text.replace(i, random.choice(character_map[i]))
            added_noise += 1
    if added_noise == 0:
        return None
    return text.replace("▁", "")
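# Illustrative example: for a sentence with 30 unique characters and noise_percentage=40,
# round(30 * 40 / 100) = 12, so at most ~12 of the unique characters (those that also appear
# in the character map) are replaced with randomly chosen noisy equivalents.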
def save_datasets(dataset, noisiness, save_path):
    # save as train, dev and test splits (80/10/10)
    split_name_ratio = {"train": (0, int(len(dataset) * 80 / 100)),
                        "dev": (int(len(dataset) * 80 / 100), int(len(dataset) * 90 / 100)),
                        "test": (int(len(dataset) * 90 / 100), len(dataset))}
    for d in split_name_ratio:
        with open(save_path + "/%s/%s.src" % (noisiness, d), "w") as f:
            f.write("\n".join([m[0] for m in dataset[split_name_ratio[d][0]: split_name_ratio[d][1]]]))
        with open(save_path + "/%s/%s.trg" % (noisiness, d), "w") as f:
            f.write("\n".join([m[1] for m in dataset[split_name_ratio[d][0]: split_name_ratio[d][1]]]))
    print("Saved!")
def clean_text(text, has_zwnj=False, has_diacritics=False):
    if not has_zwnj:
        text = text.replace("\u200c", "")  # remove the zero-width non-joiner (ZWNJ)
    if not has_diacritics:
        for i in ["ً", "ِ", "ٌ", "ُ", "ّ", "ٍ", "ْ", "ء"]:
            text = text.replace(i, "")
    # replace remaining invisible formatting characters with spaces (the exact code points
    # here are assumed) and remove the tatweel (kashida)
    return text.replace("\u200b", " ").replace("\u200f", " ").replace("ـ", "")
if __name__ == '__main__':
    with open("config.json", "r") as f:
        configs = json.load(f)
    with open("../data/scripts/info.json", "r") as f:
        info = json.load(f)

    for config in configs:
        print(config["source_language"], " ==== ", config["target_language"])
        with open("../" + config["script_map"], "r") as f:
            script_map = f.read().splitlines()[1:]  # skip the header row
        # convert the tsv format of the script mapping to a dictionary
        script_map = tsv_to_dict(script_map)
        # print(generate("ئەمە بۆ تست کردنە.", script_map))
        # read the corpus
        with open("../" + config["corpus"], "r") as f:
            corpus = f.read()
        # clean the corpus and split it into sentences
        corpus = preprocess_corpus(corpus).splitlines()
        # create data instances of fewer than 20 tokens (space-delimited); sentences longer
        # than that are chunked into shorter segments below
        corpus_sent = list()
        for i in corpus:
            # clean i by removing non-Perso-Arabic text
            latin = regex.sub(r'[^\p{Latin}]', ' ', i).split()  # words in the Latin script
            hindi = re.findall(r'[\u0900-\u097f\ua8e0-\ua8ff]+', i)  # words in Devanagari
            for l in latin + hindi:
                i = i.replace(l, " ")
            # add a space before and after punctuation marks
            for c in ".؟،!؛:ː۔":
                i = i.replace(c, " " + c + " ")
            i = " ".join(i.split())
            # remove symbols that should not appear in the data
            for c in "-ـ<>«»(){}[]/+٪'\"$&ː… ͡":
                i = i.replace(c, " ")
            i = " ".join(i.split())
            if len(i.split()) >= 3 and len(i.split()) < 20 and len(i) >= 10:
                if "http" not in i and "www" not in i and "@" not in i:
                    corpus_sent.append(i.strip())
            elif len(i.split()) > 20 and len(i) >= 10:
                # chunk long sentences into segments of up to 10 tokens
                tokens = i.split()
                for j in range(0, len(tokens), 10):
                    i_j = " ".join(tokens[j:j+10])
                    if len(i_j.split()) > 5 and len(i_j) >= 10:
                        if "http" not in i_j and "www" not in i_j and "@" not in i_j:
                            corpus_sent.append(i_j.strip())
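        # Illustrative example of the chunking above: a 35-token sentence yields segments of
        # 10, 10, 10 and 5 tokens; the final 5-token segment is discarded by the "> 5" filter.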
print("Size of the corpus initially: ", len(corpus))
print("Size of the corpus after cleaning: ", len(corpus_sent))
print("# Generating data...")
# generate synthetic data, clean the target text and save the dataset
preprocessor_ckb = Preprocess("Sorani", "Arabic", numeral="Latin")
        preprocessor_kmr = Preprocess("Kurmanji", "Arabic", numeral="Latin")
        for n in [20, 40, 60, 80, 100]:
            print("Generating synthetic data with a noise level of", n)
            synth_dataset = list()
            for i in corpus_sent:
                # tuples like (noisy sentence -- source, clean sentence -- target)
                synth_i = generate(i, script_map, noise_percentage=n)
                if synth_i is not None:
                    # clean the target
                    clean_i = clean_text(i, has_zwnj=info[config["source_language"]]["zwnj"], has_diacritics=info[config["source_language"]]["diacritics"])
                    if config["source_language"] == "Sorani":
                        clean_i = preprocessor_ckb.preprocess(clean_i)
                    elif config["source_language"] == "Kurmanji":
                        clean_i = preprocessor_kmr.preprocess(clean_i)
                    synth_dataset.append((synth_i, clean_i))
            save_datasets(list(set(synth_dataset)), str(n), "../" + config["datasets"])
print("# Generating merged data...")
# merge all the datasets with various noise % and save in the "all" folder
merged_data_src, merged_data_tgt = list(), list()
for m in ["train", "dev", "test"]:
for n in [20, 40, 60, 80, 100]:
with open("../" + config["datasets"] + "/%s/%s.src"%(n, m), "r") as f:
merged_data_src.append(f.read())
with open("../" + config["datasets"] + "/%s/%s.trg"%(n, m), "r") as f:
merged_data_tgt.append(f.read())
merged_data_src = "\n".join(merged_data_src).splitlines()
merged_data_tgt = "\n".join(merged_data_tgt).splitlines()
merged_data = list(set([(merged_data_src[i], merged_data_tgt[i]) for i in range(len(merged_data_src))]))
print("Total number of merged data instances: ", len(merged_data), len(merged_data))
save_datasets(merged_data, "1", "../" + config["datasets"])
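# Usage note (assumed from the relative paths above): run the script from the directory that
# contains config.json, one level below the repository root, e.g.:
#   python synthesize.py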