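"""Download and prepare the DIAL2MSA dataset.

DIAL2MSA pairs dialectal Arabic tweets (Egyptian, Gulf, Levantine, and Maghrebi)
with Modern Standard Arabic (MSA) translations. This script downloads the dataset
and the per-dialect distinctive-terms lists, filters low-confidence and noisy
translation pairs, and writes one tab-separated file per dialect.
"""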
import logging
import os
import zipfile
from pathlib import Path
from urllib.request import urlretrieve

import pandas as pd

logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)

BASE_DATASET_DIR = "data/DIAL2MSA"
DATASET_URL = "https://alt.qcri.org/~hmubarak/EGY-MGR-LEV-GLF-2-MSA.zip"
DISTINCTIVE_TERMS_URL = "https://alt.qcri.org/~hmubarak/EGY-MGR-LEV-GLF-StrongWords.zip"
DIALECT_ABBREV_TO_NAME_MAP = {
"EGY": "Egyptian",
"GLF": "Gulf",
"LEV": "Levantine",
"MGR": "Maghrebi",
}


def download_DIAL2MSA():
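    """Download the DIAL2MSA archives and extract them into BASE_DATASET_DIR."""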
# Create the dataset directory
os.makedirs(BASE_DATASET_DIR, exist_ok=True)
# Retrieve the dataset
dataset_filename = str(Path(BASE_DATASET_DIR, DATASET_URL.split("/")[-1]))
urlretrieve(DATASET_URL, dataset_filename)
with zipfile.ZipFile(dataset_filename, "r") as zip_ref:
zip_ref.extractall(BASE_DATASET_DIR)
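
    # Retrieve the distinctive (strong) words lists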
distinctive_terms_filename = str(
Path(BASE_DATASET_DIR, DISTINCTIVE_TERMS_URL.split("/")[-1])
)
urlretrieve(DISTINCTIVE_TERMS_URL, distinctive_terms_filename)
with zipfile.ZipFile(distinctive_terms_filename, "r") as zip_ref:
zip_ref.extractall(BASE_DATASET_DIR)


def load_distinctive_terms(dialect):
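    """Return the set of distinctive (strong) words for the given dialect name."""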
assert dialect in DIALECT_ABBREV_TO_NAME_MAP.values()
words_list_filename = str(Path(BASE_DATASET_DIR, f"{dialect}StrongWords.txt"))
    # NOTE: assuming the word lists are UTF-8 encoded
    with open(words_list_filename, "r", encoding="utf-8") as f:
        return {line.strip() for line in f}


def load_DIAL2MSA_dataset(dialect_abbrev):
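    """Load one dialect's portion of DIAL2MSA, filter it, and export it as a TSV file."""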
assert dialect_abbrev in DIALECT_ABBREV_TO_NAME_MAP.keys()
    # Prefer a converted xlsx file; fall back to the original xls if none is found
    try:
        filename = next(Path(BASE_DATASET_DIR).glob(f"{dialect_abbrev}2MSA*.xlsx"))
    except StopIteration:
        filename = next(Path(BASE_DATASET_DIR).glob(f"{dialect_abbrev}2MSA*.xls"))
df = pd.read_excel(filename)
dialect_text_column = "cleanedtweet" if "cleanedtweet" in df.columns else "tweet"
msa_text_column = "msa"
logging.info(f"Size of the original '{dialect_abbrev}' dataset is: {df.shape[0]}")
confidence_column = f"{dialect_abbrev.lower()}tomsa:confidence"
if confidence_column in df:
df = df[df[confidence_column] == 1.0].copy()
logging.info(
f"Size of '{dialect_abbrev}' dataset after dropping samples of confidence != 1.0 is: {df.shape[0]}"
)
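
    # Drop pairs whose MSA side still contains one of the dialect's distinctive terms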
dialect_name = DIALECT_ABBREV_TO_NAME_MAP[dialect_abbrev]
DIALECT_DISTINCTIVE_TERMS = load_distinctive_terms(dialect_name)
    df = df[
        ~df[msa_text_column].apply(
            lambda s: any(w in DIALECT_DISTINCTIVE_TERMS for w in str(s).split())
        )
    ]
logging.info(
f"Size of '{dialect_abbrev}' dataset after dropping translations having a distinctive term is: {df.shape[0]}"
)
    # Note: MSA translations within the dataset are not too long (< 50 tokens), making them adequate sentences
df.rename(
columns={msa_text_column: "MSA_text", dialect_text_column: "DA_text"},
inplace=True,
)
# TODO: Try to keep translations of the same tweet in the same split
df[["DA_text", "MSA_text"]].to_csv(
Path(BASE_DATASET_DIR, f"{dialect_abbrev}.tsv"), index=False, sep="\t"
)


def main():
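    """Download DIAL2MSA and prepare the dialects whose Excel files load cleanly."""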
    # TODO: The Levantine and Gulf xls files have issues.
    # The current workaround is to open each file with LibreOffice and export it as xlsx.
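    # For example (assuming a local LibreOffice installation):
    #   libreoffice --headless --convert-to xlsx --outdir data/DIAL2MSA data/DIAL2MSA/LEV2MSA*.xls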
download_DIAL2MSA()
for dialect in ["EGY", "MGR"]:
load_DIAL2MSA_dataset(dialect)


if __name__ == "__main__":
main()