-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1-conllu-to-df.py
200 lines (145 loc) · 6.01 KB
/
1-conllu-to-df.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import os
import argparse
import zipfile
import gzip
import shutil
if __name__ == '__main__':
    # The language code (e.g. "CZ") is the only command-line argument.
    parser = argparse.ArgumentParser()
    parser.add_argument("lang_code", help="lang code used in the files")
    args = parser.parse_args()

# Define the language code, used in the file names
#lang_code = "CZ"
lang_code = args.lang_code

# Unzip the ZIP folder with the files
#with zipfile.ZipFile("/home/tajak/Parlamint-translation/Source-data/ParlaMint-{}/ParlaMint-{}.conllu.zip".format(lang_code, lang_code), 'r') as zip_ref:
#    zip_ref.extractall("/home/tajak/Parlamint-translation/Source-data/ParlaMint-{}/ParlaMint-{}.conllu".format(lang_code, lang_code))

# Unzip the TGZ file: write to command line: `tar -xf dir_name`

# Main path
main_path = "/home/tajak/Parlamint-translation"

# Check in your directory whether the path to the folder with conllu files is ok:
path = "{}/Source-data/ParlaMint-{}.conllu/ParlaMint-{}.conllu".format(main_path, lang_code, lang_code)

# ------------NO CHANGING OF THE CODE NEEDED FROM NOW ONWARDS--------------

from knockknock import discord_sender

# Create the results folder for this language (e.g. results/CZ) together with
# its "temp" subfolder. makedirs(..., exist_ok=True) creates missing parent
# folders and does not fail when the folders already exist, so the script can
# be re-run for the same language.
results_dir = "{}/results/{}".format(main_path, lang_code)
os.makedirs("{}/temp".format(results_dir), exist_ok=True)

# Define final path
extracted_dataframe_path = "{}/results/{}/ParlaMint-{}-extracted-source-data.csv".format(main_path, lang_code, lang_code)
# Walk the sub-folders of `path` and collect, in parallel lists, the full path
# and the bare file name of every CoNLL-U file with a parliamentary session.
parl_list = []
file_name_list = []

# Session files carry the "ParlaMint-<lang_code>_" marker in their name.
session_prefix = "ParlaMint-{}_".format(lang_code)

for entry in os.listdir(path):
    subdir = os.path.join(path, entry)
    # Only folders are searched; plain files directly under `path` are skipped.
    if not os.path.isdir(subdir):
        continue
    for name in os.listdir(subdir):
        if session_prefix in name and ".conllu" in name:
            parl_list.append("{}/{}".format(subdir, name))
            file_name_list.append(name)

# See how many files we have:
print("No. of files: {}.".format(len(parl_list)))
# Get notified once the code ends: read the Discord webhook URL that is passed
# to the discord_sender decorator. The context manager closes the key file
# right after reading instead of leaving the handle open.
with open("/home/tajak/Parlamint-translation/discord_key.txt", "r") as key_file:
    webhook_url = key_file.read()
def _sentence_tokens_and_persons(sentence):
    """
    Return the tokenized text and the personal-name map of one parsed sentence.

    Args:
    - sentence: iterable of token mappings with the keys "id", "form", "lemma"
        and "misc" (one parsed CoNLL-U sentence).

    Returns:
    - (tokenized_text, proper_nouns): the word forms joined with single spaces,
        and a dict mapping the 0-based word index to [form, lemma] for every
        word whose NER annotation contains "PER".
    """
    forms = []
    person_names = {}
    # NER label of the most recent multiword token. It starts empty for every
    # sentence, so a word without its own NER annotation never picks up a label
    # from a previous sentence (and never hits an unbound variable).
    multiword_ner = ""

    for token in sentence:
        # "misc" is None when the column is empty; treat that as "no annotations".
        misc = token["misc"] or {}

        # Tokens whose id is not a plain integer are treated as multiword
        # tokens: they are left out of the tokenized text (their subwords are
        # appended instead) and only their NER label is remembered.
        if not isinstance(token["id"], int):
            multiword_ner = misc.get("NER") or ""
            continue

        forms.append(token["form"])

        # If the word has no NER annotation of its own, take the annotation
        # from the multiword token.
        ner = misc.get("NER") or multiword_ner

        # Add information on the lemma if the NE is a personal name. One is
        # subtracted from the word index because indexing in the CONLLU file
        # starts with 1, not 0.
        if "PER" in ner:
            person_names[token["id"] - 1] = [token["form"], token["lemma"]]

    return " ".join(forms), person_names


@discord_sender(webhook_url=webhook_url)
def conllu_to_df(parl_list, file_name_list, extracted_dataframe_path):
    """
    Take the conllu files and extract relevant information. Save everything in a DataFrame.

    Every sentence becomes one row with the columns "file_path", "file",
    "sentence_id", "text", "tokenized_text", "proper_nouns" and "length"
    (number of whitespace-separated words in "text"). The DataFrame is written
    to extracted_dataframe_path as a tab-separated file and a summary is printed.

    Args:
    - parl_list: list of documents with their entire paths to be included.
    - file_name_list: list of names of the files, in the same order as parl_list.
    - extracted_dataframe_path: path to the output file

    Returns:
    - the extracted DataFrame.

    Raises:
    - ValueError: if file_name_list has fewer entries than parl_list.
    """
    from conllu import parse
    import pandas as pd

    # The two lists are paired by position; a missing name would otherwise
    # silently drop the trailing documents.
    if len(file_name_list) < len(parl_list):
        raise ValueError(
            "file_name_list has {} entries but parl_list has {}.".format(
                len(file_name_list), len(parl_list)))

    columns = ["file_path", "file", "sentence_id", "text", "tokenized_text", "proper_nouns"]

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end; concatenating a DataFrame per file copies all previous rows on
    # every iteration.
    rows = []

    # Parse the data with CONLL-u parser.
    # To find files that fail to parse, wrap the parse() call below in a
    # try/except and collect the offending paths.
    for doc, file_name in zip(parl_list, file_name_list):
        # CoNLL-U files are UTF-8 encoded; do not depend on the locale default.
        with open(doc, "r", encoding="utf-8") as conllu_file:
            sentences = parse(conllu_file.read())

        for sentence in sentences:
            # The "text" metadata keeps multiword tokens as they are, not
            # separated into subwords; the tokenized text holds the subwords.
            tokenized_text, proper_nouns = _sentence_tokens_and_persons(sentence)
            rows.append((
                doc,
                file_name,
                sentence.metadata["sent_id"],
                sentence.metadata["text"],
                tokenized_text,
                proper_nouns,
            ))

    df = pd.DataFrame(rows, columns=columns)

    # Add information on length
    df["length"] = df["text"].str.split().str.len()

    print("Number of words in the corpora: {}".format(df["length"].sum()))

    # Save the dataframe
    df.to_csv("{}".format(extracted_dataframe_path), sep="\t")

    print("Dataframe saved as {}".format(extracted_dataframe_path))

    # Show the results
    print(df.describe(include="all").to_markdown())

    print("\n\n\n")

    print(df.head().to_markdown())

    print("\n\n\n")

    return df
# Extract information from the conllu files found above; the function also
# saves the result to extracted_dataframe_path.
df = conllu_to_df(parl_list, file_name_list, extracted_dataframe_path)