Skip to content

Commit 1e3f44f

Browse files
committed
add ner keyword fields and shorten test xml
1 parent 81cd3c8 commit 1e3f44f

File tree

4 files changed

+69
-44900
lines changed

4 files changed

+69
-44900
lines changed

backend/corpora/parliament/conftest.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,9 +284,13 @@ def parliament_corpora_settings(settings):
284284
"page": None,
285285
"url": None,
286286
"sequence": 1,
287+
"ner:location": [],
288+
"ner:miscellaneous": [],
289+
"ner:organization": ["Economische Zaken"],
290+
"ner:person": [],
287291
}
288292
],
289-
"n_documents": 98,
293+
"n_documents": 2,
290294
"start": datetime(2015, 1, 1),
291295
},
292296
{

backend/corpora/parliament/netherlands.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
ner_keyword_field,
1717
party_attribute_extractor,
1818
person_attribute_extractor,
19-
detokenize_parlamint,
19+
extract_speech,
2020
speech_ner,
2121
)
2222
from corpora.utils.formatting import format_page_numbers
@@ -104,6 +104,21 @@ def get_sequence_recent(id):
104104
return int(match.group(1))
105105

106106

107+
def extract_named_entities(xml_file: str) -> dict:
    """Collect the named-entity annotations of every speech in an XML file.

    Each ``<u>`` element is treated as one speech. For every ``<name>``
    element inside it, the text of its ``<w>`` children is joined with single
    spaces and filed under the value of the element's ``type`` attribute.

    Args:
        xml_file: Path of the XML file to read.

    Returns:
        A dict mapping each speech's ``xml:id`` to a dict with the keys
        ``"LOC"``, ``"MISC"``, ``"ORG"`` and ``"PER"``, each holding the list
        of entity strings of that type in document order.

    Raises:
        KeyError: If a ``<u>`` element has no ``xml:id`` attribute.
    """
    # Read bytes so the parser can honour the encoding declared in the file
    # instead of decoding with the platform's locale default.
    with open(xml_file, "rb") as f:
        # NOTE(review): no parser is named here, so bs4 chooses one itself;
        # consider pinning the parser used elsewhere in the project.
        soup = bs4.BeautifulSoup(f)

    output = {}
    for speech in soup.find_all("u"):
        annotations_dict = {"LOC": [], "MISC": [], "ORG": [], "PER": []}
        for annotation in speech.find_all("name"):
            entity_type = annotation.get("type")
            # A <name> without a type, or with a type outside the four
            # categories above, is skipped rather than aborting the file.
            if entity_type not in annotations_dict:
                continue
            # get_text() always yields a str, whereas .string is None for a
            # <w> with nested or empty content and would break the join.
            annotated = " ".join(
                word.get_text() for word in annotation.find_all("w")
            )
            annotations_dict[entity_type].append(annotated)
        output[speech["xml:id"]] = annotations_dict
    return output
120+
121+
107122
class ParliamentNetherlandsNew(XMLCorpusDefinition):
108123
min_date = datetime(year=2015, month=1, day=1)
109124
max_date = datetime(year=2022, month=12, day=31)
@@ -126,6 +141,7 @@ def sources(self, start: datetime, end: datetime):
126141
}
127142
for year in range(start.year, end.year):
128143
for xml_file in glob("{}/{}/*.xml".format(self.data_directory, year)):
144+
metadata["ner"] = extract_named_entities(xml_file)
129145
yield xml_file, metadata
130146

131147
country = field_defaults.country()
@@ -181,10 +197,9 @@ def sources(self, start: datetime, end: datetime):
181197
speech = field_defaults.speech(language="nl")
182198
speech.extractor = XML(
183199
Tag("seg"),
184-
Tag(["w", "pc"]),
185200
multiple=True,
186-
extract_soup_func=lambda x: x,
187-
transform=detokenize_parlamint,
201+
extract_soup_func=extract_speech,
202+
transform=lambda x: "\n".join(x),
188203
)
189204

190205
speech_id = field_defaults.speech_id()

0 commit comments

Comments
 (0)