Skip to content

Commit

Permalink
add ner keyword fields and shorten test xml
Browse files Browse the repository at this point in the history
  • Loading branch information
BeritJanssen committed Oct 24, 2024
1 parent 81cd3c8 commit 1e3f44f
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 44,900 deletions.
6 changes: 5 additions & 1 deletion backend/corpora/parliament/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,9 +284,13 @@ def parliament_corpora_settings(settings):
"page": None,
"url": None,
"sequence": 1,
"ner:location": [],
"ner:miscellaneous": [],
"ner:organization": ["Economische Zaken"],
"ner:person": [],
}
],
"n_documents": 98,
"n_documents": 2,
"start": datetime(2015, 1, 1),
},
{
Expand Down
23 changes: 19 additions & 4 deletions backend/corpora/parliament/netherlands.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
ner_keyword_field,
party_attribute_extractor,
person_attribute_extractor,
detokenize_parlamint,
extract_speech,
speech_ner,
)
from corpora.utils.formatting import format_page_numbers
Expand Down Expand Up @@ -104,6 +104,21 @@ def get_sequence_recent(id):
return int(match.group(1))


def extract_named_entities(xml_file: str) -> dict:
    """Collect the named-entity annotations of every speech in an XML file.

    Each ``<u>`` element is treated as one speech. Inside a speech, each
    ``<name>`` element is one annotation: the text of its ``<w>`` children
    is joined with single spaces and filed under the value of the
    element's ``type`` attribute.

    Args:
        xml_file: Path of the XML file to read.

    Returns:
        A dict mapping the ``xml:id`` of each speech to a dict of lists of
        entity strings keyed by entity type. The keys ``LOC``, ``MISC``,
        ``ORG`` and ``PER`` are always present; any other type found in
        the file gets its own key.

    Raises:
        KeyError: If a ``<u>`` element has no ``xml:id`` attribute.
    """
    # Read with an explicit encoding rather than the locale default.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(xml_file, encoding="utf-8") as f:
        # NOTE(review): no parser is named here, so bs4 picks whichever one
        # is installed; consider pinning it once the intended parser is
        # confirmed.
        soup = bs4.BeautifulSoup(f)

    output = {}
    for speech in soup.find_all("u"):
        annotations_dict = {"LOC": [], "MISC": [], "ORG": [], "PER": []}
        for annotation in speech.find_all("name"):
            entity_type = annotation.get("type")
            if entity_type is None:
                # A <name> without a type cannot be assigned to a category.
                continue
            # get_text() yields "" for an empty <w>, where .string would
            # give None and make the join fail.
            annotated = " ".join(
                word.get_text() for word in annotation.find_all("w")
            )
            # setdefault keeps entity types outside the four defaults
            # instead of raising KeyError.
            annotations_dict.setdefault(entity_type, []).append(annotated)
        output[speech["xml:id"]] = annotations_dict
    return output


class ParliamentNetherlandsNew(XMLCorpusDefinition):
min_date = datetime(year=2015, month=1, day=1)
max_date = datetime(year=2022, month=12, day=31)
Expand All @@ -126,6 +141,7 @@ def sources(self, start: datetime, end: datetime):
}
for year in range(start.year, end.year):
for xml_file in glob("{}/{}/*.xml".format(self.data_directory, year)):
metadata["ner"] = extract_named_entities(xml_file)
yield xml_file, metadata

country = field_defaults.country()
Expand Down Expand Up @@ -181,10 +197,9 @@ def sources(self, start: datetime, end: datetime):
speech = field_defaults.speech(language="nl")
speech.extractor = XML(
Tag("seg"),
Tag(["w", "pc"]),
multiple=True,
extract_soup_func=lambda x: x,
transform=detokenize_parlamint,
extract_soup_func=extract_speech,
transform=lambda x: "\n".join(x),
)

speech_id = field_defaults.speech_id()
Expand Down
Loading

0 comments on commit 1e3f44f

Please sign in to comment.