16
16
ner_keyword_field ,
17
17
party_attribute_extractor ,
18
18
person_attribute_extractor ,
19
- detokenize_parlamint ,
19
+ extract_speech ,
20
20
speech_ner ,
21
21
)
22
22
from corpora .utils .formatting import format_page_numbers
@@ -104,6 +104,21 @@ def get_sequence_recent(id):
104
104
return int (match .group (1 ))
105
105
106
106
107
def extract_named_entities(xml_file: str) -> dict:
    """Collect the named entities annotated in each speech of an XML file.

    Every ``<u>`` element is treated as one speech. Within a speech, every
    ``<name>`` element is one annotation: its text is the space-joined text
    of the ``<w>`` elements it contains, and its ``type`` attribute is the
    entity label.

    Args:
        xml_file: Path of the XML file to read.

    Returns:
        A dict keyed by the speech's ``xml:id``. Each value is a dict that
        maps an entity label to the list of annotated strings, in document
        order. The labels ``"LOC"``, ``"MISC"``, ``"ORG"`` and ``"PER"`` are
        always present (possibly with an empty list); any other label found
        in the file gets its own key.
    """
    # Binary mode lets BeautifulSoup detect the encoding from the document
    # itself instead of decoding with the platform's locale default.
    with open(xml_file, "rb") as f:
        # NOTE(review): no parser is named here, so bs4 uses whichever one
        # is installed. Consider passing the parser the rest of the project
        # uses for these files -- confirm before changing.
        soup = bs4.BeautifulSoup(f)

    output = {}
    for speech in soup.find_all("u"):
        speech_id = speech.get("xml:id")
        if speech_id is None:
            # Without an id there is no key to file the annotations under.
            continue

        entities = {"LOC": [], "MISC": [], "ORG": [], "PER": []}
        for annotation in speech.find_all("name"):
            entity_type = annotation.get("type")
            if entity_type is None:
                # An untyped <name> cannot be assigned to any label.
                continue
            # get_text() rather than .string: .string is None for a <w> with
            # more than one child, which would make the join raise.
            annotated = " ".join(
                word.get_text() for word in annotation.find_all("w")
            )
            # setdefault keeps an unexpected label from raising KeyError.
            entities.setdefault(entity_type, []).append(annotated)
        output[speech_id] = entities
    return output
120
+
121
+
107
122
class ParliamentNetherlandsNew (XMLCorpusDefinition ):
108
123
min_date = datetime (year = 2015 , month = 1 , day = 1 )
109
124
max_date = datetime (year = 2022 , month = 12 , day = 31 )
@@ -126,6 +141,7 @@ def sources(self, start: datetime, end: datetime):
126
141
}
127
142
for year in range (start .year , end .year ):
128
143
for xml_file in glob ("{}/{}/*.xml" .format (self .data_directory , year )):
144
+ metadata ["ner" ] = extract_named_entities (xml_file )
129
145
yield xml_file , metadata
130
146
131
147
country = field_defaults .country ()
@@ -181,10 +197,9 @@ def sources(self, start: datetime, end: datetime):
181
197
speech = field_defaults .speech (language = "nl" )
182
198
speech .extractor = XML (
183
199
Tag ("seg" ),
184
- Tag (["w" , "pc" ]),
185
200
multiple = True ,
186
- extract_soup_func = lambda x : x ,
187
- transform = detokenize_parlamint ,
201
+ extract_soup_func = extract_speech ,
202
+ transform = lambda x : " \n " . join ( x ) ,
188
203
)
189
204
190
205
speech_id = field_defaults .speech_id ()
0 commit comments