From cb03cf41d31eae477c329c51782a437ed1e0a361 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 12 Jan 2026 15:22:10 +0100 Subject: [PATCH 01/39] add rds reader def --- .../corpora/parliament/utils/rds_reader.py | 22 +++++++++++++++++++ backend/requirements.in | 1 + backend/requirements.txt | 7 ++++-- 3 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 backend/corpora/parliament/utils/rds_reader.py diff --git a/backend/corpora/parliament/utils/rds_reader.py b/backend/corpora/parliament/utils/rds_reader.py new file mode 100644 index 000000000..a2ad5735b --- /dev/null +++ b/backend/corpora/parliament/utils/rds_reader.py @@ -0,0 +1,22 @@ +from typing import Iterable, Dict + +import pyreadr +import pandas + +from ianalyzer_readers.readers.core import Reader + + +class RDSReader(Reader): + def data_from_file(self, path) -> Iterable[Dict]: + result = pyreadr.read_r(path) + data: pandas.DataFrame = result['data'] + + for _, row in data.iterrows(): + yield { + index: value + for index, value in row.items() + } + + def iterate_data(self, data: Iterable[Dict], metadata): + for row in data: + yield {'rows': [row]} # this format is for compatability with the CSV extractor diff --git a/backend/requirements.in b/backend/requirements.in index 25a97e55e..4d4fe08e6 100644 --- a/backend/requirements.in +++ b/backend/requirements.in @@ -29,3 +29,4 @@ flower ianalyzer_readers jsonschema Pillow +pyreadr diff --git a/backend/requirements.txt b/backend/requirements.txt index 7bffbd6df..5955893fc 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.12 # by the following command: # -# pip-compile --cert=None --client-cert=None --index-url=None --pip-args=None +# pip-compile # amqp==5.3.1 # via kombu @@ -54,7 +54,7 @@ defusedxml==0.7.1 # djangosaml2 # pysaml2 # python3-openid -dj-rest-auth[with-social,with_social]==4.0.1 +dj-rest-auth[with-social]==4.0.1 # via -r 
requirements.in django==4.2.27 # via @@ -146,6 +146,7 @@ pandas==2.3.3 # via # -r requirements.in # ianalyzer-readers + # pyreadr pillow==11.3.0 # via -r requirements.in pluggy==1.6.0 @@ -168,6 +169,8 @@ pyparsing==3.2.5 # via rdflib pypdf==6.6.0 # via -r requirements.in +pyreadr==0.5.4 + # via -r requirements.in pysaml2==7.5.2 # via djangosaml2 pytest==8.4.2 From 4af228e0edb5770328e711a77ecd15326cc110c8 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 12 Jan 2026 15:33:21 +0100 Subject: [PATCH 02/39] add sample file --- .../parliament/tests/data/euparl/rds/sample.rds | Bin 0 -> 13598 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 backend/corpora/parliament/tests/data/euparl/rds/sample.rds diff --git a/backend/corpora/parliament/tests/data/euparl/rds/sample.rds b/backend/corpora/parliament/tests/data/euparl/rds/sample.rds new file mode 100644 index 0000000000000000000000000000000000000000..57bca9e476e3172d708a531de9ae1444afc580ba GIT binary patch literal 13598 zcmeI3OOIVwR>#Y)ICcbdLrZ`_V)v*Lsa&$1L?HnQm+d@CvQ3B+!OYtC+cm&)UzHU517E0$cSNof)>C3+WXvls_e)@-BL@)kzDuQbM|9B z|7-2FcK+<@&d$#Ho%0v?%fFoqzrmm1;oq9r7J>M++oX1nbE`J!Cb{U&Y#--bB^dn7Y-kte*>Ar6Z&6_TF^?_eE zW$b-Du6X%Q+Z>nSByaHSfM=gvzT?+x-=AIk!r`X}d6f%?pNxil>gi;FeVvka{_xX0 z`~3S4#&6qaXTD8mUOMc3IWlnhuni>w&MTaH_xvAbH=_%`=#yn_{H-4*pT70Of9DbQ zw)s6i|1W#apDC%YB>D4HM)k_wVOGXbooC*i9(g~)lq8;Z|GRAEUq9gyyN|!e&maG_ zJ?GCU?#72_xg4MKIi>degF~K((hHw$b+UQ5eVx8dr(QUGaCe;h;9%73^yy?k`Z}Hd z+958o4)T*eFY|k!)i|WXf86^++pL3Y7Or0f_rUkO=<=}V%zW{qB-EdrXAOk;AHSPE z$KPi@c{wHZH}8h3DO!jqqMJweFxIi_ThA~_?$1GQx{rO^kCys^i7$Bl z-Fxif-TPJL-R-81W*k$xF1-8ZTbYK7H?F@u;`-$Aa zM_+cv-)BF0l?3-!e{17z{rUCuIsQKL$;%0W=Vo_fv5LNJTpQPI=)5ca26tVo`_Q&c z50A+09as4!Fz6I%TtLows$YkeAvSY9@Qsy@OftSKcae51nv!YP_uSgIe%bm(u0?R*sS;>df;#YKAWCv)(aM(o50Op*L5ZKv>g5qVv4LOG{ro zmd7r95eYz0Z<|%n>pMhwH>?MCIF4*ur=9Ptd^HR0zWel;ed{@+R=!-gli*vXEa9H5 zVJY!I+#dQ>)279bR}Jr4sndh8OIz6}nGLa=>xtn99M<93;ju%}4~*AFu~wK4bB(Vw 
z4u0D3#eildV>TDbm|h1WiDZ7wW9MgLZNda98+nkBG_PY1IFV5Jg9|SuXTApP=VTNQ zYnX;+0mGj|)R89u%u18CZ(EFQ?p9*2^vJksYS94Y-MT@iMv|mX3b2ZOC`2-EjdiF_ z1JIUl;ZX(<@H`+4sj#}QmzmFvIs&^@u{bF*31x?PrC~TPF?V#H*D=YRvbQ8nt-XpS zO>H-ii$yGk(k#IQkT~{jOg43j%f6ZW6D(z+jb=e>wc#2rTlCVBI6#r|n`lnDr)OxS zXprijTg0~OOBS$=4jmp}H@#bi-YwduGXB5$>Ic`q`RX6w<_H(ev1MZ`BXe3P50Pw| z!B$CvLxjbE`JJ#na4aI z+I#c&H}6dAbzH7`%nE~Ngl2b56|^JN7JLPI8s20B6;b={VJMnc=0N>wlUkGEMGf6>zWc? zr74iPz;IWgh6`5>#mcRf9ThU}aDwM|Y^}GX1---XOy%<3t&-$N#{u4jZl+-zMV_jB zp>^R)U#-DWiApwj^9IT+SZ=`czWM6!H!rkm70WWzbsQ>tg%Xdo{Nsow_yPH`F3O?P z4qa2kAbe!el$`K-YmJ7FC=`)|z%(1T7z8)_?)7`k03gVB2jC#qf`|3GG$Ft7ELbm; zU#v!@elf@g&D1^&oD_n)9okyv_i*Kp--GlYeA*y927&LbTM+jem=pORA5I{e<%X5U zbPzYgrXdyR@nrd{fI0B{m^C{&aCRbQYbK!})F=*##CkQbF7}Tb?Yln(GQ4CXrJ-^& z7xUxiMhw$tju|!p@r#DuBk`pI8dj4e$EYF%K{zgS8)|a^!hBbRh8`XFq47W6hZJD&__-)%_fu0CgkbC9dZl<9$@K3DojE6+&6!GHjZ1Q z?gleYg-Y@X-#+~siPcYbec6*4Rmj&9>Pm8uzK;k7ShGOR%h0XcSjh{?kSdap6p)mI z1b0Lf=LlO7ls!3OOCF^6?u&s~-MFX!;(SqfQl@6#ecW{T9KHySw!W(j&j~HNgex$* z^_Cdzk#rD_B1;FQNnWX0&4Nlq@KQ^T(x3Bo;~%821=2OUM@CWvizVIc0iUd;B$D@CgZcJDK2vBz?2&lUYm0S!`V*jrq`fAK zP3-zTj7oYyBDE)VsS!D|@YvFu8>1R@b+^wY2HGz z*py2P)~<*wJYX{rebg9H#QLUf@c&t;kCJ7KUcHr&6oH3CM^?RzFzt_ILfP*0a^YJ8 z>ExkMj0eDB4Q72LY*}+c7LgL6xgm~_#){%J)O=v03r9&@0C_MwC_#f^dhjl(7zlo9z^Gw#enE_;?1lSzprqy|iqUyvt@U6Mpl zl06k|wnUMd&7hY>IP?&iAZAUX0YE3qt=3Tp7-qA{pq8iriIA}G?f^QnQO7Sp9id~% zxJe{mGBNjUB+2ua0wAzG(ywac+j9I{!~4{ntY=}__~ zDL&yNFe=cDS-r#^Y_ubJA=|}*OUV(4Mk0h8stS_Y+>Z4o{qZVK#$GC6dDY56PyWE=4MpHRgeJHvqz1|qLm!EW$w!CNjJ>XEnrb+0dHXr ziw4IzNG%0#5)+Usgvl2Eo^XhOnsWzSTY!;*i9)tqUm65qaI;CSl7eqSE#xkdV)%F- zU=xH%<`nIm?G4epfu{&hIZ?uodBm&?YBqvH8CzkZb$CFO*g8ZV=*Y?MhigK1DR+4N zAf?;Uz2WYSLdZHsVG8DwZi-1lk1;$U2Z|OPn++aG!c|AYH5U|G*h&Xn3Ia?>Ndp7P zhcU!t!W0uQz+KBSK?p?z?6GGn`(FBXIey1GGFWnAmt8Uy*B25-DkMUH04@_;R0on6 z64aW-Gn_s~Pt3eMU3iqbQWjNQBo)d}xzWW3eY6XU8vlllhQ~_HQ=THfx z=Nv2N0Xb2b?^>E7(_0V3a8ys`HZ)85JkbGf#rcNRW-;GH-EGSWaqL@j~}?A0JGr)@j`{V z_<#v~#`04kD9~|AD0+|7?Qlnmbwtu-7Ot4RP5s`QrhYf&xI%u6+@Qr0aTb{*EO2E( 
zQ-UPY)k94rPxb>SHbut6So;A7MHpi0JN(PN#&dsg2*(9B6)2o^L#}O+n40{l>V3}X zP)AP$|Bk!wPg12-$5M{Y>D163q`NET-~(I%SG9>4-HNk5w_Y<5sN86V;9>ZwFc)l!sL@Cs_?& zDh27R(P7Oy@(QIB6=%v+K~UY7z8G1jH3O{VuhzqgBQsS`Ta7Xpc>-IgT0m<;*@HFq zIQ%LN=ZvGx)PGTVU6pdqUVO8ks)(ol4`e}c<<*2`iLnUEf}vPZKNNBE&q?Z)L3Y-k zwTikrk<>=XU6kl+>9L{lFsxMyB?l#4?Q?P`0r-miiQI2P!zQ;(O|(P`$t#5s63P>3 zWH1M{+@~}yI0;qJh%uqrGrFsD+N-ddGRrkl5-FvW>B`rKsRy8ytfgv9hzJtkp{0hA znzu^U+>M*=^&6Y?@!9-MjR%fZlz~%XcQ>F1aehG8L))_pN{0y*6OayBBpKhCdgw>Z zu{y1LvzcQ==>OK6p#-Mr!Qnf0GH5otyo6+BwH` zis{U#m%u?dB`8bmrBnxZ+-&UgSRZs^jafa`R~w18#FygkCYkm$$8xEZuA#a^sTm@Q z{|^gFT~s*DIp~`%SII0n6>v2`X9PPC>a;+`X%*&suF-|LblCRGLFYT1lXHyD^7NB* z$o4h;9Mwf>ThO26R6{4kw4URwxvzDM(2Ls%%|fw)lgZm3DQ2jcYaC7?wIm9#Am%l-a6To9;4* zXq#_}*bHKtWK5sK3U~79e7+GKbi1148u1Ku->RLfOK`NhS(z@}$#!MRMiH@d{zKF! zi3ODp#>J%HSX<3a8)Yej7!2WHH*jdoSVzMXSJ`05wi=r1q?Adw7LO6*(|D8$qmsbp zoR6Q?;3NnzJGbT9Kd1Iu&t{ zUC#kG+ouXkfWyG51;zgKepBXzud(Z|sujS5qT$?3yI4_j0@NvUf>nvE^yb-so8xPW zOf*~-qbL&dsml&o%bSZNskbMBt&1P1<3{w2*Weg%iZZ}8H~}PWFVNlUoq%xh%$r>F zl#wKq;v#^}s)hIR3tjh+T=M+|F0dGbrrccFlnS<%TE>Ok=16ITW>#G&k3#oeLauE< z6*AaXs>PsxgN@vo#~p>)u%iXQBKmOZXKzC-C}ouw6e}}hLju2b^mScEwx(N0f(=as zn2-=MQxQ}p4mmu?m2X8szPZvw$YPl_TYx0V_1i0;_}h3LM%NvLL zK$BI;eBI)J+0FGHV*mqHkc51!N!Kct0sYBL3E8@ig;mRstN<#+K2}(^scK9xd~@lO z8Oyyvf! 
zjbZM0OFcCsQ6~lnkadIIUa%|x=qNaKCn&W_C#6?ij{%!RO{%e>o@8R%sv*@Y8-Vp& z$#aBH^~+E$je~A1eg5a!3xY`o_y#H99QJBA1%uK>xv~$+BrXXRkyk(zc1(5zI{z@#1i@UYhz--jhp=F z6|En}?XJ#Z+piu?2zV_$G0V(TE>&T@Ve$%1PzB9qT;f-h1DrgX&mI-L$y2U`_3{x{ zNZavh_N#iJt7dt?clFz50tbD!72?&j+=R!p!I@!~={!tky|P?Se~}k+J{oCBzaHuT zr#Pk3t4F@wH{GArD>F^M|BpUTaej$1e>XBcTKrYH!akRkWo@U1U*Z4%*q_?}r7WLw WH^whF$1iV=U&fo|_J0Ze?0*3=cIE;A literal 0 HcmV?d00001 From 4f958056aeacc10d9490c17795090a0400aad5fa Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 12 Jan 2026 16:02:37 +0100 Subject: [PATCH 03/39] add fields to eupdcorpreader --- backend/corpora/parliament/conftest.py | 62 ++-- backend/corpora/parliament/euparl.py | 340 +++++++----------- .../corpora/parliament/utils/rds_reader.py | 13 +- 3 files changed, 180 insertions(+), 235 deletions(-) diff --git a/backend/corpora/parliament/conftest.py b/backend/corpora/parliament/conftest.py index e0b1c7817..a6414dc6a 100644 --- a/backend/corpora/parliament/conftest.py +++ b/backend/corpora/parliament/conftest.py @@ -34,7 +34,7 @@ def parliament_corpora_settings(settings): settings.PP_CANADA_DATA = os.path.join(here, 'tests', 'data', 'canada') settings.PP_DENMARK_DATA = os.path.join(here, 'tests', 'data', 'denmark') settings.PP_DENMARK_NEW_DATA = os.path.join(here, 'tests', 'data', 'denmark-new') - settings.PP_EUPARL_DATA = os.path.join(here, 'tests', 'data', 'euparl', 'rdf') + settings.PP_EUPARL_DATA = os.path.join(here, 'tests', 'data', 'euparl', 'rds') settings.PP_FINLAND_DATA = os.path.join(here, 'tests', 'data', 'finland') settings.PP_FINLAND_OLD_DATA = os.path.join(here, 'tests', 'data', 'finland-old') settings.PP_FR_DATA = os.path.join(here, 'tests', 'data', 'france') @@ -685,38 +685,48 @@ def parliament_corpora_settings(settings): "name": "parliament-europe", "start": datetime(1999, 7, 20), "docs": [ + # EUPDCorp data { - "id": "1999-07-21-Speech-3-063", - "date": "1999-07-21", - "debate_id": "1999-07-21_AgendaItem_5", - "debate_title": "Statement by Mr Prodi, 
President-elect of the Commission", - "party": "Group for the Technical Coordination and Defence of Indipendent Groups and Members (TGI)", - "sequence": 15, - "speaker": "Francesco Enrico Speroni", - "speaker_country": "Italy", - "speech": """Mr President, as a Member of the Italian national Parliament for the\n(The Northern League for the Independence of Padania), I did not vote for Professor Prodi in Rome as I considered he would be completely useless as head of government. I was then proved right as he lost the vote of confidence of the Italian Parliament. Reckoning also that a Roman idiot would still be that stupid wherever he was, which, incidently, is reflected in the symbol on the list which bears his name for the election of this Parliament, I cannot for consistency\"s sake express my faith in the President of the Commission. As a native of the Po valley who is Italian only by passport, I am fortunately immune from the national Christian Democrat type of opportunism which brings Berlusconi together with Mastella and De Mita and sees in Prodi not the impartial President of the Commissioners uninfluenced by the States, but the lavish dispenser of favours to a wide and varied assortment of Southern Italian profiteers. 
Although I hold some of the Commissioners in high esteem, I recall the old mafioso Neapolitan saying: ‘A fish rots from the head downwards’ and I therefore have to express my negative opinion of the Prodi Presidency.""", - "source_language": "Italian", - "url": "http://purl.org/linkedpolitics/eu/plenary/1999-07-21-Speech-3-063", + 'date': '1999-07-20', + 'debate_title': 'Genoptagelse af sessionen', + 'debate_id': 'CRE-5-1999-07-20-FNL', + 'speaker': 'Giorgio Napolitano', + 'party': 'IND', + 'party_full': 'Independent', + 'party_national': 'Democratici di Sinistra', + 'speaker_country': 'Italy', + 'speaker_gender': 'Male', + 'speaker_birth_year': 1925, + 'speaker_id': '1103', + 'speech_original': 'Dichiaro ripresa la sessione interrotta il 7 maggio 1999 e ' + "dichiaro aperta la seduta prevista all'articolo 10, paragrafo 3, " + "dell'Atto recante elezione dei rappresentanti al Parlamento " + "europeo a suffragio universale diretto nonché all'articolo 10, " + "paragrafo 3, del Regolamento del Parlamento. L'onorevole Crowley " + 'ha chiesto, pregiudizialmente, la parola.', + 'speech': 'I declare resumed the session adjourned on 7 May 1999 and open ' + 'the sitting provided for in Article 10 (3) of the Act electing ' + 'the representatives of the European Parliament by direct ' + "universal suffrage and in Article 10 (3) of Parliament's Rules " + 'of Procedure. Mr Crowley has asked for the floor on a point of ' + 'order.', + 'source_language': 'Italian', + 'sequence': 1, }, { - "id": "2017-07-06-Speech-4-146-000", - "date": "2017-07-06", - "debate_id": "2017-07-06_AgendaItem_13", - "debate_title": "Composition of committees and delegations", - "party": None, - "sequence": 2, - "source_language": "English", - "speaker": "Ashley Fox", - "speaker_country": "United Kingdom", - "speech": """Mr President, yesterday afternoon we had a lively debate, under Rule 153, on the subject of a single seat for this Parliament. 
Unfortunately, under that rule, it was not possible to have a resolution, but it was the clear will of this House that we bring forward a report to propose a treaty change. So, as Mr Weber and Mr Pittella are in their seats, could they please take note of the view of this House and, when the matter comes to the Conference of Presidents, could they please authorise that report?""", - "url": "http://www.europarl.europa.eu/plenary/EN/vod.html?mode=unit&vodLanguage=EN&startTime=20170706-12:02:01-324", + 'speaker': 'Brian Crowley', + 'speaker_country': 'Ireland', + 'sequence': 2, }, + {}, {}, {}, + # API data { "date": "2024-11-13", "debate_id": "MTG-PL-2024-11-13-PVCRE-ITM-17", "debate_title": "17. Fight against money laundering and terrorist financing: listing Russia as a high-risk third country in the EU (debate)", "id": "MTG-PL-2024-11-13-OTH-2017005042457", - "party": "European Conservatives and Reformists Group", + "party": 'ECR', + 'party_full': 'European Conservatives and Reformists', "party_id": "7037", "source_language": "English", "sequence": 1, @@ -724,9 +734,9 @@ def parliament_corpora_settings(settings): "speaker_country": "Latvia", "speaker_id": "28615", "speech": "Thank you, Commissioner McGuinness, and I would also like to thank you for your work on the AML package and many other issues, also for today's issues. 
Thank you very much.", - }, + } ], - "n_documents": 3, + "n_documents": 6, }, { 'name': 'parliament-sweden-swerik', diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 619439152..ea97e2aea 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -1,133 +1,34 @@ from datetime import datetime, timedelta from functools import cache -from itertools import chain import logging import os -from typing import Optional, Tuple, Union +from typing import Optional from bs4 import BeautifulSoup from django.conf import settings from langcodes import standardize_tag, Language import requests -from rdflib import Graph, Namespace, URIRef -from rdflib.namespace import DCTERMS, FOAF, RDFS, RDF as RDFNS -from ianalyzer_readers.extract import Backup, Combined, JSON, Metadata, RDF, Pass +from ianalyzer_readers.extract import Combined, JSON, Metadata, Pass, CSV +from ianalyzer_readers.readers.core import Field -from addcorpus.es_mappings import keyword_mapping +from addcorpus.es_mappings import keyword_mapping, main_content_mapping from addcorpus.python_corpora.corpus import ( FieldDefinition, JSONCorpusDefinition, - RDFCorpusDefinition, ) from addcorpus.python_corpora.filters import MultipleChoiceFilter from corpora.parliament.parliament import Parliament import corpora.parliament.utils.field_defaults as field_defaults from corpora.utils.constants import document_context +from corpora.parliament.utils.rds_reader import RDSReader logger = logging.getLogger('indexing') -EVENTS_METADATA = 'Events_and_structure.ttl' -MP_METADATA = 'MembersOfParliament_background.ttl' -SPEECHES = 'English.ttl' - -# Namespaces of Linked Politics (NB: the purl links resolve to dead sites) -LP_EU = Namespace('http://purl.org/linkedpolitics/eu/plenary/') -LPV_EU = Namespace('http://purl.org/linkedpolitics/vocabulary/eu/plenary/') -LP = Namespace('http://purl.org/linkedpolitics/') -LPV = 
Namespace('http://purl.org/linkedpolitics/vocabulary/') - -def add_speaker_metadata(filename: str) -> dict: - """Parse all relevant metadata out of MembersOfParliament ttl to dict""" - speaker_dict = {} - speaker_graph = Graph() - speaker_graph.parse(filename) - speaker_subjects = speaker_graph.subjects(object=LPV.MemberOfParliament) - for speaker in speaker_subjects: - try: - name = speaker_graph.value(speaker, FOAF.name).value - except AttributeError: - # We cannot find the name of the speaker subject - continue - country_node = speaker_graph.value(speaker, LPV.countryOfRepresentation) - country_name = speaker_graph.value(country_node, RDFS.label).value - party_list = [] - speaker_functions = speaker_graph.objects(speaker, LPV.politicalFunction) - for function in speaker_functions: - function_type = speaker_graph.value(function, LPV.institution) - if speaker_graph.value(function_type, RDFNS.type) == LPV.EUParty: - party_labels = list(speaker_graph.objects(function_type, RDFS.label)) - party_acronym = min(party_labels, key=len) - party_name = max(party_labels, key=len) - date_start = speaker_graph.value(function, LPV.beginning) - date_end = speaker_graph.value(function, LPV.end) - party_list.append({ - 'party_acronym': party_acronym, - 'party_name': party_name, - 'date_start': date_start.value, - 'date_end': date_end.value - }) - speaker_dict.update({speaker: { - 'name': name, - 'country': country_name, - 'parties': party_list - } - }) - return speaker_dict - - -def get_identifier(input_string: str) -> str: - return input_string.split('/')[-1] - def language_name(lang_code: str) -> str: return Language.make(language=standardize_tag(lang_code)).display_name() -def get_speaker(input_data: Tuple[URIRef, dict]) -> str: - (speaker, speaker_dict) = input_data - return speaker_dict.get(speaker).get('name') - - -def get_speaker_country(input_data: Tuple[URIRef, dict]) -> str: - (speaker, speaker_dict) = input_data - return speaker_dict.get(speaker).get('country') - - -def 
get_speaker_party(input_data: Tuple[str, datetime, dict]) -> str: - ''' look up the which EU party the speaker was part of at the date of their speech ''' - (speaker, date, party_data) = input_data - party_list = party_data.get(speaker).get('parties') - return next( - ( - f"{p['party_name'].value} ({p['party_acronym'].value})" - for p in party_list - if (date >= p["date_start"] and date <= p["date_end"]) - ), - None, - ) - - -def get_speech_index(input_data: Tuple[str, list]) -> int: - ''' find index of speech in array of debate parts ''' - speech, speeches = input_data - if not speech: - return None - return speeches.index(speech) + 1 - - -def get_speech_text(input_string: str) -> str: - ''' remove leading language information, e.g., `(IT)`''' - return input_string.split(') ')[-1] - - -def get_uri(input_data: Union[URIRef, str]) -> str: - ''' convert input from URIRef to string ''' - try: - return input_data.n3().strip('<>') - except: - return input_data - - class ParliamentEurope(Parliament): title = 'People & Parliament (European Parliament)' description = "Speeches from the European Parliament (EP)" @@ -144,13 +45,13 @@ class ParliamentEurope(Parliament): @property def subcorpora(self): return [ - ParliamentEuropeFromRDF(), + EUPDCorpReader(), ParliamentEuropeFromAPI(), ] - def sources(self, start, end): + def sources(self, **kwargs): for i, subcorpus in enumerate(self.subcorpora): - for source in subcorpus.sources(start, end): + for source in subcorpus.sources(**kwargs): filename, metadata = source metadata["subcorpus"] = i yield filename, metadata @@ -169,7 +70,13 @@ def source2dicts(self, source, **kwargs): debate_title = field_defaults.debate_title() date = field_defaults.date(min_date, max_date) party = field_defaults.party() + party_full = field_defaults.party_full() party_id = field_defaults.party_id() + party_national = FieldDefinition( + name='party_national', + display_name='National party', + es_mapping=keyword_mapping(enable_full_text_search=True), 
+ ) sequence = field_defaults.sequence() source_language = FieldDefinition( name='source_language', @@ -196,7 +103,16 @@ def source2dicts(self, source, **kwargs): ), visualizations=['resultscount', 'termfrequency'], ) + speaker_gender = field_defaults.speaker_gender() + speaker_birth_year = field_defaults.speaker_birth_year() speech = field_defaults.speech(language='en') + speech.description = 'Speech translated to English' + speech_original = FieldDefinition( + name='speech_original', + display_name='Original speech', + description='Speech in the original language', + es_mapping=main_content_mapping(), + ) speech_id = field_defaults.speech_id() url = field_defaults.url() @@ -206,13 +122,18 @@ def __init__(self): self.debate_id, self.debate_title, self.party, + self.party_full, self.party_id, + self.party_national, self.sequence, self.source_language, self.speaker, self.speaker_country, + self.speaker_gender, + self.speaker_birth_year, self.speaker_id, self.speech, + self.speech_original, self.speech_id, self.url, ] @@ -300,7 +221,25 @@ def api_get_party_name_from_id(party_id: str) -> str: ) if party_response.status_code != 200: return None - return party_response.json().get('data')[0].get('prefLabel').get('en') + return party_response.json().get('data')[0].get('label') + + +def _api_get_party_full_name(data) -> Optional[str]: + party_id = api_get_party_id(data) + return _api_get_party_full_name_from_id(party_id) + + +@cache +def _api_get_party_full_name_from_id(party_id: str) -> str: + if not party_id: + return None + party_response = requests.get( + f'https://data.europarl.europa.eu/api/v2/corporate-bodies/{party_id}?format=application%2Fld%2Bjson&language=en' + ) + if party_response.status_code != 200: + return None + return party_response.json().get('data')[0].get('altLabel').get('en') + def first(values): if len(values): @@ -376,6 +315,16 @@ def sources(self, start, end, **kwargs): transform=api_get_party_name, ) + party_full = field_defaults.party_full() + 
party_full.extractor = Combined( + JSON( + "data.had_participation.had_participant_person", + transform=first, + ), + Metadata('date'), + transform=_api_get_party_full_name, + ) + party_id = field_defaults.party_id() party_id.extractor = Combined( JSON( @@ -436,6 +385,7 @@ def sources(self, start, end, **kwargs): debate_id, debate_title, party, + party_full, party_id, sequence, source_language, @@ -447,106 +397,90 @@ def sources(self, start, end, **kwargs): ] -class ParliamentEuropeFromRDF(RDFCorpusDefinition): - """ - Speeches of the European parliament, originally in or translated to English, - provided as Linked Open Data by the "Talk of Europe" project - """ - - min_date = datetime(year=1999, month=7, day=20) - max_date = datetime(year=2017, month=7, day=6) +def _to_int(value) -> Optional[int]: + if value or value == 0: + return int(value) +class EUPDCorpReader(RDSReader): data_directory = settings.PP_EUPARL_DATA - def sources(self, start, end, **kwargs): - metadata = { - "speakers": add_speaker_metadata( - os.path.join(self.data_directory, MP_METADATA) - ) - } - yield os.path.join(self.data_directory, SPEECHES), metadata - - def document_subjects(self, graph: Graph): - """return all subjects which have either translated or spoken text""" - return chain( - graph.subjects(predicate=LPV.translatedText), - graph.subjects(predicate=LPV.spokenText), - ) - - def data_from_file(self, filename: str) -> Graph: - '''we combine the graphs in place, to keep memory load low''' - graph = Graph() - graph.parse(filename) - graph.parse(os.path.join(self.data_directory, EVENTS_METADATA)) - return graph - - date = field_defaults.date(min_date, max_date) - date.extractor = RDF(DCTERMS.date, transform=lambda x: x.strftime('%Y-%m-%d')) - - debate_id = field_defaults.debate_id() - debate_id.extractor = RDF(DCTERMS.isPartOf, transform=get_identifier) - - debate_title = field_defaults.debate_title() - debate_title.extractor = RDF(DCTERMS.isPartOf, DCTERMS.title) + def sources(self, 
**kwargs): + for filename in os.listdir(self.data_directory): + if filename.lower().endswith('.rds'): + yield os.path.join(self.data_directory, filename), {} - party = field_defaults.party() - party.extractor = Combined( - RDF(LPV.speaker), - RDF(DCTERMS.date), - Metadata('speakers'), - transform=get_speaker_party, - ) - - sequence = field_defaults.sequence() - sequence.extractor = Combined( - RDF(), - RDF(DCTERMS.isPartOf, DCTERMS.hasPart, multiple=True), - transform=get_speech_index, - ) - - source_language = field_defaults.language() - source_language.name = 'source_language' - source_language.extractor = RDF(DCTERMS.language, transform=language_name) - - speaker = field_defaults.speaker() - speaker.extractor = Combined( - RDF(LPV.speaker), Metadata('speakers'), transform=get_speaker - ) - - speaker_country = FieldDefinition( - name='speaker_country', - extractor=Combined( - RDF(LPV.speaker), Metadata('speakers'), transform=get_speaker_country + fields = [ + Field( + name='date', + extractor=CSV('date'), ), - ) - - speech = field_defaults.speech(language='en') - speech.extractor = Backup( - RDF( - LPV.spokenText, + Field( + name='debate_id', + extractor=CSV('file'), ), - RDF( - LPV.translatedText, + Field( + name='debate_title', + extractor=CSV('agenda'), + ), + Field( + name='source_language', + extractor=CSV( + 'language', + transform=lambda value: language_name(value.lower()), + ), + ), + Field( + name='party', + extractor=CSV('epg_short'), + ), + Field( + name='party_full', + extractor=CSV('epg_long'), + ), + Field( + name='party_national', + extractor=CSV('party_name'), + ), + Field( + name='sequence', + extractor=CSV('doc_id', transform=int), + ), + Field( + name='speech', + extractor=CSV('speech_en'), + ), + Field( + name='speech_original', + extractor=CSV('speech'), + ), + Field( + name='speaker', + extractor=Combined( + CSV('firstname'), + CSV('lastname'), + transform=' '.join, + ) + ), + Field( + name='speaker_id', + extractor=Pass( + CSV('mepid', 
transform=_to_int), + transform=str, + ), + ), + Field( + name='speaker_gender', + extractor=CSV( + 'gender', + transform=lambda value: 'Male' if value else 'Female', + ) + ), + Field( + name='speaker_birth_year', + extractor=CSV('birth_year', transform=_to_int) + ), + Field( + name='speaker_country', + extractor=CSV('nationality'), ), - transform=get_speech_text, - ) - - speech_id = field_defaults.speech_id() - speech_id.extractor = RDF(transform=get_identifier) - - url = field_defaults.url() - url.extractor = Backup(RDF(LPV.videoURI, transform=get_uri), RDF(transform=get_uri)) - - fields = [ - date, - debate_id, - debate_title, - party, - sequence, - source_language, - speaker, - speaker_country, - speech, - speech_id, - url, ] diff --git a/backend/corpora/parliament/utils/rds_reader.py b/backend/corpora/parliament/utils/rds_reader.py index a2ad5735b..08f1122a8 100644 --- a/backend/corpora/parliament/utils/rds_reader.py +++ b/backend/corpora/parliament/utils/rds_reader.py @@ -9,13 +9,14 @@ class RDSReader(Reader): def data_from_file(self, path) -> Iterable[Dict]: result = pyreadr.read_r(path) - data: pandas.DataFrame = result['data'] + for value in result.values(): + data: pandas.DataFrame = value - for _, row in data.iterrows(): - yield { - index: value - for index, value in row.items() - } + for _, row in data.iterrows(): + yield { + index: value + for index, value in row.items() + } def iterate_data(self, data: Iterable[Dict], metadata): for row in data: From 21d69a95db8e5001c3d80bb952b4f7d9643ea725 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 12 Jan 2026 18:08:34 +0100 Subject: [PATCH 04/39] fix name formatting, add source archive field --- backend/corpora/parliament/conftest.py | 6 ++++-- backend/corpora/parliament/euparl.py | 29 +++++++++++++++++++------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/backend/corpora/parliament/conftest.py b/backend/corpora/parliament/conftest.py index a6414dc6a..65e27f927 100644 --- 
a/backend/corpora/parliament/conftest.py +++ b/backend/corpora/parliament/conftest.py @@ -710,8 +710,9 @@ def parliament_corpora_settings(settings): "universal suffrage and in Article 10 (3) of Parliament's Rules " 'of Procedure. Mr Crowley has asked for the floor on a point of ' 'order.', - 'source_language': 'Italian', + 'language': 'Italian', 'sequence': 1, + 'source_archive': 'EUPDCorp', }, { 'speaker': 'Brian Crowley', @@ -728,12 +729,13 @@ def parliament_corpora_settings(settings): "party": 'ECR', 'party_full': 'European Conservatives and Reformists', "party_id": "7037", - "source_language": "English", + "language": "English", "sequence": 1, "speaker": "Roberts Zīle", "speaker_country": "Latvia", "speaker_id": "28615", "speech": "Thank you, Commissioner McGuinness, and I would also like to thank you for your work on the AML package and many other issues, also for today's issues. Thank you very much.", + 'source_archive': 'Europarl Open Data', } ], "n_documents": 6, diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index ea97e2aea..9e696bb88 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -8,7 +8,7 @@ from django.conf import settings from langcodes import standardize_tag, Language import requests -from ianalyzer_readers.extract import Combined, JSON, Metadata, Pass, CSV +from ianalyzer_readers.extract import Combined, JSON, Metadata, Pass, CSV, Constant from ianalyzer_readers.readers.core import Field from addcorpus.es_mappings import keyword_mapping, main_content_mapping @@ -51,6 +51,7 @@ def subcorpora(self): def sources(self, **kwargs): for i, subcorpus in enumerate(self.subcorpora): + logger.info(f'Extracting subcorpus: {subcorpus.__class__.__name__}') for source in subcorpus.sources(**kwargs): filename, metadata = source metadata["subcorpus"] = i @@ -79,8 +80,8 @@ def source2dicts(self, source, **kwargs): ) sequence = field_defaults.sequence() source_language = 
FieldDefinition( - name='source_language', - display_name='Source language', + name='language', + display_name='Language', description='Original language of the speech', es_mapping=keyword_mapping(), search_filter=MultipleChoiceFilter( @@ -115,6 +116,7 @@ def source2dicts(self, source, **kwargs): ) speech_id = field_defaults.speech_id() url = field_defaults.url() + source_archive = field_defaults.source_archive() def __init__(self): self.fields = [ @@ -136,6 +138,7 @@ def __init__(self): self.speech_original, self.speech_id, self.url, + self.source_archive, ] @@ -263,7 +266,7 @@ class ParliamentEuropeFromAPI(JSONCorpusDefinition): ['data', 'activity_id'], ] - def sources(self, start, end, **kwargs): + def sources(self, **kwargs): date = self.min_date while date < self.max_date: date += timedelta(days=1) @@ -339,7 +342,6 @@ def sources(self, start, end, **kwargs): sequence.extractor = Metadata('sequence') source_language = field_defaults.language() - source_language.name = 'source_language' source_language.extractor = JSON("originalLanguage", transform=api_get_language) speaker = field_defaults.speaker() @@ -380,6 +382,9 @@ def sources(self, start, end, **kwargs): speech_id = field_defaults.speech_id() speech_id.extractor = JSON("data.activity_id") + source_archive = field_defaults.source_archive() + source_archive.extractor = Constant('Europarl Open Data') + fields = [ date, debate_id, @@ -394,6 +399,7 @@ def sources(self, start, end, **kwargs): speaker_id, speech, speech_id, + source_archive, ] @@ -401,6 +407,11 @@ def _to_int(value) -> Optional[int]: if value or value == 0: return int(value) +def _format_name(values) -> str: + return ' '.join( + value for value in filter(None, values) + ) + class EUPDCorpReader(RDSReader): data_directory = settings.PP_EUPARL_DATA @@ -423,7 +434,7 @@ def sources(self, **kwargs): extractor=CSV('agenda'), ), Field( - name='source_language', + name='language', extractor=CSV( 'language', transform=lambda value: 
language_name(value.lower()), @@ -458,7 +469,7 @@ def sources(self, **kwargs): extractor=Combined( CSV('firstname'), CSV('lastname'), - transform=' '.join, + transform=_format_name, ) ), Field( @@ -483,4 +494,8 @@ def sources(self, **kwargs): name='speaker_country', extractor=CSV('nationality'), ), + Field( + name='source_archive', + extractor=Constant('EUPDCorp'), + ) ] From 584b356fc502402bd04e8a38e5d2c22f1a555b72 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 12 Jan 2026 18:37:45 +0100 Subject: [PATCH 05/39] add original language field --- backend/corpora/parliament/euparl.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 9e696bb88..a7e674ac8 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -79,17 +79,10 @@ def source2dicts(self, source, **kwargs): es_mapping=keyword_mapping(enable_full_text_search=True), ) sequence = field_defaults.sequence() - source_language = FieldDefinition( - name='language', - display_name='Language', - description='Original language of the speech', - es_mapping=keyword_mapping(), - search_filter=MultipleChoiceFilter( - description='Search only in speeches in the selected original languages', - option_count=50, - ), - visualizations=['resultscount', 'termfrequency'], - ) + original_language = field_defaults.language() + original_language.name = 'original_language' + original_language.display_name='Original language' + original_language.description = 'Original language of the speech' speaker = field_defaults.speaker() speaker_id = field_defaults.speaker_id() @@ -123,12 +116,12 @@ def __init__(self): self.date, self.debate_id, self.debate_title, + self.original_language, self.party, self.party_full, self.party_id, self.party_national, self.sequence, - self.source_language, self.speaker, self.speaker_country, self.speaker_gender, @@ -341,8 +334,9 @@ def 
sources(self, **kwargs): sequence = field_defaults.sequence() sequence.extractor = Metadata('sequence') - source_language = field_defaults.language() - source_language.extractor = JSON("originalLanguage", transform=api_get_language) + original_language = field_defaults.language() + original_language.name = 'original_language' + original_language.extractor = JSON("originalLanguage", transform=api_get_language) speaker = field_defaults.speaker() speaker.extractor = Pass( @@ -393,7 +387,7 @@ def sources(self, **kwargs): party_full, party_id, sequence, - source_language, + original_language, speaker, speaker_country, speaker_id, @@ -434,7 +428,7 @@ def sources(self, **kwargs): extractor=CSV('agenda'), ), Field( - name='language', + name='original_language', extractor=CSV( 'language', transform=lambda value: language_name(value.lower()), From fb6549e0d25ae16bab3711f1c20271b75a4bf507 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 12 Jan 2026 18:59:51 +0100 Subject: [PATCH 06/39] correct xml parser --- backend/corpora/parliament/euparl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index a7e674ac8..3503244c8 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -136,7 +136,7 @@ def __init__(self): def api_convert_xml(speech_xml: str) -> str: - speech_soup = BeautifulSoup(speech_xml, 'lxml') + speech_soup = BeautifulSoup(speech_xml, 'lxml-xml') return speech_soup.find('speech').find('p').text @@ -154,7 +154,7 @@ def api_get_preflabel(url: str) -> Optional[str]: response = requests.get(url) if response.status_code != 200: return None - soup = BeautifulSoup(response.content, 'lxml') + soup = BeautifulSoup(response.content, 'lxml-xml') return soup.find('skos:preflabel', {'xml:lang': 'en'}).text From 35636a0be6970763dd7384cf6e36d577511b4e42 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 13 Jan 2026 15:23:09 
+0100 Subject: [PATCH 07/39] filter speeches without speaker --- backend/corpora/parliament/euparl.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 3503244c8..0a8cc775e 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -292,6 +292,14 @@ def sources(self, **kwargs): metadata['sequence'] = sequence_in_debate yield speech_response, metadata + def iterate_data(self, data: Dict, metadata): + speeches_with_speaker = [ + item for item in data['data'] + if 'had_participation' in item + ] + filtered_data = data | { 'data': speeches_with_speaker} + return super().iterate_data(filtered_data, metadata) + debate_id = field_defaults.debate_id() debate_id.extractor = Metadata('debate_id') From 323db8f2f49eda2ae3b96acad035b3bd29ee6f20 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 13 Jan 2026 15:34:09 +0100 Subject: [PATCH 08/39] catch if no memberships --- backend/corpora/parliament/euparl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 0a8cc775e..34e47bd56 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -189,7 +189,7 @@ def api_get_speaker_name(participant: str) -> str: def api_get_party_id(data) -> dict: participant, date = data speaker_metadata = api_get_speaker_info(participant) - memberships = speaker_metadata.get('hasMembership') + memberships = speaker_metadata.get('hasMembership') or [] for membership in memberships: if ( membership.get('membershipClassification') From 57a51b8a3f3ea55499923bead4aebdd3cddb2c62 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 13 Jan 2026 15:37:56 +0100 Subject: [PATCH 09/39] fix when no response for person metadata --- backend/corpora/parliament/euparl.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git 
a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 34e47bd56..3f7fb5f62 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -166,7 +166,7 @@ def api_get_speaker_info(participant: str) -> dict: f'https://data.europarl.europa.eu/api/v2/meps/{speaker_id}?format=application%2Fld%2Bjson' ) if not speaker_response.status_code == 200: - logger.warning(f"No response for {speaker_id}") + logger.warning(f"No response for person {speaker_id}") return {} else: return speaker_response.json().get('data')[0] @@ -175,14 +175,16 @@ def api_get_speaker_info(participant: str) -> dict: def api_get_speaker_country(participant: str) -> Optional[str]: speaker_metadata = api_get_speaker_info(participant) citizenship = speaker_metadata.get('citizenship') - return api_get_preflabel(citizenship) + if citizenship: + return api_get_preflabel(citizenship) def api_get_speaker_name(participant: str) -> str: speaker_metadata = api_get_speaker_info(participant) given_name = speaker_metadata.get('givenName') family_name = speaker_metadata.get('familyName') - return f'{given_name} {family_name}' + if given_name or family_name: + return f'{given_name} {family_name}' @cache From 32250e676657ebd11ddf35755630a47f3866f8a6 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 13 Jan 2026 15:59:35 +0100 Subject: [PATCH 10/39] fix multi-paragraph text --- backend/corpora/parliament/euparl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 3f7fb5f62..80d5d7382 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -137,7 +137,8 @@ def __init__(self): def api_convert_xml(speech_xml: str) -> str: speech_soup = BeautifulSoup(speech_xml, 'lxml-xml') - return speech_soup.find('speech').find('p').text + paragraphs = speech_soup.find('speech').find_all('p') + return '\n\n'.join(p.text for p in 
paragraphs) def api_get_language(languages: list[str]) -> str: From 1ffc09b8d285456e86dc69ac36dddd349c3cb78c Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 13 Jan 2026 16:49:23 +0100 Subject: [PATCH 11/39] fix no tag --- backend/corpora/parliament/euparl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 80d5d7382..0bd24ea1c 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -52,6 +52,7 @@ def subcorpora(self): def sources(self, **kwargs): for i, subcorpus in enumerate(self.subcorpora): logger.info(f'Extracting subcorpus: {subcorpus.__class__.__name__}') + for source in subcorpus.sources(**kwargs): filename, metadata = source metadata["subcorpus"] = i @@ -137,7 +138,7 @@ def __init__(self): def api_convert_xml(speech_xml: str) -> str: speech_soup = BeautifulSoup(speech_xml, 'lxml-xml') - paragraphs = speech_soup.find('speech').find_all('p') + paragraphs = speech_soup.find_all('p') return '\n\n'.join(p.text for p in paragraphs) From 8fe848ffab5a9ed7f457a338da2650f86004c968 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 13 Jan 2026 17:05:02 +0100 Subject: [PATCH 12/39] consistent party names --- backend/corpora/parliament/euparl.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 0bd24ea1c..72fc6bf62 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -211,6 +211,12 @@ def api_get_party_name(data) -> Optional[str]: party_id = api_get_party_id(data) return api_get_party_name_from_id(party_id) +_party_name_replacements = { + 'The Left': 'GUENGL', + 'Verts/ALE': 'GEFA', + 'S&D': 'SOCPESPASD', +} +'Replaces some party labels with the ones used in the 1999-2024 datasets' @cache def api_get_party_name_from_id(party_id: str) -> str: @@ -221,7 +227,8 @@ def 
api_get_party_name_from_id(party_id: str) -> str: ) if party_response.status_code != 200: return None - return party_response.json().get('data')[0].get('label') + label = party_response.json().get('data')[0].get('label') + return _party_name_replacements.get(label, label) def _api_get_party_full_name(data) -> Optional[str]: From 1176efaa581b88d94981db49cb7a7ff3d17a5c51 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 13 Jan 2026 17:20:02 +0100 Subject: [PATCH 13/39] cleaner code --- backend/corpora/parliament/euparl.py | 204 +++++++++++++-------------- 1 file changed, 100 insertions(+), 104 deletions(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 72fc6bf62..233ea7149 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -10,12 +10,10 @@ import requests from ianalyzer_readers.extract import Combined, JSON, Metadata, Pass, CSV, Constant from ianalyzer_readers.readers.core import Field +from ianalyzer_readers.readers.json import JSONReader from addcorpus.es_mappings import keyword_mapping, main_content_mapping -from addcorpus.python_corpora.corpus import ( - FieldDefinition, - JSONCorpusDefinition, -) +from addcorpus.python_corpora.corpus import FieldDefinition from addcorpus.python_corpora.filters import MultipleChoiceFilter from corpora.parliament.parliament import Parliament import corpora.parliament.utils.field_defaults as field_defaults @@ -110,7 +108,11 @@ def source2dicts(self, source, **kwargs): ) speech_id = field_defaults.speech_id() url = field_defaults.url() - source_archive = field_defaults.source_archive() + source_archive = FieldDefinition( + name='source_archive', + display_name='Source archive', + description='Source dataset for this document', + ) def __init__(self): self.fields = [ @@ -252,7 +254,7 @@ def first(values): if len(values): return values[0] -class ParliamentEuropeFromAPI(JSONCorpusDefinition): +class 
ParliamentEuropeFromAPI(JSONReader): """ Speeches of the European parliament, originally in or translated to English, provided through the Europarl Open Data API @@ -261,9 +263,6 @@ class ParliamentEuropeFromAPI(JSONCorpusDefinition): min_date = datetime(year=2024, month=7, day=7) max_date = datetime.now() - # Variables to hold interim metadata - speaker_metadata = {} - party_metadata = {} record_path = ['data', 'recorded_in_a_realization_of'] meta = [ ['data', 'had_participation', 'had_participant_person'], @@ -311,108 +310,105 @@ def iterate_data(self, data: Dict, metadata): filtered_data = data | { 'data': speeches_with_speaker} return super().iterate_data(filtered_data, metadata) - debate_id = field_defaults.debate_id() - debate_id.extractor = Metadata('debate_id') - - debate_title = field_defaults.debate_title() - debate_title.extractor = Metadata('debate_title') - - date = field_defaults.date(min_date, max_date) - date.extractor = Metadata('date') - - party = field_defaults.party() - party.extractor = Combined( - JSON( - "data.had_participation.had_participant_person", - transform=first, + fields = [ + Field( + name='debate_id', + extractor=Metadata('debate_id'), ), - Metadata('date'), - transform=api_get_party_name, - ) - - party_full = field_defaults.party_full() - party_full.extractor = Combined( - JSON( - "data.had_participation.had_participant_person", - transform=first, + Field( + name='debate_title', + extractor=Metadata('debate_title'), ), - Metadata('date'), - transform=_api_get_party_full_name, - ) - - party_id = field_defaults.party_id() - party_id.extractor = Combined( - JSON( - "data.had_participation.had_participant_person", - transform=first + Field( + name='date', + extractor=Metadata('date') ), - Metadata('date'), - transform=api_get_party_id, - ) - - sequence = field_defaults.sequence() - sequence.extractor = Metadata('sequence') - - original_language = field_defaults.language() - original_language.name = 'original_language' - 
original_language.extractor = JSON("originalLanguage", transform=api_get_language) - - speaker = field_defaults.speaker() - speaker.extractor = Pass( - JSON( - "data.had_participation.had_participant_person", - transform=first, + Field( + name='party', + extractor=Combined( + JSON( + "data.had_participation.had_participant_person", + transform=first, + ), + Metadata('date'), + transform=api_get_party_name, + ) ), - transform=api_get_speaker_name, - ) - - speaker_country = FieldDefinition( - name='speaker_country', - extractor=Pass( - JSON( - "data.had_participation.had_participant_person", - transform=first, + Field( + name='party_full', + extractor=Combined( + JSON( + "data.had_participation.had_participant_person", + transform=first, + ), + Metadata('date'), + transform=_api_get_party_full_name, + ) + ), + Field( + name='party_id', + extractor=Combined( + JSON( + "data.had_participation.had_participant_person", + transform=first + ), + Metadata('date'), + transform=api_get_party_id, + ) + ), + Field( + name='sequence', + extractor=Metadata('sequence') + ), + Field( + name='original_language', + extractor=JSON("originalLanguage", transform=api_get_language) + ), + Field( + name='speaker', + extractor=Pass( + JSON( + "data.had_participation.had_participant_person", + transform=first, + ), + transform=api_get_speaker_name, + ) + ), + Field( + name='speaker_country', + extractor=Pass( + JSON( + "data.had_participation.had_participant_person", + transform=first, + ), + transform=api_get_speaker_country, ), - transform=api_get_speaker_country, ), - ) - - speaker_id = field_defaults.speaker_id() - speaker_id.extractor = Pass( - JSON( - "data.had_participation.had_participant_person", - transform=first, + Field( + name='speaker_id', + extractor=Pass( + JSON( + "data.had_participation.had_participant_person", + transform=first, + ), + transform=api_get_speaker_id, + ) + ), + Field( + name='speech', + extractor=JSON( + "api:xmlFragment.en", + transform=api_convert_xml, + ) 
+ ), + Field( + name='id', + extractor=JSON("data.activity_id") + ), + Field( + name='source_archive', + extractor=Constant('European Parliament Open Data API') ), - transform=api_get_speaker_id, - ) - - speech = field_defaults.speech() - speech.extractor = JSON( - "api:xmlFragment.en", - transform=api_convert_xml, - ) - - speech_id = field_defaults.speech_id() - speech_id.extractor = JSON("data.activity_id") - - source_archive = field_defaults.source_archive() - source_archive.extractor = Constant('Europarl Open Data') - - fields = [ - date, - debate_id, - debate_title, - party, - party_full, - party_id, - sequence, - original_language, - speaker, - speaker_country, - speaker_id, - speech, - speech_id, - source_archive, ] From 947bca46700d5c4a56783d371027e4135f808401 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 13 Jan 2026 17:36:53 +0100 Subject: [PATCH 14/39] filter records with no speech content --- backend/corpora/parliament/euparl.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 233ea7149..63ff000eb 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -308,7 +308,12 @@ def iterate_data(self, data: Dict, metadata): if 'had_participation' in item ] filtered_data = data | { 'data': speeches_with_speaker} - return super().iterate_data(filtered_data, metadata) + records = list(super().iterate_data(filtered_data, metadata)) + filtered_records = [ + record for record in records + if record['data'].get('api:xmlFragment.en') + ] + return filtered_records fields = [ Field( From 4253526f518f93fd95302f9d0438843859f09479 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 13 Jan 2026 18:14:15 +0100 Subject: [PATCH 15/39] add original language code --- backend/corpora/parliament/euparl.py | 45 ++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git 
a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 63ff000eb..eb8417553 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -2,7 +2,7 @@ from functools import cache import logging import os -from typing import Optional +from typing import Optional, Dict, List, Tuple from bs4 import BeautifulSoup from django.conf import settings @@ -39,6 +39,7 @@ class ParliamentEurope(Parliament): image = 'euparl.jpeg' min_date = datetime(year=1999, month=7, day=20) max_date = datetime.now() + language_field = 'original_language_code' @property def subcorpora(self): @@ -83,6 +84,12 @@ def source2dicts(self, source, **kwargs): original_language.display_name='Original language' original_language.description = 'Original language of the speech' + original_language_code = FieldDefinition( + name='original_language_code', + es_mapping=keyword_mapping(), + hidden=True, + ) + speaker = field_defaults.speaker() speaker_id = field_defaults.speaker_id() speaker_country = FieldDefinition( @@ -98,14 +105,20 @@ def source2dicts(self, source, **kwargs): ) speaker_gender = field_defaults.speaker_gender() speaker_birth_year = field_defaults.speaker_birth_year() + speech = field_defaults.speech(language='en') speech.description = 'Speech translated to English' + speech.language = 'en' + speech_original = FieldDefinition( name='speech_original', display_name='Original speech', description='Speech in the original language', es_mapping=main_content_mapping(), + display_type='text_content', + language='dynamic', ) + speech_id = field_defaults.speech_id() url = field_defaults.url() source_archive = FieldDefinition( @@ -120,6 +133,7 @@ def __init__(self): self.debate_id, self.debate_title, self.original_language, + self.original_language_code, self.party, self.party_full, self.party_id, @@ -144,10 +158,23 @@ def api_convert_xml(speech_xml: str) -> str: return '\n\n'.join(p.text for p in paragraphs) -def 
api_get_language(languages: list[str]) -> str: - language = language_name(languages[0].split('/')[-1]) - return language +def api_get_language(languages: List[str]) -> Optional[str]: + label, _ = _api_get_language_data(languages[0]) + return label +def _api_get_language_code(languages: List[str]): + _, code = _api_get_language_data(languages[0]) + return code + +@cache +def _api_get_language_data(url: str) -> Tuple[str, str] | Tuple[None, None]: + response = requests.get(url) + if response.status_code != 200: + return None, None + soup = BeautifulSoup(response.content, 'lxml-xml') + label = soup.find('skos:prefLabel', {'xml:lang': 'en'}).text + code = soup.find('skos:notation', {'rdf:datatype': 'http://publications.europa.eu/ontology/euvoc#ISO_639_1'}).text + return label, code def api_get_speaker_id(participant: str) -> str: return participant.split('/')[-1] @@ -294,7 +321,7 @@ def sources(self, **kwargs): for speech in event.get('consists_of'): speech_id = speech.split("/")[-1] speech_response = requests.get( - f'https://data.europarl.europa.eu/api/v2/speeches/{speech_id}?include-output=xml_fragment&language=en&format=application%2Fld%2Bjson' + f'https://data.europarl.europa.eu/api/v2/speeches/{speech_id}?include-output=xml_fragment&format=application%2Fld%2Bjson' ) if speech_response.status_code != 200: continue @@ -369,6 +396,10 @@ def iterate_data(self, data: Dict, metadata): name='original_language', extractor=JSON("originalLanguage", transform=api_get_language) ), + Field( + name='original_language_code', + extractor=JSON('originalLanguage', transform=_api_get_language_code) + ), Field( name='speaker', extractor=Pass( @@ -454,6 +485,10 @@ def sources(self, **kwargs): transform=lambda value: language_name(value.lower()), ), ), + Field( + name='original_language_code', + extractor=CSV('language', transform=lambda value: value.lower()) + ), Field( name='party', extractor=CSV('epg_short'), From 193a8543c7eba0e5071291bbcf6686a72ec576b1 Mon Sep 17 00:00:00 2001 
From: Luka van der Plas Date: Tue, 13 Jan 2026 18:32:07 +0100 Subject: [PATCH 16/39] include original speech in APi corpus --- backend/corpora/parliament/euparl.py | 36 +++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index eb8417553..204da7a10 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -2,7 +2,7 @@ from functools import cache import logging import os -from typing import Optional, Dict, List, Tuple +from typing import Optional, Dict, List, Tuple, Union from bs4 import BeautifulSoup from django.conf import settings @@ -277,10 +277,31 @@ def _api_get_party_full_name_from_id(party_id: str) -> str: return party_response.json().get('data')[0].get('altLabel').get('en') +def _api_speech_key(language_code: str): + return f'api:xmlFragment.{language_code}' + + +def _api_get_original_speech(data): + _, code = _api_get_language_data(data['originalLanguage'][0]) + return data.get(_api_speech_key(code)) + + def first(values): if len(values): return values[0] +class _JSON(JSON): + ''' + Edited JSON extractor that also accepts 0 keys to return the object as-is + ''' + # TODO: make this change in ianalyzer_readers + + def _apply(self, data: Union[str, dict], key_index: int = 0, **kwargs): + if not len(self.keys): + return data + return super()._apply(data, key_index, **kwargs) + + class ParliamentEuropeFromAPI(JSONReader): """ Speeches of the European parliament, originally in or translated to English, @@ -338,7 +359,7 @@ def iterate_data(self, data: Dict, metadata): records = list(super().iterate_data(filtered_data, metadata)) filtered_records = [ record for record in records - if record['data'].get('api:xmlFragment.en') + if record['data'].get(_api_speech_key('en')) ] return filtered_records @@ -433,7 +454,16 @@ def iterate_data(self, data: Dict, metadata): Field( name='speech', extractor=JSON( - 
"api:xmlFragment.en", + _api_speech_key('en'), + transform=api_convert_xml, + ) + ), + Field( + name='speech_original', + extractor=Pass( + _JSON( + transform=_api_get_original_speech + ), transform=api_convert_xml, ) ), From 319f1e324fa85b51865258ea1f98940ecc149c61 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 14 Jan 2026 15:38:36 +0100 Subject: [PATCH 17/39] add speaker metadata in api corpus --- backend/corpora/parliament/euparl.py | 89 ++++++++++++++++++++++------ 1 file changed, 71 insertions(+), 18 deletions(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 204da7a10..a5194fbd3 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -210,6 +210,19 @@ def api_get_speaker_country(participant: str) -> Optional[str]: return api_get_preflabel(citizenship) +def _api_get_speaker_gender(participant: str) -> Optional[str]: + speaker_metadata = api_get_speaker_info(participant) + gender_uri = speaker_metadata.get('hasGender') + if gender_uri: + return gender_uri.split('/')[-1].title() + +def _api_get_speaker_birth_year(participant: str) -> Optional[int]: + speaker_metadata = api_get_speaker_info(participant) + birth_date = speaker_metadata.get('bday') + if birth_date: + d = datetime.strptime(birth_date, '%Y-%m-%d') + return d.year + def api_get_speaker_name(participant: str) -> str: speaker_metadata = api_get_speaker_info(participant) given_name = speaker_metadata.get('givenName') @@ -219,27 +232,34 @@ def api_get_speaker_name(participant: str) -> str: @cache -def api_get_party_id(data) -> dict: +def api_get_party_id(data: Tuple[str, datetime]) -> dict: participant, date = data + return _api_select_party(participant, date, 'def/ep-entities/EU_POLITICAL_GROUP') + +def _api_select_party(participant: str, date: datetime, classification: str) -> Optional[str]: speaker_metadata = api_get_speaker_info(participant) memberships = speaker_metadata.get('hasMembership') or [] for 
membership in memberships: - if ( - membership.get('membershipClassification') - != 'def/ep-entities/EU_POLITICAL_GROUP' - ): + if membership.get('membershipClassification') != classification: continue membership_period = membership.get('memberDuring') end_date = membership_period.get('endDate', datetime.now().strftime('%Y-%m-%d')) if membership_period.get('startDate') <= date <= end_date: return membership.get('organization').split('/')[-1] - return '' + +def _api_get_national_party_id(data: Tuple[str, datetime]) -> Optional[str]: + participant, date = data + return _api_select_party(participant, date, 'def/ep-entities/NATIONAL_POLITICAL_GROUP') def api_get_party_name(data) -> Optional[str]: party_id = api_get_party_id(data) return api_get_party_name_from_id(party_id) +def _api_get_national_party_name(data: Tuple[str, datetime]) -> Optional[str]: + party_id = _api_get_national_party_id(data) + return api_get_party_name_from_id(party_id, False) + _party_name_replacements = { 'The Left': 'GUENGL', 'Verts/ALE': 'GEFA', @@ -248,16 +268,13 @@ def api_get_party_name(data) -> Optional[str]: 'Replaces some party labels with the ones used in the 1999-2024 datasets' @cache -def api_get_party_name_from_id(party_id: str) -> str: - if not party_id: - return None - party_response = requests.get( - f'https://data.europarl.europa.eu/api/v2/corporate-bodies/{party_id}?format=application%2Fld%2Bjson&language=en' - ) - if party_response.status_code != 200: - return None - label = party_response.json().get('data')[0].get('label') - return _party_name_replacements.get(label, label) +def api_get_party_name_from_id(party_id: str, replacements=True) -> str: + data = _api_get_party_metadata(party_id) + if data: + label = data.get('data')[0].get('label') + if replacements: + return _party_name_replacements.get(label, label) + return label def _api_get_party_full_name(data) -> Optional[str]: @@ -267,6 +284,12 @@ def _api_get_party_full_name(data) -> Optional[str]: @cache def 
_api_get_party_full_name_from_id(party_id: str) -> str: + data = _api_get_party_metadata(party_id) + if data: + return data.get('data')[0].get('altLabel').get('en') + +@cache +def _api_get_party_metadata(party_id: str) -> Dict: if not party_id: return None party_response = requests.get( @@ -274,8 +297,7 @@ def _api_get_party_full_name_from_id(party_id: str) -> str: ) if party_response.status_code != 200: return None - return party_response.json().get('data')[0].get('altLabel').get('en') - + return party_response.json() def _api_speech_key(language_code: str): return f'api:xmlFragment.{language_code}' @@ -409,6 +431,17 @@ def iterate_data(self, data: Dict, metadata): transform=api_get_party_id, ) ), + Field( + name='party_national', + extractor=Combined( + JSON( + "data.had_participation.had_participant_person", + transform=first + ), + Metadata('date'), + transform=_api_get_national_party_name, + ) + ), Field( name='sequence', extractor=Metadata('sequence') @@ -441,6 +474,26 @@ def iterate_data(self, data: Dict, metadata): transform=api_get_speaker_country, ), ), + Field( + name='speaker_gender', + extractor=Pass( + JSON( + 'data.had_participation.had_participant_person', + transform=first, + ), + transform=_api_get_speaker_gender, + ) + ), + Field( + name='speaker_birth_year', + extractor=Pass( + JSON( + 'data.had_participation.had_participant_person', + transform=first, + ), + transform=_api_get_speaker_birth_year, + ) + ), Field( name='speaker_id', extractor=Pass( From 9f9e9f9efb74270c84b922b598f12706dce4e5e5 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 14 Jan 2026 15:46:49 +0100 Subject: [PATCH 18/39] update test data --- backend/corpora/parliament/conftest.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/backend/corpora/parliament/conftest.py b/backend/corpora/parliament/conftest.py index 65e27f927..fa23aee59 100644 --- a/backend/corpora/parliament/conftest.py +++ b/backend/corpora/parliament/conftest.py @@ -710,7 
+710,7 @@ def parliament_corpora_settings(settings): "universal suffrage and in Article 10 (3) of Parliament's Rules " 'of Procedure. Mr Crowley has asked for the floor on a point of ' 'order.', - 'language': 'Italian', + 'original_language': 'Italian', 'sequence': 1, 'source_archive': 'EUPDCorp', }, @@ -728,14 +728,17 @@ def parliament_corpora_settings(settings): "id": "MTG-PL-2024-11-13-OTH-2017005042457", "party": 'ECR', 'party_full': 'European Conservatives and Reformists', + 'party_national': None, "party_id": "7037", - "language": "English", + 'original_language': "English", "sequence": 1, "speaker": "Roberts Zīle", "speaker_country": "Latvia", + 'speaker_gender': 'Male', + 'speaker_birth_year': 1958, "speaker_id": "28615", - "speech": "Thank you, Commissioner McGuinness, and I would also like to thank you for your work on the AML package and many other issues, also for today's issues. Thank you very much.", - 'source_archive': 'Europarl Open Data', + "speech": "Thank you, Commissioner McGuinness, and I would also like to thank you for your work on the AML package and many other issues, also for today's issues. 
Thank you very much.\n\nThe concludes the item.", + 'source_archive': 'European Parliament Open Data API', } ], "n_documents": 6, From 5e732fb625ac8fab4d42520a2bf3aa3eb6332184 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 14 Jan 2026 16:51:47 +0100 Subject: [PATCH 19/39] url utility function for eu api --- backend/corpora/parliament/euparl.py | 30 ++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index a5194fbd3..c389ec1b5 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -3,6 +3,7 @@ import logging import os from typing import Optional, Dict, List, Tuple, Union +from urllib import parse from bs4 import BeautifulSoup from django.conf import settings @@ -152,6 +153,17 @@ def __init__(self): ] +def _api_url(path: str, query: Dict = dict()) -> str: + full_path = parse.urljoin('/api/v2/', path) + base_query = { + 'format': 'application/ld+json', + 'User-Agent': 'textcavator', + } + query_string = parse.urlencode(base_query | query) + url = parse.urlunsplit(['https', 'data.europarl.europa.eu', full_path, query_string, '']) + return url + + def api_convert_xml(speech_xml: str) -> str: speech_soup = BeautifulSoup(speech_xml, 'lxml-xml') paragraphs = speech_soup.find_all('p') @@ -193,9 +205,8 @@ def api_get_preflabel(url: str) -> Optional[str]: def api_get_speaker_info(participant: str) -> dict: '''Query metadata about the speaker, unless it's already been queried before''' speaker_id = api_get_speaker_id(participant) - speaker_response = requests.get( - f'https://data.europarl.europa.eu/api/v2/meps/{speaker_id}?format=application%2Fld%2Bjson' - ) + speaker_url = _api_url(f'meps/{speaker_id}') + speaker_response = requests.get(speaker_url) if not speaker_response.status_code == 200: logger.warning(f"No response for person {speaker_id}") return {} @@ -292,9 +303,8 @@ def 
_api_get_party_full_name_from_id(party_id: str) -> str: def _api_get_party_metadata(party_id: str) -> Dict: if not party_id: return None - party_response = requests.get( - f'https://data.europarl.europa.eu/api/v2/corporate-bodies/{party_id}?format=application%2Fld%2Bjson&language=en' - ) + party_url = _api_url(f'corporate-bodies/{party_id}', {'language': 'en'}) + party_response = requests.get(party_url) if party_response.status_code != 200: return None return party_response.json() @@ -345,8 +355,9 @@ def sources(self, **kwargs): date += timedelta(days=1) formatted_date = date.strftime('%Y-%m-%d') meeting_id = f'MTG-PL-{formatted_date}' + meeting_url = _api_url(f'meetings/{meeting_id}/activities') response = requests.get( - f'https://data.europarl.europa.eu/api/v2/meetings/{meeting_id}/activities?format=application%2Fld%2Bjson', + meeting_url, headers={'accept': 'application/ld+json'}, ) if response.status_code != 200: @@ -363,9 +374,8 @@ def sources(self, **kwargs): for speech in event.get('consists_of'): speech_id = speech.split("/")[-1] - speech_response = requests.get( - f'https://data.europarl.europa.eu/api/v2/speeches/{speech_id}?include-output=xml_fragment&format=application%2Fld%2Bjson' - ) + speech_url = _api_url(f'speeches/{speech_id}', {'include-output': 'xml_fragment'}) + speech_response = requests.get(speech_url) if speech_response.status_code != 200: continue sequence_in_debate += 1 From 1f8a317918fca59f348c980d120d5881449fd7c6 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 14 Jan 2026 16:52:20 +0100 Subject: [PATCH 20/39] remove url field --- backend/corpora/parliament/euparl.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index c389ec1b5..9ed61a642 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -121,7 +121,6 @@ def source2dicts(self, source, **kwargs): ) speech_id = field_defaults.speech_id() - url = 
field_defaults.url() source_archive = FieldDefinition( name='source_archive', display_name='Source archive', @@ -148,7 +147,6 @@ def __init__(self): self.speech, self.speech_original, self.speech_id, - self.url, self.source_archive, ] From 4e34080a9064f4b6b1dcea651c44d6069226a0da Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 14 Jan 2026 16:52:44 +0100 Subject: [PATCH 21/39] remove rdf test files --- .../tests/data/euparl/rdf/English.ttl | 22 - .../data/euparl/rdf/Events_and_structure.ttl | 98 --- .../rdf/MembersOfParliament_background.ttl | 807 ------------------ 3 files changed, 927 deletions(-) delete mode 100644 backend/corpora/parliament/tests/data/euparl/rdf/English.ttl delete mode 100644 backend/corpora/parliament/tests/data/euparl/rdf/Events_and_structure.ttl delete mode 100644 backend/corpora/parliament/tests/data/euparl/rdf/MembersOfParliament_background.ttl diff --git a/backend/corpora/parliament/tests/data/euparl/rdf/English.ttl b/backend/corpora/parliament/tests/data/euparl/rdf/English.ttl deleted file mode 100644 index f58eb00a2..000000000 --- a/backend/corpora/parliament/tests/data/euparl/rdf/English.ttl +++ /dev/null @@ -1,22 +0,0 @@ -@prefix foaf: . -@prefix xsd: . -@prefix rdf: . -@prefix lpv_eu: . -@prefix lpv: . -@prefix lp_eu: . -@prefix lp: . -@prefix dcterms: . - -lp_eu:1999-07-21-Speech-3-063 lpv:translatedText "(IT) Mr President, as a Member of the Italian national Parliament for the\n(The Northern League for the Independence of Padania), I did not vote for Professor Prodi in Rome as I considered he would be completely useless as head of government. I was then proved right as he lost the vote of confidence of the Italian Parliament. Reckoning also that a Roman idiot would still be that stupid wherever he was, which, incidently, is reflected in the symbol on the list which bears his name for the election of this Parliament, I cannot for consistency\"s sake express my faith in the President of the Commission. 
As a native of the Po valley who is Italian only by passport, I am fortunately immune from the national Christian Democrat type of opportunism which brings Berlusconi together with Mastella and De Mita and sees in Prodi not the impartial President of the Commissioners uninfluenced by the States, but the lavish dispenser of favours to a wide and varied assortment of Southern Italian profiteers. Although I hold some of the Commissioners in high esteem, I recall the old mafioso Neapolitan saying: ‘A fish rots from the head downwards’ and I therefore have to express my negative opinion of the Prodi Presidency."@en . -lp_eu:1999-07-21-Speech-3-063 lpv:unclassifiedMetadata "Lega Nord per l'indipendenza della Padania" . -lp_eu:1999-07-21-Speech-3-063 lpv:unclassifiedMetadata "Speroni (NI)" . - -lp_eu:2017-07-06-Speech-4-146-000 lpv:spokenText "Mr President, yesterday afternoon we had a lively debate, under Rule 153, on the subject of a single seat for this Parliament. Unfortunately, under that rule, it was not possible to have a resolution, but it was the clear will of this House that we bring forward a report to propose a treaty change. So, as Mr Weber and Mr Pittella are in their seats, could they please take note of the view of this House and, when the matter comes to the Conference of Presidents, could they please authorise that report?"@en . -lp_eu:2017-07-06-Speech-4-146-000 lpv:unclassifiedMetadata "(Applause)" . -lp_eu:2017-07-06-Speech-4-146-000 lpv:unclassifiedMetadata "Ashley Fox (ECR )." . - -lp_eu:1999-07-21_AgendaItem_5 dcterms:title "Statement by Mr Prodi, President-elect of the Commission"@en . - -lp_eu:2009-03-24_AgendaItem_30 dcterms:title "EIB and EBRD annual reports for 2007 - Community guarantee to the European Investment Bank (debate)"@en . - -lp_eu:2017-07-06_AgendaItem_13 dcterms:title "Composition of committees and delegations"@en . 
diff --git a/backend/corpora/parliament/tests/data/euparl/rdf/Events_and_structure.ttl b/backend/corpora/parliament/tests/data/euparl/rdf/Events_and_structure.ttl deleted file mode 100644 index 5981bbcba..000000000 --- a/backend/corpora/parliament/tests/data/euparl/rdf/Events_and_structure.ttl +++ /dev/null @@ -1,98 +0,0 @@ -@prefix foaf: . -@prefix xsd: . -@prefix rdf: . -@prefix lpv_eu: . -@prefix lpv: . -@prefix lp_eu: . -@prefix lp: . -@prefix dcterms: . - -lp_eu:1999-07-21-Speech-3-063 a lpv_eu:Speech . -lp_eu:1999-07-21-Speech-3-063 dcterms:date "1999-07-21"^^xsd:date . -lp_eu:1999-07-21-Speech-3-063 dcterms:isPartOf lp_eu:1999-07-21_AgendaItem_5 . -lp_eu:1999-07-21-Speech-3-063 dcterms:language "it"^^xsd:language . -lp_eu:1999-07-21-Speech-3-063 lpv:docno "en.19990721.5.3-063" . -lp_eu:1999-07-21-Speech-3-063 lpv:hasSubsequent lp_eu:1999-07-21-Speech-3-064 . -lp_eu:1999-07-21-Speech-3-063 lpv:speaker lp:EUmember_997 . - -lp_eu:2017-07-06-Speech-4-146-000 a lpv_eu:Speech . -lp_eu:2017-07-06-Speech-4-146-000 dcterms:date "2017-07-06"^^xsd:date . -lp_eu:2017-07-06-Speech-4-146-000 dcterms:isPartOf lp_eu:2017-07-06_AgendaItem_13 . -lp_eu:2017-07-06-Speech-4-146-000 dcterms:language "en"^^xsd:language . -lp_eu:2017-07-06-Speech-4-146-000 lpv:docno "en.20170706.13.4-146-000" . -lp_eu:2017-07-06-Speech-4-146-000 lpv:hasSubsequent lp_eu:2017-07-06-Speech-4-147-000 . -lp_eu:2017-07-06-Speech-4-146-000 lpv:speaker lp:EUmember_96957 . -lp_eu:2017-07-06-Speech-4-146-000 lpv:videoURI . - -lp_eu:1999-07-21_AgendaItem_5 a lpv_eu:AgendaItem . -lp_eu:1999-07-21_AgendaItem_5 dcterms:date "1999-07-21"^^xsd:date . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-049 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-050 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-051 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-052 . 
-lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-053 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-054 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-055 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-056 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-057 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-058 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-059 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-060 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-061 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-062 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-063 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-064 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-065 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-066 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-067 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-068 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-069 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-070 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-071 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-072 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:hasPart lp_eu:1999-07-21-Speech-3-996 . -lp_eu:1999-07-21_AgendaItem_5 dcterms:isPartOf lp_eu:1999-07-21_SessionDay . -lp_eu:1999-07-21_AgendaItem_5 lpv:docno "en.19990721.5" . -lp_eu:1999-07-21_AgendaItem_5 lpv:hasSubsequent lp_eu:1999-07-21_AgendaItem_6 . - -lp_eu:2009-03-24_AgendaItem_30 a lpv_eu:AgendaItem . 
-lp_eu:2009-03-24_AgendaItem_30 dcterms:date "2009-03-24"^^xsd:date . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-353 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-354 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-355 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-356 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-357 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-358 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-359 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-360 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-361 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-362 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-363 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-364 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-365 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-366 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-367 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-368 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-369 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-370 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-371 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-372 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-373 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-374 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-375 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-376 . 
-lp_eu:2009-03-24_AgendaItem_30 dcterms:hasPart lp_eu:2009-03-24-Speech-2-377 . -lp_eu:2009-03-24_AgendaItem_30 dcterms:isPartOf lp_eu:2009-03-24_SessionDay . -lp_eu:2009-03-24_AgendaItem_30 lpv:docno "en.20090324.30" . -lp_eu:2009-03-24_AgendaItem_30 lpv:hasSubsequent lp_eu:2009-03-24_AgendaItem_31 . - -lp_eu:2017-07-06_AgendaItem_13 a lpv_eu:AgendaItem . -lp_eu:2017-07-06_AgendaItem_13 dcterms:date "2017-07-06"^^xsd:date . -lp_eu:2017-07-06_AgendaItem_13 dcterms:hasPart lp_eu:2017-07-06-Speech-4-145-000 . -lp_eu:2017-07-06_AgendaItem_13 dcterms:hasPart lp_eu:2017-07-06-Speech-4-146-000 . -lp_eu:2017-07-06_AgendaItem_13 dcterms:hasPart lp_eu:2017-07-06-Speech-4-147-000 . -lp_eu:2017-07-06_AgendaItem_13 dcterms:hasPart lp_eu:2017-07-06-Speech-4-148-000 . -lp_eu:2017-07-06_AgendaItem_13 dcterms:hasPart lp_eu:2017-07-06-Speech-4-149-000 . -lp_eu:2017-07-06_AgendaItem_13 dcterms:isPartOf lp_eu:2017-07-06_SessionDay . -lp_eu:2017-07-06_AgendaItem_13 lpv:docno "en.20170706.13" . -lp_eu:2017-07-06_AgendaItem_13 lpv:hasSubsequent lp_eu:2017-07-06_AgendaItem_14 . diff --git a/backend/corpora/parliament/tests/data/euparl/rdf/MembersOfParliament_background.ttl b/backend/corpora/parliament/tests/data/euparl/rdf/MembersOfParliament_background.ttl deleted file mode 100644 index ea1cd2748..000000000 --- a/backend/corpora/parliament/tests/data/euparl/rdf/MembersOfParliament_background.ttl +++ /dev/null @@ -1,807 +0,0 @@ -@prefix foaf: . -@prefix lp: . -@prefix lpv: . -@prefix ns1: . -@prefix rdfs: . -@prefix xsd: . - - a lpv:EUParty ; - rdfs:label "AGRI", - "European Democratic Group" ; - lpv:acronym "AGRI" ; - lpv:featuredRoleDescriptions "European Democratic Group - Chair", - "European Democratic Group - Member", - "European Democratic Group - Member of the Bureau", - "European Democratic Group - Treasurer", - "European Democratic Group - Vice-Chair" . 
- - a lpv:EUParty ; - rdfs:label "ALDE", - "Group of the Alliance of Liberals and Democrats for Europe" ; - lpv:acronym "ALDE" ; - lpv:featuredRoleDescriptions "Group of the Alliance of Liberals and Democrats for Europe -", - "Group of the Alliance of Liberals and Democrats for Europe - Chair", - "Group of the Alliance of Liberals and Democrats for Europe - Member", - "Group of the Alliance of Liberals and Democrats for Europe - Member of the Bureau", - "Group of the Alliance of Liberals and Democrats for Europe - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "COM", - "Communist and Allies Group" ; - lpv:acronym "COM" ; - lpv:featuredRoleDescriptions "Communist and Allies Group -", - "Communist and Allies Group - Chair", - "Communist and Allies Group - Member", - "Communist and Allies Group - Treasurer", - "Communist and Allies Group - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "DR", - "Technical Group of the European Right" ; - lpv:acronym "DR" ; - lpv:featuredRoleDescriptions "Technical Group of the European Right -", - "Technical Group of the European Right - Chair", - "Technical Group of the European Right - Member", - "Technical Group of the European Right - Member of the Bureau", - "Technical Group of the European Right - Treasurer", - "Technical Group of the European Right - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "EDA", - "Group of the European Democratic Alliance" ; - lpv:acronym "EDA" ; - lpv:featuredRoleDescriptions "Group of the European Democratic Alliance - Chair", - "Group of the European Democratic Alliance - Member", - "Group of the European Democratic Alliance - Member of the Bureau", - "Group of the European Democratic Alliance - Treasurer", - "Group of the European Democratic Alliance - Vice-Chair" . 
- - a lpv:EUParty ; - rdfs:label "EDD", - "Group for a Europe of Democracies and Diversities" ; - lpv:acronym "EDD" ; - lpv:featuredRoleDescriptions "Group for a Europe of Democracies and Diversities - Chair", - "Group for a Europe of Democracies and Diversities - Co-Chair", - "Group for a Europe of Democracies and Diversities - Member", - "Group for a Europe of Democracies and Diversities - Member of the Bureau" . - - a lpv:EUParty ; - rdfs:label "EFDD", - "Europe of Freedom and Direct Democracy Group" ; - lpv:acronym "EFDD" ; - lpv:featuredRoleDescriptions "Europe of Freedom and Direct Democracy Group - Chair of the Bureau", - "Europe of Freedom and Direct Democracy Group - Co-Chair", - "Europe of Freedom and Direct Democracy Group - Member", - "Europe of Freedom and Direct Democracy Group - Member of the Bureau", - "Europe of Freedom and Direct Democracy Group - Treasurer", - "Europe of Freedom and Direct Democracy Group - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "ELDR", - "Group of the European Liberal, Democrat and Reform Party" ; - lpv:acronym "ELDR" ; - lpv:featuredRoleDescriptions "Group of the European Liberal, Democrat and Reform Party -", - "Group of the European Liberal, Democrat and Reform Party - Chair", - "Group of the European Liberal, Democrat and Reform Party - Member", - "Group of the European Liberal, Democrat and Reform Party - Member of the Bureau", - "Group of the European Liberal, Democrat and Reform Party - Treasurer", - "Group of the European Liberal, Democrat and Reform Party - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "EN", - "Europe of Nations Group (Coordination Group)" ; - lpv:acronym "EN" ; - lpv:featuredRoleDescriptions "Europe of Nations Group (Coordination Group) -", - "Europe of Nations Group (Coordination Group) - Chair", - "Europe of Nations Group (Coordination Group) - Member", - "Europe of Nations Group (Coordination Group) - Treasurer", - "Europe of Nations Group (Coordination Group) - Vice-Chair" . 
- - a lpv:EUParty ; - rdfs:label "EPD", - "Group of European Progressive Democrats" ; - lpv:acronym "EPD" ; - lpv:featuredRoleDescriptions "Group of European Progressive Democrats - Chair", - "Group of European Progressive Democrats - Member", - "Group of European Progressive Democrats - Member of the Bureau", - "Group of European Progressive Democrats - Treasurer", - "Group of European Progressive Democrats - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "EPP", - "Group of the European People's Party (Christian Democrats)", - "Group of the European People's Party (Christian-Democratic Group)" ; - lpv:acronym "EPP" ; - lpv:featuredRoleDescriptions "Group of the European People's Party (Christian Democrats) -", - "Group of the European People's Party (Christian Democrats) - Chair", - "Group of the European People's Party (Christian Democrats) - Member", - "Group of the European People's Party (Christian Democrats) - Member of the Bureau", - "Group of the European People's Party (Christian Democrats) - Vice-Chair", - "Group of the European People's Party (Christian-Democratic Group) -", - "Group of the European People's Party (Christian-Democratic Group) - Chair", - "Group of the European People's Party (Christian-Democratic Group) - Member", - "Group of the European People's Party (Christian-Democratic Group) - Member of the Bureau", - "Group of the European People's Party (Christian-Democratic Group) - Treasurer", - "Group of the European People's Party (Christian-Democratic Group) - Vice-Chair" . 
- - a lpv:EUParty ; - rdfs:label "EPP-ED", - "Group of the European People's Party (Christian Democrats) and European Democrats" ; - lpv:acronym "EPP-ED" ; - lpv:featuredRoleDescriptions "Group of the European People's Party (Christian Democrats) and European Democrats -", - "Group of the European People's Party (Christian Democrats) and European Democrats - Chair", - "Group of the European People's Party (Christian Democrats) and European Democrats - Member", - "Group of the European People's Party (Christian Democrats) and European Democrats - Member of the Bureau", - "Group of the European People's Party (Christian Democrats) and European Democrats - Treasurer", - "Group of the European People's Party (Christian Democrats) and European Democrats - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "ER", - "Group of the European Right" ; - lpv:acronym "ER" ; - lpv:featuredRoleDescriptions "Group of the European Right -", - "Group of the European Right - Chair", - "Group of the European Right - Member", - "Group of the European Right - Member of the Bureau", - "Group of the European Right - Treasurer", - "Group of the European Right - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "ERA", - "Group of the European Radical Alliance" ; - lpv:acronym "ERA" ; - lpv:featuredRoleDescriptions "Group of the European Radical Alliance - Chair", - "Group of the European Radical Alliance - Member", - "Group of the European Radical Alliance - Member of the Bureau", - "Group of the European Radical Alliance - Treasurer" . 
- - a lpv:EUParty ; - rdfs:label "Confederal Group of the European United Left", - "EUL", - "Group for the European United Left" ; - lpv:acronym "EUL" ; - lpv:featuredRoleDescriptions "Confederal Group of the European United Left - Chair", - "Confederal Group of the European United Left - Member", - "Confederal Group of the European United Left - Treasurer", - "Confederal Group of the European United Left - Vice-Chair", - "Group for the European United Left - Chair", - "Group for the European United Left - Member", - "Group for the European United Left - Member of the Bureau", - "Group for the European United Left - Treasurer", - "Group for the European United Left - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "Confederal Group of the European United Left - Nordic Green Left", - "Confederal Group of the European United Left/Nordic Green Left", - "EUL/NGL" ; - lpv:acronym "EUL/NGL" ; - lpv:featuredRoleDescriptions "Confederal Group of the European United Left - Nordic Green Left -", - "Confederal Group of the European United Left - Nordic Green Left - Chair", - "Confederal Group of the European United Left - Nordic Green Left - Member", - "Confederal Group of the European United Left - Nordic Green Left - Member of the Bureau", - "Confederal Group of the European United Left - Nordic Green Left - Treasurer", - "Confederal Group of the European United Left - Nordic Green Left - Vice-Chair", - "Confederal Group of the European United Left/Nordic Green Left -", - "Confederal Group of the European United Left/Nordic Green Left - Chair", - "Confederal Group of the European United Left/Nordic Green Left - Member", - "Confederal Group of the European United Left/Nordic Green Left - Member of the Bureau", - "Confederal Group of the European United Left/Nordic Green Left - Treasurer", - "Confederal Group of the European United Left/Nordic Green Left - Vice-Chair" . 
- - a lpv:EUParty ; - rdfs:label "FE", - "Forza Europa Group" ; - lpv:acronym "FE" ; - lpv:featuredRoleDescriptions "Forza Europa Group - Chair", - "Forza Europa Group - Member", - "Forza Europa Group - Member of the Bureau", - "Forza Europa Group - Treasurer", - "Forza Europa Group - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "G", - "The Green Group in the European Parliament" ; - lpv:acronym "G" ; - lpv:featuredRoleDescriptions "The Green Group in the European Parliament - Chair", - "The Green Group in the European Parliament - Member", - "The Green Group in the European Parliament - Member of the Bureau", - "The Green Group in the European Parliament - Treasurer", - "The Green Group in the European Parliament - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "G/EFA", - "Group of the Greens/European Free Alliance" ; - lpv:acronym "G/EFA" ; - lpv:featuredRoleDescriptions "Group of the Greens/European Free Alliance -", - "Group of the Greens/European Free Alliance - Chair", - "Group of the Greens/European Free Alliance - Co-Chair", - "Group of the Greens/European Free Alliance - Member", - "Group of the Greens/European Free Alliance - Member of the Bureau", - "Group of the Greens/European Free Alliance - Treasurer", - "Group of the Greens/European Free Alliance - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "Group of Independents for a Europe of Nations", - "I-EN" ; - lpv:acronym "I-EN" ; - lpv:featuredRoleDescriptions "Group of Independents for a Europe of Nations -", - "Group of Independents for a Europe of Nations - Chair", - "Group of Independents for a Europe of Nations - Co-Chair", - "Group of Independents for a Europe of Nations - Member", - "Group of Independents for a Europe of Nations - Treasurer", - "Group of Independents for a Europe of Nations - Vice-Chair" . 
- - a lpv:EUParty ; - rdfs:label "ITS", - "Identity, Tradition and Sovereignty Group" ; - lpv:acronym "ITS" ; - lpv:featuredRoleDescriptions "Identity, Tradition and Sovereignty Group - Chair", - "Identity, Tradition and Sovereignty Group - Member", - "Identity, Tradition and Sovereignty Group - Member of the Bureau", - "Identity, Tradition and Sovereignty Group - Treasurer", - "Identity, Tradition and Sovereignty Group - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "LD", - "Liberal and Democratic Group" ; - lpv:acronym "LD" ; - lpv:featuredRoleDescriptions "Liberal and Democratic Group -", - "Liberal and Democratic Group - Chair", - "Liberal and Democratic Group - Member", - "Liberal and Democratic Group - Treasurer", - "Liberal and Democratic Group - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "LDR", - "Liberal and Democratic Reformist Group" ; - lpv:acronym "LDR" ; - lpv:featuredRoleDescriptions "Liberal and Democratic Reformist Group -", - "Liberal and Democratic Reformist Group - Chair", - "Liberal and Democratic Reformist Group - Member", - "Liberal and Democratic Reformist Group - Treasurer", - "Liberal and Democratic Reformist Group - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "LU", - "Left Unity" ; - lpv:acronym "LU" ; - lpv:featuredRoleDescriptions "Left Unity -", - "Left Unity - Chair", - "Left Unity - Member", - "Left Unity - Treasurer", - "Left Unity - Vice-Chair" . 
- - a lpv:EUParty ; - rdfs:label "Group of the Party of European Socialists", - "PES", - "Socialist Group in the European Parliament" ; - lpv:acronym "PES" ; - lpv:featuredRoleDescriptions "Group of the Party of European Socialists -", - "Group of the Party of European Socialists - Chair", - "Group of the Party of European Socialists - Member", - "Group of the Party of European Socialists - Member of the Bureau", - "Group of the Party of European Socialists - Treasurer", - "Group of the Party of European Socialists - Vice-Chair", - "Socialist Group in the European Parliament -", - "Socialist Group in the European Parliament - Chair", - "Socialist Group in the European Parliament - Member", - "Socialist Group in the European Parliament - Treasurer", - "Socialist Group in the European Parliament - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "Group of the Progressive Alliance of Socialists and Democrats in the European Parliament", - "S&D" ; - lpv:acronym "S&D" ; - lpv:featuredRoleDescriptions "Group of the Progressive Alliance of Socialists and Democrats in the European Parliament -", - "Group of the Progressive Alliance of Socialists and Democrats in the European Parliament - Chair", - "Group of the Progressive Alliance of Socialists and Democrats in the European Parliament - Member", - "Group of the Progressive Alliance of Socialists and Democrats in the European Parliament - Treasurer", - "Group of the Progressive Alliance of Socialists and Democrats in the European Parliament - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "SOC", - "Socialist Group" ; - lpv:acronym "SOC" ; - lpv:featuredRoleDescriptions "Socialist Group -", - "Socialist Group - Chair", - "Socialist Group - Member", - "Socialist Group - Member of the Bureau", - "Socialist Group - Treasurer", - "Socialist Group - Vice-Chair" . 
- - a lpv:EUParty ; - rdfs:label "Group Union for Europe", - "UFE" ; - lpv:acronym "UFE" ; - lpv:featuredRoleDescriptions "Group Union for Europe - Chair", - "Group Union for Europe - Member", - "Group Union for Europe - Member of the Bureau", - "Group Union for Europe - Treasurer", - "Group Union for Europe - Vice-Chair" . - -lp:EUmember_96957 a ns1:MemberOfParliament ; - ns1:MEP_ID "96957" ; - ns1:countryOfRepresentation lp:EUCountry_GB ; - ns1:dateOfBirth "1969-11-15"^^xsd:date ; - ns1:placeOfBirth "Sutton Coldfield" ; - ns1:politicalFunction lp:pf00f88c83af0e95f9d4ba9f0c9bd0a093, - lp:pf0f78c03fefe0985f764dea70022d0043, - lp:pf2bdc66e4eb24629cd4d221b54b45a3ca, - lp:pf350caf782f08b96e2a3af35fc5616001, - lp:pf399a6d922ff4c20a852fc22d6600906e, - lp:pf52bab8d1a349df571ebeb55761db513c, - lp:pf58aecb3742c3fe35d5080e16645b07c8, - lp:pf8240375d0714e974c78fb753d862122a, - lp:pf99c4158caafcfba5bc9fda500f3d0c60, - lp:pfba1033551486f93285992b1f262dc79e, - lp:pfc8ae7a828c60553e27378b19a12a862a, - lp:pfcedffd5e8d734d1203db8fa3f7038f78, - lp:pfd9732399a8c6bbb7c5e041837244a2f8, - lp:pfe61b1fb146891c9d4d0bbf88349d10f8, - lp:pfe6b98484dd76e8c4c9e3e9783f2b0583, - lp:pff56829256d54fb49a895889d87467924 ; - foaf:name "Ashley Fox" . 
- -lp:EUmember_997 a ns1:MemberOfParliament ; - ns1:MEP_ID "997" ; - ns1:countryOfRepresentation lp:EUCountry_IT ; - ns1:dateOfBirth "1946-10-04"^^xsd:date ; - ns1:placeOfBirth "Busto Arsizio" ; - ns1:politicalFunction lp:pf00850f07d83ff7511d816f2906631631, - lp:pf1617b45a9e5df3163b8429577b4a3555, - lp:pf18c223c3363102672b51b21c184056cc, - lp:pf1ef5ada5f958b0aba640e490e460a221, - lp:pf2170f06ac1f449772b48a5e844b1f7cd, - lp:pf232468ecbffeb758cc38b17eb407a0e8, - lp:pf27013a6352e31f1747cb0925a8d315a9, - lp:pf2cff760ac2aa89d9fb6a7c65e2055670, - lp:pf3520fdebb83d60437179f91787dfecf9, - lp:pf36596b04e9f7439a17fd0764130a0a16, - lp:pf4009f4a6d184651935c94eaf0e8e3b31, - lp:pf42381eeb3b04b2383aa048ad07ce1723, - lp:pf46b57676499000ff10b62596ebaaff0a, - lp:pf4e79d2756b56075d1ba8ead84562bd8b, - lp:pf50f29b8e60bb70630a741c75d9e15850, - lp:pf5520a69a514cdd1f889ff24e68b34128, - lp:pf57173b89fb6ac13826d452fc74067ff3, - lp:pf5a2de253c92e0c12b84cac7c1e155666, - lp:pf6356d7122a0fb25ece5db75f61c19671, - lp:pf64ca0f3f6dde49e4a542560ed5122891, - lp:pf6579712af153fc6ce1b0db2de821f92d, - lp:pf76dd5ad131d30ed3306a78709e7ce251, - lp:pf870c0cf3766606d20cec0aecf1851569, - lp:pf8e8990b89425d1d5fefd0526973a771a, - lp:pf8e9b064d23472496eb5097e9e61f664f, - lp:pf9d156e058e9b318c9b9df62e3a5ab42a, - lp:pfb592b70d3ae90102ecf4f9cf0dbbf362, - lp:pfb8d606b79e721a9f0dd3232ebbe34ca4, - lp:pfbb4b02d1622a4615842f6f422386345c, - lp:pfc0aa2605dab67ed5619e571ffaa06fbc, - lp:pfc4defbc2ab4682bb4ffda87d34391823, - lp:pfd69065d8553a1a36e8aa0811ef83e0d3, - lp:pfd941468c3faf619ad5d0a0156420880c, - lp:pfddd269fab021f83ba0a89efc4298e439, - lp:pfe33ac318848090cbc8bce7f6e4f26f6f, - lp:pff4a2c78f09a9956180c64fe9dc17b9c1, - lp:pff75a3ca6e96a14db43823be10fb693d2, - lp:pffa15145f38d5a44e3ef93a4534a2df63, - lp:pffeee7811d1a25e86cc33849665eeb484 ; - foaf:name "Francesco Enrico Speroni" . - -lp:EUCountry_GB rdfs:label "United Kingdom"@en . - -lp:EUCountry_IT rdfs:label "Italy"@en . 
- - a lpv:EUParty ; - rdfs:label "EFD", - "Europe of freedom and democracy Group" ; - lpv:acronym "EFD" ; - lpv:featuredRoleDescriptions "Europe of freedom and democracy Group -", - "Europe of freedom and democracy Group - Chair of the Bureau", - "Europe of freedom and democracy Group - Co-Chair", - "Europe of freedom and democracy Group - Member", - "Europe of freedom and democracy Group - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "IND/DEM", - "Independence/Democracy Group" ; - lpv:acronym "IND/DEM" ; - lpv:featuredRoleDescriptions "Independence/Democracy Group - Chair", - "Independence/Democracy Group - Chair of the Bureau", - "Independence/Democracy Group - Co-Chair", - "Independence/Democracy Group - Member", - "Independence/Democracy Group - Member of the Bureau", - "Independence/Democracy Group - Treasurer" . - - a lpv:EUParty ; - rdfs:label "RBW", - "Rainbow Group in the European Parliament", - "Rainbow Group: Federation of the Green-Alternative European Links, Agelev-Ecolo, the Danish People's Movement against Membership of the European Community and the European Free Alliance in the European Parliament" ; - lpv:acronym "RBW" ; - lpv:featuredRoleDescriptions "Rainbow Group in the European Parliament - Chair", - "Rainbow Group in the European Parliament - Member", - "Rainbow Group in the European Parliament - Member of the Bureau", - "Rainbow Group in the European Parliament - Treasurer", - "Rainbow Group: Federation of the Green-Alternative European Links, Agelev-Ecolo, the Danish People's Movement against Membership of the European Community and the European Free Alliance in the European Parliament - Chair", - "Rainbow Group: Federation of the Green-Alternative European Links, Agelev-Ecolo, the Danish People's Movement against Membership of the European Community and the European Free Alliance in the European Parliament - Member" . 
- - a lpv:EUParty ; - rdfs:label "UEN", - "Union for Europe of the Nations Group" ; - lpv:acronym "UEN" ; - lpv:featuredRoleDescriptions "Union for Europe of the Nations Group -", - "Union for Europe of the Nations Group - Chair", - "Union for Europe of the Nations Group - Co-Chair", - "Union for Europe of the Nations Group - Member", - "Union for Europe of the Nations Group - Treasurer", - "Union for Europe of the Nations Group - Vice-Chair" . - -lp:pf00850f07d83ff7511d816f2906631631 a ns1:PoliticalFunction ; - ns1:beginning "1999-07-21"^^xsd:date ; - ns1:end "2002-01-14"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf00f88c83af0e95f9d4ba9f0c9bd0a093 a ns1:PoliticalFunction ; - ns1:beginning "2014-07-01"^^xsd:date ; - ns1:end "2017-03-31"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf0f78c03fefe0985f764dea70022d0043 a ns1:PoliticalFunction ; - ns1:beginning "2012-01-25"^^xsd:date ; - ns1:end "2014-06-30"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf1617b45a9e5df3163b8429577b4a3555 a ns1:PoliticalFunction ; - ns1:beginning "2012-01-19"^^xsd:date ; - ns1:end "2014-04-02"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf18c223c3363102672b51b21c184056cc a ns1:PoliticalFunction ; - ns1:beginning "1991-10-10"^^xsd:date ; - ns1:end "1992-01-14"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf1ef5ada5f958b0aba640e490e460a221 a ns1:PoliticalFunction ; - ns1:beginning "2000-11-16"^^xsd:date ; - ns1:end "2002-01-14"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf2170f06ac1f449772b48a5e844b1f7cd a ns1:PoliticalFunction ; - ns1:beginning "1999-07-20"^^xsd:date ; - ns1:end "1999-07-21"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf232468ecbffeb758cc38b17eb407a0e8 a ns1:PoliticalFunction ; - ns1:beginning "1999-07-20"^^xsd:date ; - ns1:end "2004-07-19"^^xsd:date ; - ns1:institution ; - ns1:role . 
- -lp:pf27013a6352e31f1747cb0925a8d315a9 a ns1:PoliticalFunction ; - ns1:beginning "1992-01-15"^^xsd:date ; - ns1:end "1992-10-25"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf2bdc66e4eb24629cd4d221b54b45a3ca a ns1:PoliticalFunction ; - ns1:beginning "2009-09-15"^^xsd:date ; - ns1:end "2012-01-18"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf2cff760ac2aa89d9fb6a7c65e2055670 a ns1:PoliticalFunction ; - ns1:beginning "1989-07-26"^^xsd:date ; - ns1:end "1991-10-09"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf350caf782f08b96e2a3af35fc5616001 a ns1:PoliticalFunction ; - ns1:beginning "2009-07-21"^^xsd:date ; - ns1:end "2012-01-18"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf3520fdebb83d60437179f91787dfecf9 a ns1:PoliticalFunction ; - ns1:beginning "2007-01-31"^^xsd:date ; - ns1:end "2009-07-13"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf36596b04e9f7439a17fd0764130a0a16 a ns1:PoliticalFunction ; - ns1:beginning "1992-01-15"^^xsd:date ; - ns1:end "1994-05-11"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf399a6d922ff4c20a852fc22d6600906e a ns1:PoliticalFunction ; - ns1:beginning "2014-07-01"^^xsd:date ; - ns1:end "2017-03-31"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf4009f4a6d184651935c94eaf0e8e3b31 a ns1:PoliticalFunction ; - ns1:beginning "2007-01-15"^^xsd:date ; - ns1:end "2007-01-30"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf42381eeb3b04b2383aa048ad07ce1723 a ns1:PoliticalFunction ; - ns1:beginning "2009-07-14"^^xsd:date ; - ns1:end "2014-06-30"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf46b57676499000ff10b62596ebaaff0a a ns1:PoliticalFunction ; - ns1:beginning "2002-01-17"^^xsd:date ; - ns1:end "2004-07-19"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf4e79d2756b56075d1ba8ead84562bd8b a ns1:PoliticalFunction ; - ns1:beginning "2009-07-16"^^xsd:date ; - ns1:end "2011-10-04"^^xsd:date ; - ns1:institution ; - ns1:role . 
- -lp:pf50f29b8e60bb70630a741c75d9e15850 a ns1:PoliticalFunction ; - ns1:beginning "2004-07-20"^^xsd:date ; - ns1:end "2004-07-20"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf52bab8d1a349df571ebeb55761db513c a ns1:PoliticalFunction ; - ns1:beginning "2012-01-19"^^xsd:date ; - ns1:end "2012-01-24"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf5520a69a514cdd1f889ff24e68b34128 a ns1:PoliticalFunction ; - ns1:beginning "2001-10-03"^^xsd:date ; - ns1:end "2004-07-19"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf57173b89fb6ac13826d452fc74067ff3 a ns1:PoliticalFunction ; - ns1:beginning "1989-07-26"^^xsd:date ; - ns1:end "1992-01-14"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf58aecb3742c3fe35d5080e16645b07c8 a ns1:PoliticalFunction ; - ns1:beginning "2011-03-09"^^xsd:date ; - ns1:end "2014-06-30"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf5a2de253c92e0c12b84cac7c1e155666 a ns1:PoliticalFunction ; - ns1:beginning "2004-07-20"^^xsd:date ; - ns1:end "2009-07-13"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf6356d7122a0fb25ece5db75f61c19671 a ns1:PoliticalFunction ; - ns1:beginning "1989-07-26"^^xsd:date ; - ns1:end "1992-01-14"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf64ca0f3f6dde49e4a542560ed5122891 a ns1:PoliticalFunction ; - ns1:beginning "1992-10-26"^^xsd:date ; - ns1:end "1994-05-11"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf6579712af153fc6ce1b0db2de821f92d a ns1:PoliticalFunction ; - ns1:beginning "2004-09-14"^^xsd:date ; - ns1:end "2007-01-14"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf76dd5ad131d30ed3306a78709e7ce251 a ns1:PoliticalFunction ; - ns1:beginning "2006-04-27"^^xsd:date ; - ns1:end "2006-12-12"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf8240375d0714e974c78fb753d862122a a ns1:PoliticalFunction ; - ns1:beginning "2014-09-16"^^xsd:date ; - ns1:end "2014-11-11"^^xsd:date ; - ns1:institution ; - ns1:role . 
- -lp:pf870c0cf3766606d20cec0aecf1851569 a ns1:PoliticalFunction ; - ns1:beginning "1994-04-21"^^xsd:date ; - ns1:end "1994-05-11"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf8e8990b89425d1d5fefd0526973a771a a ns1:PoliticalFunction ; - ns1:beginning "2007-01-15"^^xsd:date ; - ns1:end "2007-01-30"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf8e9b064d23472496eb5097e9e61f664f a ns1:PoliticalFunction ; - ns1:beginning "1989-07-25"^^xsd:date ; - ns1:end "1994-05-11"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf99c4158caafcfba5bc9fda500f3d0c60 a ns1:PoliticalFunction ; - ns1:beginning "2012-01-19"^^xsd:date ; - ns1:end "2014-06-30"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pf9d156e058e9b318c9b9df62e3a5ab42a a ns1:PoliticalFunction ; - ns1:beginning "2004-07-21"^^xsd:date ; - ns1:end "2007-01-14"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pfb592b70d3ae90102ecf4f9cf0dbbf362 a ns1:PoliticalFunction ; - ns1:beginning "2010-01-07"^^xsd:date ; - ns1:end "2012-01-18"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pfb8d606b79e721a9f0dd3232ebbe34ca4 a ns1:PoliticalFunction ; - ns1:beginning "2006-12-13"^^xsd:date ; - ns1:end "2009-07-13"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pfba1033551486f93285992b1f262dc79e a ns1:PoliticalFunction ; - ns1:beginning "2009-07-14"^^xsd:date ; - ns1:end "2011-03-08"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pfbb4b02d1622a4615842f6f422386345c a ns1:PoliticalFunction ; - ns1:beginning "2009-07-16"^^xsd:date ; - ns1:end "2012-01-18"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pfc0aa2605dab67ed5619e571ffaa06fbc a ns1:PoliticalFunction ; - ns1:beginning "2007-01-31"^^xsd:date ; - ns1:end "2009-07-13"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pfc4defbc2ab4682bb4ffda87d34391823 a ns1:PoliticalFunction ; - ns1:beginning "2002-01-17"^^xsd:date ; - ns1:end "2004-07-19"^^xsd:date ; - ns1:institution ; - ns1:role . 
- -lp:pfc8ae7a828c60553e27378b19a12a862a a ns1:PoliticalFunction ; - ns1:beginning "2014-07-01"^^xsd:date ; - ns1:end "2017-03-31"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pfcedffd5e8d734d1203db8fa3f7038f78 a ns1:PoliticalFunction ; - ns1:beginning "2009-07-16"^^xsd:date ; - ns1:end "2011-02-06"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pfd69065d8553a1a36e8aa0811ef83e0d3 a ns1:PoliticalFunction ; - ns1:beginning "1992-01-15"^^xsd:date ; - ns1:end "1994-05-11"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pfd941468c3faf619ad5d0a0156420880c a ns1:PoliticalFunction ; - ns1:beginning "2007-01-31"^^xsd:date ; - ns1:end "2009-07-13"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pfd9732399a8c6bbb7c5e041837244a2f8 a ns1:PoliticalFunction ; - ns1:beginning "2012-01-19"^^xsd:date ; - ns1:end "2014-06-30"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pfddd269fab021f83ba0a89efc4298e439 a ns1:PoliticalFunction ; - ns1:beginning "1999-09-09"^^xsd:date ; - ns1:end "1999-09-16"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pfe33ac318848090cbc8bce7f6e4f26f6f a ns1:PoliticalFunction ; - ns1:beginning "1999-07-22"^^xsd:date ; - ns1:end "2001-10-02"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pfe61b1fb146891c9d4d0bbf88349d10f8 a ns1:PoliticalFunction ; - ns1:beginning "2009-07-14"^^xsd:date ; - ns1:end "2014-06-30"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pfe6b98484dd76e8c4c9e3e9783f2b0583 a ns1:PoliticalFunction ; - ns1:beginning "2011-02-07"^^xsd:date ; - ns1:end "2012-01-18"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pff4a2c78f09a9956180c64fe9dc17b9c1 a ns1:PoliticalFunction ; - ns1:beginning "2004-07-21"^^xsd:date ; - ns1:end "2006-04-26"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pff56829256d54fb49a895889d87467924 a ns1:PoliticalFunction ; - ns1:beginning "2014-07-01"^^xsd:date ; - ns1:end "2017-03-31"^^xsd:date ; - ns1:institution ; - ns1:role . 
- -lp:pff75a3ca6e96a14db43823be10fb693d2 a ns1:PoliticalFunction ; - ns1:beginning "2009-07-14"^^xsd:date ; - ns1:end "2014-06-30"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pffa15145f38d5a44e3ef93a4534a2df63 a ns1:PoliticalFunction ; - ns1:beginning "2004-07-21"^^xsd:date ; - ns1:end "2006-04-26"^^xsd:date ; - ns1:institution ; - ns1:role . - -lp:pffeee7811d1a25e86cc33849665eeb484 a ns1:PoliticalFunction ; - ns1:beginning "1989-07-25"^^xsd:date ; - ns1:end "1994-04-20"^^xsd:date ; - ns1:institution ; - ns1:role . - - a lpv:EUParty ; - rdfs:label "Group for the Technical Coordination and Defence of Indipendent Groups and Members", - "TGI", - "Technical Coordination and Defence of Independent Groups and Members", - "Technical Group of Independent Members - mixed group" ; - lpv:acronym "TGI" ; - lpv:featuredRoleDescriptions "Group for the Technical Coordination and Defence of Indipendent Groups and Members - Chair", - "Group for the Technical Coordination and Defence of Indipendent Groups and Members - Member", - "Group for the Technical Coordination and Defence of Indipendent Groups and Members - Member of the Bureau", - "Group for the Technical Coordination and Defence of Indipendent Groups and Members - Vice-Chair", - "Technical Coordination and Defence of Independent Groups and Members - Member", - "Technical Coordination and Defence of Independent Groups and Members - Member of the Bureau", - "Technical Group of Independent Members - mixed group - Co-Chair", - "Technical Group of Independent Members - mixed group - Member", - "Technical Group of Independent Members - mixed group - Member of the Bureau", - "Technical Group of Independent Members - mixed group - Treasurer" . 
- - a lpv:EUParty ; - rdfs:label "ECR", - "European Conservatives and Reformists Group" ; - lpv:acronym "ECR" ; - lpv:featuredRoleDescriptions "European Conservatives and Reformists Group -", - "European Conservatives and Reformists Group - Chair", - "European Conservatives and Reformists Group - Co-treasurer", - "European Conservatives and Reformists Group - Member", - "European Conservatives and Reformists Group - Member of the Bureau", - "European Conservatives and Reformists Group - Treasurer", - "European Conservatives and Reformists Group - Vice-Chair" . - - a lpv:EUParty ; - rdfs:label "NA", - "Non-attached", - "Non-attached Members" ; - lpv:acronym "NA" ; - lpv:featuredRoleDescriptions "Non-attached -", - "Non-attached - Member", - "Non-attached Members -", - "Non-attached Members - Member" . - From 900c077ff5d72aefb6a6d43057284a4d9eda56a0 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 14 Jan 2026 17:11:49 +0100 Subject: [PATCH 22/39] update debugger conf --- .vscode/launch.json | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index d6dbcf45c..77e9b0cc4 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -6,7 +6,7 @@ "configurations": [ { "name": "django: runserver", - "type": "python", + "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/backend/manage.py", "args": ["runserver"], @@ -15,7 +15,7 @@ }, { "name": "django: shell", - "type": "python", + "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/backend/manage.py", "args": ["shell"], @@ -24,16 +24,17 @@ }, { "name": "django: index", - "type": "python", + "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/backend/manage.py", - "args": ["index", "${input:corpusName}"], + "args": ["index", "${input:corpusName}", "--delete"], "django": true, - "justMyCode": true + "justMyCode": true, + "console": "internalConsole" }, { "name": "django: loadcorpora", - 
"type": "python", + "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/backend/manage.py", "args": ["loadcorpora"], From 17784019ede05521263e97a11a8bc6e9bc92de7d Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 14 Jan 2026 17:19:48 +0100 Subject: [PATCH 23/39] small fixes --- backend/corpora/parliament/euparl.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 9ed61a642..44ec3f99c 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -196,11 +196,12 @@ def api_get_preflabel(url: str) -> Optional[str]: if response.status_code != 200: return None soup = BeautifulSoup(response.content, 'lxml-xml') - return soup.find('skos:preflabel', {'xml:lang': 'en'}).text + return soup.find('skos:prefLabel', {'xml:lang': 'en'}).text + @cache -def api_get_speaker_info(participant: str) -> dict: +def api_get_speaker_info(participant: str) -> Optional[Dict]: '''Query metadata about the speaker, unless it's already been queried before''' speaker_id = api_get_speaker_id(participant) speaker_url = _api_url(f'meps/{speaker_id}') From 620b00ca47e3e3be7caa2d30c6955f2312c22126 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 22 Jan 2026 12:58:12 +0100 Subject: [PATCH 24/39] make max date configurable --- backend/corpora/parliament/euparl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 44ec3f99c..6ceea6f42 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -39,7 +39,7 @@ class ParliamentEurope(Parliament): description_page = 'euparl.md' image = 'euparl.jpeg' min_date = datetime(year=1999, month=7, day=20) - max_date = datetime.now() + max_date = getattr(settings, 'PP_EUPARL_MAX_DATE', datetime.now()) language_field = 'original_language_code' @property From 
8c9fcaeb6b8612f41af0883106cf3dc523ee5473 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 22 Jan 2026 13:23:02 +0100 Subject: [PATCH 25/39] update documentation --- .../corpora/parliament/description/euparl.md | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/backend/corpora/parliament/description/euparl.md b/backend/corpora/parliament/description/euparl.md index 4e592ed94..87984467b 100644 --- a/backend/corpora/parliament/description/euparl.md +++ b/backend/corpora/parliament/description/euparl.md @@ -1,5 +1,31 @@ +Debates from the European Parliament (EP). As the parliament of the European Union, the EP has representatives from all member states. Members are organised into political groups, which are broad alliances of national parties. + +## Source data + +The European Parliament corpus in People & Parliament is based on two datasets: [EUPDCorp](https://doi.org/10.5281/zenodo.15056399) (CC-BY 4.0 International licence) is used for debates from 1999 to July 2024. + +Debates from July 2024 to January 2026 are sourced from the [European Parliament Open Data API](https://data.europarl.europa.eu/en/developer-corner/opendata-api) (CC-BY 4.0 International licence). + +**References:** + +- Mochtak, Michal (2025): Corpus of the EU Parliament Debates (EUPDCorp), 1999-2024, Zenodo, v1.0, https://doi.org/10.5281/zenodo.15056399 + The debates from the European Parliament, in English (translation), as provided by the [Talk of Europe](https://ssh.datastations.nl/dataset.xhtml?persistentId=doi:10.17026/dans-x62-ew3m&version=1.0) dataset. The dataset covers debates from July 1999 to July 2017. +## Notes + +### Language and translations + +Speakers in the European Parliament use a large number of different languages. The documents in this corpus include both the original speech, and an English translation. + +Translations are taken from the source data. Translations in the EUPDCorp dataset are machine-translated. 
+ +### Parties + +The _party_ field specifies the political group of the speaker in the European Parliament. In addition, the _national party_ field specifies the speaker's party at the national level. + +Because the data is extracted from multiple datasets, it is possible that the corpus uses a different name for the same political group before and after 2024. + ### Image attribution The image used for this corpus was created by Ash Crow ([image source](https://commons.wikimedia.org/wiki/File:European_Parliament_-_Hemicycle.jpg)) and is licensed under a [Creative Commons Attribution-Share Alike 3.0 licence](https://creativecommons.org/licenses/by-sa/3.0/deed.en). From 4cafa496fc4362b5b67d585a07dc35d1b4b95f08 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 22 Jan 2026 13:40:03 +0100 Subject: [PATCH 26/39] fix nan values --- backend/corpora/parliament/euparl.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 6ceea6f42..65eff9cf9 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -4,6 +4,8 @@ import os from typing import Optional, Dict, List, Tuple, Union from urllib import parse +import math +import numbers from bs4 import BeautifulSoup from django.conf import settings @@ -541,7 +543,7 @@ def iterate_data(self, data: Dict, metadata): def _to_int(value) -> Optional[int]: - if value or value == 0: + if value or value == 0 and not math.isnan(value): return int(value) def _format_name(values) -> str: @@ -549,6 +551,17 @@ def _format_name(values) -> str: value for value in filter(None, values) ) +def _nan_to_none(value): + if not isinstance(value, numbers.Number) or not math.isnan(value): + return value + +def _format_gender(value): + if value == 1: + return 'Male' + if value == 0: + return 'Female' + + class EUPDCorpReader(RDSReader): data_directory = settings.PP_EUPARL_DATA @@ -583,15 
+596,15 @@ def sources(self, **kwargs): ), Field( name='party', - extractor=CSV('epg_short'), + extractor=CSV('epg_short', transform=_nan_to_none), ), Field( name='party_full', - extractor=CSV('epg_long'), + extractor=CSV('epg_long', transform=_nan_to_none), ), Field( name='party_national', - extractor=CSV('party_name'), + extractor=CSV('party_name', transform=_nan_to_none), ), Field( name='sequence', @@ -622,10 +635,7 @@ def sources(self, **kwargs): ), Field( name='speaker_gender', - extractor=CSV( - 'gender', - transform=lambda value: 'Male' if value else 'Female', - ) + extractor=CSV('gender', transform=_format_gender) ), Field( name='speaker_birth_year', @@ -633,7 +643,7 @@ def sources(self, **kwargs): ), Field( name='speaker_country', - extractor=CSV('nationality'), + extractor=CSV('nationality', transform=_nan_to_none), ), Field( name='source_archive', From f0c03a94576fc96b0eb582802dfee18de8f10b1e Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 22 Jan 2026 14:10:33 +0100 Subject: [PATCH 27/39] fix operator order --- backend/corpora/parliament/euparl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 65eff9cf9..07dce59e7 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -543,7 +543,7 @@ def iterate_data(self, data: Dict, metadata): def _to_int(value) -> Optional[int]: - if value or value == 0 and not math.isnan(value): + if (value or value == 0) and not math.isnan(value): return int(value) def _format_name(values) -> str: From 5eec0a5dc5b2d85a08925dd74321900a5d5134c9 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 22 Jan 2026 14:57:40 +0100 Subject: [PATCH 28/39] fix mapping for source archive field --- backend/corpora/parliament/euparl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 07dce59e7..93525380c 
100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -127,6 +127,7 @@ def source2dicts(self, source, **kwargs): name='source_archive', display_name='Source archive', description='Source dataset for this document', + es_mapping=keyword_mapping(), ) def __init__(self): From 7147f45d03754f94c5c59ec7b0a06c31b43570fd Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 22 Jan 2026 15:18:10 +0100 Subject: [PATCH 29/39] update subcorpus dates --- backend/corpora/parliament/euparl.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 93525380c..485f84f26 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -338,11 +338,12 @@ def _apply(self, data: Union[str, dict], key_index: int = 0, **kwargs): class ParliamentEuropeFromAPI(JSONReader): """ - Speeches of the European parliament, originally in or translated to English, - provided through the Europarl Open Data API + Reader to extract speeches from the Europarl Open Data API + + Extracts from 9/2/2024 until the present. """ - min_date = datetime(year=2024, month=7, day=7) + min_date = datetime(year=2024, month=2, day=9) max_date = datetime.now() record_path = ['data', 'recorded_in_a_realization_of'] @@ -564,6 +565,10 @@ def _format_gender(value): class EUPDCorpReader(RDSReader): + ''' + Reader for the EUPDCorp dataset. 
Contains debates from 20/7/1999 to 8/2/2024 + ''' + data_directory = settings.PP_EUPARL_DATA def sources(self, **kwargs): From cd855f55566b7313b9334c32e9f24e5240c7a669 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 22 Jan 2026 15:27:41 +0100 Subject: [PATCH 30/39] field presentation --- backend/corpora/parliament/euparl.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 485f84f26..45bc42d8c 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -80,6 +80,8 @@ def source2dicts(self, source, **kwargs): name='party_national', display_name='National party', es_mapping=keyword_mapping(enable_full_text_search=True), + search_filter=MultipleChoiceFilter(), + visualizations=['resultscount', 'termfrequency'], ) sequence = field_defaults.sequence() original_language = field_defaults.language() @@ -118,6 +120,7 @@ def source2dicts(self, source, **kwargs): display_name='Original speech', description='Speech in the original language', es_mapping=main_content_mapping(), + search_field_core=True, display_type='text_content', language='dynamic', ) @@ -135,8 +138,6 @@ def __init__(self): self.date, self.debate_id, self.debate_title, - self.original_language, - self.original_language_code, self.party, self.party_full, self.party_id, @@ -149,6 +150,8 @@ def __init__(self): self.speaker_id, self.speech, self.speech_original, + self.original_language, + self.original_language_code, self.speech_id, self.source_archive, ] From d4941d32bb8bef20114dc656fe2a10806dbd6083 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Mon, 26 Jan 2026 14:16:22 +0100 Subject: [PATCH 31/39] fix missing keys error --- backend/corpora/parliament/euparl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 45bc42d8c..ea095919f 100644 --- 
a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -368,7 +368,7 @@ def sources(self, **kwargs): ) if response.status_code != 200: continue - meeting_data = response.json().get('data') + meeting_data = response.json().get('data', []) metadata = {'date': formatted_date} for event in meeting_data: if event.get("had_activity_type") != "def/ep-activities/PLENARY_DEBATE": @@ -378,7 +378,7 @@ def sources(self, **kwargs): sequence_in_debate = 0 - for speech in event.get('consists_of'): + for speech in event.get('consists_of', []): speech_id = speech.split("/")[-1] speech_url = _api_url(f'speeches/{speech_id}', {'include-output': 'xml_fragment'}) speech_response = requests.get(speech_url) From c615787434ea18aebc55480b92eed92ffdb581a8 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Tue, 27 Jan 2026 15:41:57 +0100 Subject: [PATCH 32/39] add log statement --- backend/corpora/parliament/euparl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index ea095919f..4b9c75052 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -362,6 +362,7 @@ def sources(self, **kwargs): formatted_date = date.strftime('%Y-%m-%d') meeting_id = f'MTG-PL-{formatted_date}' meeting_url = _api_url(f'meetings/{meeting_id}/activities') + logger.info(f'Meeting URL: {meeting_url}') response = requests.get( meeting_url, headers={'accept': 'application/ld+json'}, From d7fbea410cefa36f3a4fc22e1de262da7e6785f3 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 28 Jan 2026 12:25:06 +0100 Subject: [PATCH 33/39] get sequence from data --- backend/corpora/parliament/conftest.py | 2 +- backend/corpora/parliament/euparl.py | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/backend/corpora/parliament/conftest.py b/backend/corpora/parliament/conftest.py index fa23aee59..0ddc5ec89 100644 --- 
a/backend/corpora/parliament/conftest.py +++ b/backend/corpora/parliament/conftest.py @@ -731,7 +731,7 @@ def parliament_corpora_settings(settings): 'party_national': None, "party_id": "7037", 'original_language': "English", - "sequence": 1, + "sequence": 321, "speaker": "Roberts Zīle", "speaker_country": "Latvia", 'speaker_gender': 'Male', diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 4b9c75052..1fa8a92eb 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -377,16 +377,12 @@ def sources(self, **kwargs): metadata['debate_id'] = event.get('activity_id') metadata['debate_title'] = event.get('activity_label').get('en') - sequence_in_debate = 0 - for speech in event.get('consists_of', []): speech_id = speech.split("/")[-1] speech_url = _api_url(f'speeches/{speech_id}', {'include-output': 'xml_fragment'}) speech_response = requests.get(speech_url) if speech_response.status_code != 200: continue - sequence_in_debate += 1 - metadata['sequence'] = sequence_in_debate yield speech_response, metadata def iterate_data(self, data: Dict, metadata): @@ -461,7 +457,7 @@ def iterate_data(self, data: Dict, metadata): ), Field( name='sequence', - extractor=Metadata('sequence') + extractor=JSON('numbering') ), Field( name='original_language', From a9c556af6cec7b6d5d9b24aa2d985998363d2784 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 28 Jan 2026 15:43:19 +0100 Subject: [PATCH 34/39] updated speech loop --- backend/corpora/parliament/conftest.py | 2 +- backend/corpora/parliament/euparl.py | 100 ++++++++++++++++--------- 2 files changed, 66 insertions(+), 36 deletions(-) diff --git a/backend/corpora/parliament/conftest.py b/backend/corpora/parliament/conftest.py index 0ddc5ec89..c71476e91 100644 --- a/backend/corpora/parliament/conftest.py +++ b/backend/corpora/parliament/conftest.py @@ -724,7 +724,7 @@ def parliament_corpora_settings(settings): { "date": "2024-11-13", "debate_id": 
"MTG-PL-2024-11-13-PVCRE-ITM-17", - "debate_title": "17. Fight against money laundering and terrorist financing: listing Russia as a high-risk third country in the EU (debate)", + "debate_title": "Fight against money laundering and terrorist financing: listing Russia as a high-risk third country in the EU (debate)", "id": "MTG-PL-2024-11-13-OTH-2017005042457", "party": 'ECR', 'party_full': 'European Conservatives and Reformists', diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 1fa8a92eb..94fa79b4a 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -322,11 +322,19 @@ def _api_get_original_speech(data): _, code = _api_get_language_data(data['originalLanguage'][0]) return data.get(_api_speech_key(code)) +def _api_get_meeting_id(value: List[str]): + path = first(value) + if path: + return path.split('/')[-1] def first(values): if len(values): return values[0] +def parse_date(value): + return value + + class _JSON(JSON): ''' Edited JSON extractor that also accepts 0 keys to return the object as-is @@ -353,37 +361,53 @@ class ParliamentEuropeFromAPI(JSONReader): meta = [ ['data', 'had_participation', 'had_participant_person'], ['data', 'activity_id'], + ['data', 'activity_date'], + ['data', 'activity_label'], + ['data', 'inverse_consists_of'], ] def sources(self, **kwargs): - date = self.min_date - while date < self.max_date: - date += timedelta(days=1) - formatted_date = date.strftime('%Y-%m-%d') - meeting_id = f'MTG-PL-{formatted_date}' - meeting_url = _api_url(f'meetings/{meeting_id}/activities') - logger.info(f'Meeting URL: {meeting_url}') - response = requests.get( - meeting_url, - headers={'accept': 'application/ld+json'}, - ) - if response.status_code != 200: - continue - meeting_data = response.json().get('data', []) - metadata = {'date': formatted_date} - for event in meeting_data: - if event.get("had_activity_type") != "def/ep-activities/PLENARY_DEBATE": - continue - 
metadata['debate_id'] = event.get('activity_id') - metadata['debate_title'] = event.get('activity_label').get('en') - - for speech in event.get('consists_of', []): - speech_id = speech.split("/")[-1] - speech_url = _api_url(f'speeches/{speech_id}', {'include-output': 'xml_fragment'}) - speech_response = requests.get(speech_url) - if speech_response.status_code != 200: - continue - yield speech_response, metadata + start_date = self.min_date + end_date = self.max_date + + format_date = lambda date: date.strftime('%Y-%m-%d') + + offset = 0 + limit = 50 + done = False + + while not done: + url = _api_url('speeches', { + 'activity-type': 'PLENARY_DEBATE_SPEECH', + 'sitting-date': format_date(start_date), + 'sitting-date-end': format_date(end_date), + 'include-output': 'xml_fragment', + 'sort-by': 'sitting-date:asc,numbering:asc', + 'offset': offset, + 'limit': limit, + }) + logger.info(url) + response = requests.get(url) + data = response.json() + total = data.get('meta').get('total') + + if total == 10000: + # if results are capped, use a shorter range + end_date -= (end_date - start_date) / 2 + else: + yield response, {} + + if offset + limit < total: + # if we're not through results yet, move to the next page + offset += limit + elif end_date < self.max_date: + # if we're through the results and the timeframe was limited, move it up + start_date = end_date + timedelta(days=1) + end_date = self.max_date + offset = 0 + else: + done = True + def iterate_data(self, data: Dict, metadata): speeches_with_speaker = [ @@ -398,18 +422,24 @@ def iterate_data(self, data: Dict, metadata): ] return filtered_records + + _date_extractor = JSON( + 'data.activity_date', + transform=parse_date + ) + fields = [ Field( name='debate_id', - extractor=Metadata('debate_id'), + extractor=JSON('data.inverse_consists_of', transform=_api_get_meeting_id), ), Field( name='debate_title', - extractor=Metadata('debate_title'), + extractor=JSON('data.activity_label', 'en'), ), Field( name='date', - 
extractor=Metadata('date') + extractor=_date_extractor ), Field( name='party', @@ -418,7 +448,7 @@ def iterate_data(self, data: Dict, metadata): "data.had_participation.had_participant_person", transform=first, ), - Metadata('date'), + _date_extractor, transform=api_get_party_name, ) ), @@ -429,7 +459,7 @@ def iterate_data(self, data: Dict, metadata): "data.had_participation.had_participant_person", transform=first, ), - Metadata('date'), + _date_extractor, transform=_api_get_party_full_name, ) ), @@ -440,7 +470,7 @@ def iterate_data(self, data: Dict, metadata): "data.had_participation.had_participant_person", transform=first ), - Metadata('date'), + _date_extractor, transform=api_get_party_id, ) ), @@ -451,7 +481,7 @@ def iterate_data(self, data: Dict, metadata): "data.had_participation.had_participant_person", transform=first ), - Metadata('date'), + _date_extractor, transform=_api_get_national_party_name, ) ), From 21d02fad58346f90105e6e8d6e00b5d231de5c52 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 28 Jan 2026 16:15:55 +0100 Subject: [PATCH 35/39] handle empty debate ids + add comments/docstrings --- backend/corpora/parliament/euparl.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 94fa79b4a..5e48a60c7 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -351,7 +351,10 @@ class ParliamentEuropeFromAPI(JSONReader): """ Reader to extract speeches from the Europarl Open Data API - Extracts from 9/2/2024 until the present. + See https://data.europarl.europa.eu/en/developer-corner/opendata-api for API + documentation. + + Extracts from 9/2/2024 (the end of EUPDCorp) until the present. 
""" min_date = datetime(year=2024, month=2, day=9) @@ -367,6 +370,10 @@ class ParliamentEuropeFromAPI(JSONReader): ] def sources(self, **kwargs): + # Uses the /speeches endpoint to request paginated speeches in the date range. + # Limitation is that the iteration is capped at 10.000 speeches, so the interval + # is dynamically broken up into smaller time intervals as neede. + start_date = self.min_date end_date = self.max_date @@ -410,12 +417,19 @@ def sources(self, **kwargs): def iterate_data(self, data: Dict, metadata): - speeches_with_speaker = [ - item for item in data['data'] + # overrides the JSON data iterator to do some manipulations on the data + # to ensure validity + + # add empty debate ID if key does not exist... + ensure_debate_id = lambda item: { 'inverse_consists_of': [] } | item + # ... & filter speeches without speaker metadata + complete_speeches = [ + ensure_debate_id(item) for item in data['data'] if 'had_participation' in item ] - filtered_data = data | { 'data': speeches_with_speaker} + filtered_data = data | {'data': complete_speeches} records = list(super().iterate_data(filtered_data, metadata)) + # filter records without transcription data filtered_records = [ record for record in records if record['data'].get(_api_speech_key('en')) From b26670ada0bb4529ced25ca51e306cf3f8c33c0c Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 28 Jan 2026 16:45:21 +0100 Subject: [PATCH 36/39] code clarity --- backend/corpora/parliament/euparl.py | 79 ++++++++++++---------------- 1 file changed, 34 insertions(+), 45 deletions(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index 5e48a60c7..f23489a55 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -331,8 +331,8 @@ def first(values): if len(values): return values[0] -def parse_date(value): - return value +def _api_format_date(value: datetime): + return value.strftime('%Y-%m-%d') class _JSON(JSON): @@ -369,51 
+369,41 @@ class ParliamentEuropeFromAPI(JSONReader): ['data', 'inverse_consists_of'], ] - def sources(self, **kwargs): + def sources(self, start_date=min_date, end_date=max_date, offset=0, **kwargs): # Uses the /speeches endpoint to request paginated speeches in the date range. - # Limitation is that the iteration is capped at 10.000 speeches, so the interval - # is dynamically broken up into smaller time intervals as neede. - - start_date = self.min_date - end_date = self.max_date - - format_date = lambda date: date.strftime('%Y-%m-%d') + # Limitation is that the iteration is capped at 10.000 speeches, so the time + # interval is dynamically broken up into smaller time intervals as needed. + # Implicitly assumes that start_date < end_date, and that there are never more + # than 10.000 speeches in a single day. - offset = 0 limit = 50 - done = False - - while not done: - url = _api_url('speeches', { - 'activity-type': 'PLENARY_DEBATE_SPEECH', - 'sitting-date': format_date(start_date), - 'sitting-date-end': format_date(end_date), - 'include-output': 'xml_fragment', - 'sort-by': 'sitting-date:asc,numbering:asc', - 'offset': offset, - 'limit': limit, - }) - logger.info(url) - response = requests.get(url) - data = response.json() - total = data.get('meta').get('total') - - if total == 10000: - # if results are capped, use a shorter range - end_date -= (end_date - start_date) / 2 - else: - yield response, {} - - if offset + limit < total: - # if we're not through results yet, move to the next page - offset += limit - elif end_date < self.max_date: - # if we're through the results and the timeframe was limited, move it up - start_date = end_date + timedelta(days=1) - end_date = self.max_date - offset = 0 - else: - done = True + url = _api_url('speeches', { + 'activity-type': 'PLENARY_DEBATE_SPEECH', + 'sitting-date': _api_format_date(start_date), + 'sitting-date-end': _api_format_date(end_date), + 'include-output': 'xml_fragment', + 'sort-by': 
'sitting-date:asc,numbering:asc', + 'offset': offset, + 'limit': limit, + }) + logger.info(url) + response = requests.get(url) + data = response.json() + total = data.get('meta').get('total') + + if total == 10000: + # if results are capped, use a shorter range + split_end = end_date - ((end_date - start_date) / 2) + yield from self.sources(start_date, split_end, 0) + split_start = end_date + timedelta(days=1) + yield from self.sources(split_start, end_date) + else: + yield response, {} + + if offset + limit < total: + # if we're not through results yet, move to the next page + next_offset = offset + limit + yield from self.sources(start_date, end_date, next_offset) def iterate_data(self, data: Dict, metadata): @@ -439,7 +429,6 @@ def iterate_data(self, data: Dict, metadata): _date_extractor = JSON( 'data.activity_date', - transform=parse_date ) fields = [ From d110125135d2c6257ec9e8929831ff086b24ad28 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 28 Jan 2026 17:05:18 +0100 Subject: [PATCH 37/39] update documentation --- backend/corpora/parliament/description/euparl.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/backend/corpora/parliament/description/euparl.md b/backend/corpora/parliament/description/euparl.md index 87984467b..1a5fc6294 100644 --- a/backend/corpora/parliament/description/euparl.md +++ b/backend/corpora/parliament/description/euparl.md @@ -2,29 +2,27 @@ Debates from the European Parliament (EP). As the parliament of the European Uni ## Source data -The European Parliament corpus in People & Parliament is based two datasets: [EUPDCorp](https://doi.org/10.5281/zenodo.15056399) (CC-BY 4.0 International licence) is used for debates from 1999 to July 2024. +The European Parliament corpus in People & Parliament is based two datasets: [EUPDCorp](https://doi.org/10.5281/zenodo.15056399) (CC-BY 4.0 International licence) is used for debates from 1999 to February 2024 (terms 5-9). 
-Debates from July 2024 to January 2026 are sourced from the [European Parliament Open Data API](https://data.europarl.europa.eu/en/developer-corner/opendata-api) (CC-BY 4.0 International licence). +Debates from February 2024 to January 2026 are sourced from the [European Parliament Open Data API](https://data.europarl.europa.eu/en/developer-corner/opendata-api) (CC-BY 4.0 International licence). **References:** - Mochtak, Michal (2025): Corpus of the EU Parliament Debates (EUPDCorp), 1999-2024, Zenodo, v1.0, https://doi.org/10.5281/zenodo.15056399 -The debates from the European Parliament, in English (translation), as provided by the [Talk of Europe](https://ssh.datastations.nl/dataset.xhtml?persistentId=doi:10.17026/dans-x62-ew3m&version=1.0) dataset. The dataset covers debates from July 1999 to July 2017. - ## Notes ### Language and translations Speakers in the European Parliament use a large number of different languages. The documents in the this corpus include both the original speech, and an English translation. -Translations are taken from the source data. Translations in the EUPDCorp dataset are machine-translated. +Translations are taken from the source datasets (see above). Translations in the EUPDCorp dataset are machine-translated. ### Parties The _party_ field specifies the political group of the speaker in the European Parliament. In addition, the _national party_ field specifies the speaker's party at the national level. -Because the data is extracted from multiple datasets, it is possible that the corpus uses a different name for the same political group before and after 2024. +Because the data is extracted from multiple datasets, the corpus may use a different name for the same political group before and after 2024. 
### Image attribution From b4357658e7640c3e6fc67e96e83908d1f6ffbda3 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 28 Jan 2026 17:06:17 +0100 Subject: [PATCH 38/39] remove unused test file --- .../data/euparl/api/MeetingResponse.json | 58 ------------------- 1 file changed, 58 deletions(-) delete mode 100644 backend/corpora/parliament/tests/data/euparl/api/MeetingResponse.json diff --git a/backend/corpora/parliament/tests/data/euparl/api/MeetingResponse.json b/backend/corpora/parliament/tests/data/euparl/api/MeetingResponse.json deleted file mode 100644 index 313e8391b..000000000 --- a/backend/corpora/parliament/tests/data/euparl/api/MeetingResponse.json +++ /dev/null @@ -1,58 +0,0 @@ -{ - "data": [ - { - "id": "eli/dl/event/MTG-PL-2024-11-13-PVCRE-ITM-17", - "type": "Activity", - "activity_date": "2024-11-13", - "activity_id": "MTG-PL-2024-11-13-PVCRE-ITM-17", - "activity_label": { - "pt": "17. Combate ao branqueamento de capitais e ao financiamento do terrorismo: aditamento da Rússia à lista de países terceiros de risco elevado da UE (debate)", - "mt": "17. Il-ġlieda kontra l-ħasil tal-flus u l-finanzjament tat-terroriżmu: l-elenkar tar-Russja bħala pajjiż terz b’riskju għoli fl-UE (dibattitu)", - "es": "17. Lucha contra el blanqueo de capitales y la financiación del terrorismo: inclusión de Rusia en la lista de la Unión de terceros países de alto riesgo (debate)", - "hr": "17. Borba protiv pranja novca i financiranja terorizma: uvrštavanje Rusije na popis visokorizičnih trećih zemalja u EU-u (rasprava)", - "fi": "17. Rahanpesun ja terrorismin rahoituksen torjuminen: Venäjän luokitteleminen EU:ssa suuririskiseksi kolmanneksi maaksi (keskustelu)", - "et": "17. Rahapesu ja terrorismi rahastamise vastane võitlus: Venemaa kandmine suure riskiga kolmandate riikide ELi loetellu (arutelu)", - "pl": "17. Walka z praniem pieniędzy i finansowaniem terroryzmu: wpisanie Rosji na listę państw trzecich wysokiego ryzyka w UE (debata)", - "sv": "17. 
Bekämpning av penningtvätt och finansiering av terrorism: uppförande av Ryssland på förteckningen över högrisktredjeländer i EU (debatt)", - "cs": "17. Boj proti praní peněz a financování terorismu: zařazení Ruska na seznam vysoce rizikových třetích zemí pro EU (rozprava)", - "de": "17. Bekämpfung von Geldwäsche und Terrorismusfinanzierung: Aufnahme Russlands in die Liste der Drittländer mit hohem Risiko (Aussprache)", - "ga": "17. An comhrac i gcoinne sciúradh airgid agus maoiniú sceimhlitheoireachta: an Rúis a liostú san Aontas mar thríú tír ardriosca (díospóireacht)", - "el": "17. Καταπολέμηση της νομιμοποίησης εσόδων από παράνομες δραστηριότητες και της χρηματοδότησης της τρομοκρατίας: καταχώριση της Ρωσίας ως τρίτης χώρας υψηλού κινδύνου στην ΕΕ (συζήτηση)", - "sl": "17. Boj proti pranju denarja in financiranju terorizma: uvrstitev Rusije na seznam EU tretjih držav z visokim tveganjem (razprava)", - "sk": "17. Boj proti praniu špinavých peňazí a financovaniu terorizmu: zaradenie Ruska do zoznamu vysokorizikových tretích krajín pre EÚ (rozprava)", - "it": "17. Lotta al riciclaggio e al finanziamento del terrorismo: inserire la Russia nell'elenco dei paesi terzi ad alto rischio dell'UE (discussione)", - "fr": "17. Lutte contre le blanchiment de capitaux et le financement du terrorisme: inscription de la Russie sur la liste des pays tiers à haut risque de l'UE (débat)", - "ro": "17. Combaterea spălării banilor și a finanțării terorismului: includerea Rusiei printre țările terțe cu grad de risc ridicat pentru UE (dezbatere)", - "nl": "17. Strijd tegen witwassen en terrorismefinanciering: opname van Rusland in de EU-lijst van derde landen met een hoog risico (debat)", - "lt": "17. Kova su pinigų plovimu ir terorizmo finansavimu: Rusijos įtraukimas į ES sudarytą didelės rizikos trečiųjų valstybių sąrašą (diskusijos)", - "hu": "17. 
A pénzmosás és a terrorizmus finanszírozása elleni küzdelem: Oroszország kiemelt kockázatot jelentő harmadik országként való uniós jegyzékbe vétele (vita)", - "bg": "17. Борба с изпирането на пари и финансирането на тероризма: включване на Русия в списъка на ЕС на високорисковите трети страни (разискване)", - "lv": "17. Cīņa pret nelikumīgi iegūtu līdzekļu legalizāciju un terorisma finansēšanu: Krievijas iekļaušana ES izveidotajā augsta riska trešo valstu sarakstā (debates)", - "da": "17. Bekæmpelse af hvidvask af penge og af finansiering af terrorisme: opførelse af Rusland på listen over højrisikotredjelande i EU (forhandling)", - "en": "17. Fight against money laundering and terrorist financing: listing Russia as a high-risk third country in the EU (debate)" - }, - "consists_of": [ - "eli/dl/event/MTG-PL-2024-11-13-OTH-2017005042457" - ], - "executed": [ - "eli/dl/event/MTG-PL-2024-11-13-OJ-ITM-D-16" - ], - "had_activity_type": "def/ep-activities/PLENARY_DEBATE", - "recorded_in_a_realization_of": [ - "eli/dl/doc/PV-10-2024-11-13-ITM-017", - "eli/dl/doc/CRE-10-2024-11-13-ITM-017" - ], - "inverse_consists_of": [ - "eli/dl/event/MTG-PL-2024-11-13", - "eli/dl/proc/2024-2905" - ] - } - ], - "@context": [ - { - "data": "@graph", - "@base": "https://data.europarl.europa.eu/" - }, - "https://data.europarl.europa.eu/api/v2/context.jsonld" - ] -} From a10ad0816e8c6aaf883383ecf3dff83101e5d43e Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Wed, 28 Jan 2026 18:45:23 +0100 Subject: [PATCH 39/39] fix data splicing --- backend/corpora/parliament/euparl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/corpora/parliament/euparl.py b/backend/corpora/parliament/euparl.py index f23489a55..b70e54dda 100644 --- a/backend/corpora/parliament/euparl.py +++ b/backend/corpora/parliament/euparl.py @@ -395,7 +395,7 @@ def sources(self, start_date=min_date, end_date=max_date, offset=0, **kwargs): # if results are capped, use a shorter range split_end = 
end_date - ((end_date - start_date) / 2) yield from self.sources(start_date, split_end, 0) - split_start = end_date + timedelta(days=1) + split_start = split_end + timedelta(days=1) yield from self.sources(split_start, end_date) else: yield response, {}