diff --git a/.vscode/launch.json b/.vscode/launch.json index d6dbcf45c..7bdde5702 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -6,7 +6,7 @@ "configurations": [ { "name": "django: runserver", - "type": "python", + "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/backend/manage.py", "args": ["runserver"], @@ -15,7 +15,7 @@ }, { "name": "django: shell", - "type": "python", + "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/backend/manage.py", "args": ["shell"], @@ -24,10 +24,19 @@ }, { "name": "django: index", - "type": "python", + "type": "debugpy", "request": "launch", "program": "${workspaceFolder}/backend/manage.py", - "args": ["index", "${input:corpusName}"], + "args": ["index", "${input:corpusName}", "-d"], + "django": true, + "justMyCode": true + }, + { + "name": "django: loadcorpora", + "type": "debugpy", + "request": "launch", + "program": "${workspaceFolder}/backend/manage.py", + "args": ["loadcorpora"], "django": true, "justMyCode": true }, diff --git a/backend/addcorpus/es_settings.py b/backend/addcorpus/es_settings.py index 690920c6c..c46bc9f7d 100644 --- a/backend/addcorpus/es_settings.py +++ b/backend/addcorpus/es_settings.py @@ -23,32 +23,42 @@ def get_language_key(language_code): return Language.make(standardize_tag(language_code)).display_name().lower() -def _stopwords_directory() -> str: - stopwords_dir = os.path.join(settings.NLTK_DATA_PATH, 'corpora', 'stopwords') - if not os.path.exists(stopwords_dir): +def _nltk_stopwords_directory() -> str: + nltk_stopwords_dir = os.path.join(settings.NLTK_DATA_PATH, 'corpora', 'stopwords') + if not os.path.exists(nltk_stopwords_dir): nltk.download('stopwords', settings.NLTK_DATA_PATH) - return stopwords_dir + return nltk_stopwords_dir -def _stopwords_path(language_code: str): - dir = _stopwords_directory() +def _nltk_stopwords_path(language_code: str): + dir = _nltk_stopwords_directory() + language = get_language_key(language_code) + return 
os.path.join(dir, language) + +def _supplementary_path(language_code: str): + dir = os.path.join(settings.BASE_DIR, 'addcorpus', 'stopword_data', 'supplementary_data') language = get_language_key(language_code) return os.path.join(dir, language) def stopwords_available(language_code: str) -> bool: if not language_code: return False - path = _stopwords_path(language_code) - return os.path.exists(path) - -def get_nltk_stopwords(language_code): - path = _stopwords_path(language_code) - - if os.path.exists(path): - with open(path) as infile: + nltk_path = _nltk_stopwords_path(language_code) + supplementary_path = _supplementary_path(language_code) + return True if (os.path.exists(nltk_path) or os.path.exists(supplementary_path)) else False + +def get_stopwords(language_code): + nltk_path = _nltk_stopwords_path(language_code) + supplementary_path = _supplementary_path(language_code) + if os.path.exists(nltk_path): + with open(nltk_path) as infile: + words = [line.strip() for line in infile.readlines()] + return words + elif os.path.exists(supplementary_path): + with open(supplementary_path) as infile: words = [line.strip() for line in infile.readlines()] return words else: - raise NotImplementedError('language {} has no nltk stopwords list'.format(language_code)) + raise NotImplementedError('language {} has no stopwords list'.format(language_code)) def add_language_string(name, language): return '{}_{}'.format(name, language) if language else name @@ -87,6 +97,8 @@ def es_settings(languages=[], stopword_analysis=False, stemming_analysis=False): if stopword_analysis or stemming_analysis: if not set_stopword_filter(settings, add_language_string(stopword_filter_name, language), language): + warnings.warn('You specified `stopword_analysis=True`, but \ + there are no stopwords available for this language') continue # skip languages for which we do not have a stopword list if stopword_analysis: @@ -119,7 +131,7 @@ def number_filter(): def make_stopword_filter(language): try: 
- stopwords = get_nltk_stopwords(language) + stopwords = get_stopwords(language) return { "type": "stop", 'stopwords': stopwords diff --git a/backend/addcorpus/stopword_data/supplementary_data/README.md b/backend/addcorpus/stopword_data/supplementary_data/README.md new file mode 100644 index 000000000..96c30ea1d --- /dev/null +++ b/backend/addcorpus/stopword_data/supplementary_data/README.md @@ -0,0 +1,12 @@ +## Supplementary Data Sources +Source 1: For Bulgarian, Czech, Croatian, Galician, Latvian, and Ukrainian, stopword lists were downloaded from this [Github repository](https://github.com/negapedia/nltk/tree/master/corpora/stopwords), by [Marco Chilese](https://github.com/MarcoChilese). The stopword lists are a combination of nltk stopwords (where available) and stopwords from [ranks.nl](https://www.ranks.nl/stopwords/). They were downloaded on 2025-12-18. + +Source 2: For Bosnian stopwords, the following publication was used: Sead Jahić, & Jernej Vičič. (2023). Lists of stopwords, polarity shifters and AnAwords of Bosnian language [Data set]. Zenodo. https://doi.org/10.5281/zenodo.10373141 + +Source 3: For Estonian stopwords, the following Github repository was used: https://github.com/stopwords-iso/stopwords-et?tab=readme-ov-file. 
+ +Source 4: For Icelandic stopwords, the following Github repository was used: https://github.com/ViktorMS/stoppord/blob/master/stoppord.csv + +Source 5: For Serbian stopwords, the following Github repository was used: https://github.com/Xangis/extra-stopwords/blob/master/serbian + +Source 6: For Slovenian stopwords, the following Github repository was used: https://github.com/stopwords-iso/stopwords-sl/blob/master/raw/gh-stopwords-json-sl.txt diff --git a/backend/addcorpus/stopword_data/supplementary_data/bosnian b/backend/addcorpus/stopword_data/supplementary_data/bosnian new file mode 100644 index 000000000..f7078b4f7 --- /dev/null +++ b/backend/addcorpus/stopword_data/supplementary_data/bosnian @@ -0,0 +1,395 @@ +a +aha +ako +al +ali +apr +april +aug +august +avg +avgust +b +bez +bi +bil +bila +bili +biti +blizu +budem +budemo +budeš +budu +c +cjelina +crn +crno +č +čak +često +čet +četiri +četv +četvr +četvrtak +četvrto +čitav +ć +će +d +da +dakle +dalje +dan +danas +dapače +datum +dec +decembar +deset +deseti +desetih +devedeset +devet +deveti +devetica +devetina +devetsto +dnevno +dns +do +doduše +dok +donedavno +dotle +dva +dvaput +dvica +dvije +dvojica +dvojka +dž +đ +e +eto +evo +f +faktički +feb +febr +februar +g +gde +gdje +gđa +gđica +god +gosp +gospođa +gospođica +h +ha +hajati +halo +haman +hiljada +hoće +hoćemo +hoćes +hoću +i +iako +idr +igdje +ii +iii +ikad +ikada +ikako +iko +ikoji +ili +im +inače +ionako +ipak +isprva +isti +isto +itd +iv +ix +iz +iza +j +ja +jan +januar +je +jedan +jedinica +jedino +jedna +jednina +jedno +jednom +jer +jesam +jesi +jesmo +jeste +jesu +jučer +jul +juli +jun +juni +jutros +k +kad +kada +kako +kamo +kao +ko +koji +koliko +kome +kud +kuda +l +lj +m +maj +maltene +mar +mart +me +međutim +mene +mi +milijarda +milijun +milion +mjesec +mn +mnogi +mogu +moj +moram +moraš +možda +može +n +na +nadasve +naime +najzad +nako +nama +naoko +napolju +napose +naprimjer +naravno +nas +naši +nebi +ned +nedavno +nedjelja +negdje 
+neka +nekad +neki +neko +nešto +netom +nigdje +njegov +njen +njezin +njih +njihov +novemb +novembar +npr +nj +o +oba +obadvije +obično +oboje +oct +od +odakle +odatle +odmah +odnedavno +ok +oko +okt +oktobar +on +onaj +onda +one +oni +onomad +opće +opet +osam +osmi +osmica +osmina +ostali +otkad +otkako +otkuda +otuda +ovaj +ovako +ovamo +ovde +ovdje +p +pak +par +pedeset +pet +petak +peti +petica +petim +petnaest +pon +poned +ponedjeljak +ponekad +pošto +potom +pravo +prazan +pre +preko +pri +prije +pritom +prvi +prvo +puni +r +ranije +riječ +s +sad +sada +sam +sama +samo +sav +se +sebe +sedam +sedma +sedmica +sep +sept +septembar +shodno +si +sigurno +sinoć +smo +srijeda +ste +sto +stoga +stotina +str +su +sub +subota +sutra +svakako +svaki +sve +svejedno +svi +svijetlo +svima +svoj +svuda +š +šest +šesta +šesto +šta +štagod +što +štogod +štoviše +t +tad +tada +taj +taki +tako +takođe +također +takoreći +takve +tamo +te +tebi +tek +ti +tik +tj +tko +to +tobože +treća +treći +trenutno +tri +tu +tvoj +tzv +u +uime +umalo +umnogome +upravo +ustvari +uto +utoliko +utorak +uvijek +uzgred +uzmi +v +vako +vas +vaša +vazda +včrs +večeras +već +vi +vii +viii +x +y +z +za +zajedno +zapravo +zar +zasad +zašto +zatim +zato +zbog +ž \ No newline at end of file diff --git a/backend/addcorpus/stopword_data/supplementary_data/bulgarian b/backend/addcorpus/stopword_data/supplementary_data/bulgarian new file mode 100644 index 000000000..af9edf4dd --- /dev/null +++ b/backend/addcorpus/stopword_data/supplementary_data/bulgarian @@ -0,0 +1,259 @@ +а +автентичен +аз +ако +ала +бе +без +беше +би +бивш +бивша +бившо +бил +била +били +било +благодаря +близо +бъдат +бъде +бяха +в +вас +ваш +ваша +вероятно +вече +взема +ви +вие +винаги +внимава +време +все +всеки +всички +всичко +всяка +във +въпреки +върху +г +ги +главен +главна +главно +глас +го +година +години +годишен +д +да +дали +два +двама +двамата +две +двете +ден +днес +дни +до +добра +добре +добро +добър +докато +докога +дори 
+досега +доста +друг +друга +други +е +евтин +едва +един +една +еднаква +еднакви +еднакъв +едно +екип +ето +живот +за +забавям +зад +заедно +заради +засега +заспал +затова +защо +защото +и +из +или +им +има +имат +иска +й +каза +как +каква +какво +както +какъв +като +кога +когато +което +които +кой +който +колко +която +къде +където +към +лесен +лесно +ли +лош +м +май +малко +ме +между +мек +мен +месец +ми +много +мнозина +мога +могат +може +мокър +моля +момента +му +н +на +над +назад +най +направи +напред +например +нас +не +него +нещо +нея +ни +ние +никой +нито +нищо +но +нов +нова +нови +новина +някои +някой +няколко +няма +обаче +около +освен +особено +от +отгоре +отново +още +пак +по +повече +повечето +под +поне +поради +после +почти +прави +пред +преди +през +при +пък +първата +първи +първо +пъти +равен +равна +с +са +сам +само +се +сега +си +син +скоро +след +следващ +сме +смях +според +сред +срещу +сте +съм +със +също +т +тази +така +такива +такъв +там +твой +те +тези +ти +т.н. +то +това +тогава +този +той +толкова +точно +три +трябва +тук +тъй +тя +тях +у +утре +харесва +хиляди +ч +часа +че +често +чрез +ще +щом +юмрук +я +як diff --git a/backend/addcorpus/stopword_data/supplementary_data/croatian b/backend/addcorpus/stopword_data/supplementary_data/croatian new file mode 100644 index 000000000..4921277b2 --- /dev/null +++ b/backend/addcorpus/stopword_data/supplementary_data/croatian @@ -0,0 +1,179 @@ +a +ako +ali +bi +bih +bila +bili +bilo +bio +bismo +biste +biti +bumo +da +do +duž +ga +hoće +hoćemo +hoćete +hoćeš +hoću +i +iako +ih +ili +iz +ja +je +jedna +jedne +jedno +jer +jesam +jesi +jesmo +jest +jeste +jesu +jim +joj +još +ju +kada +kako +kao +koja +koje +koji +kojima +koju +kroz +li +me +mene +meni +mi +mimo +moj +moja +moje +mu +na +nad +nakon +nam +nama +nas +naš +naša +naše +našeg +ne +nego +neka +neki +nekog +neku +nema +netko +neće +nećemo +nećete +nećeš +neću +nešto +ni +nije +nikoga +nikoje +nikoju +nisam +nisi +nismo +niste +nisu +njega 
+njegov +njegova +njegovo +njemu +njezin +njezina +njezino +njih +njihov +njihova +njihovo +njim +njima +njoj +nju +no +o +od +odmah +on +ona +oni +ono +ova +pa +pak +po +pod +pored +prije +s +sa +sam +samo +se +sebe +sebi +si +smo +ste +su +sve +svi +svog +svoj +svoja +svoje +svom +ta +tada +taj +tako +te +tebe +tebi +ti +to +toj +tome +tu +tvoj +tvoja +tvoje +u +uz +vam +vama +vas +vaš +vaša +vaše +već +vi +vrlo +za +zar +će +ćemo +ćete +ćeš +ću +što diff --git a/backend/addcorpus/stopword_data/supplementary_data/czech b/backend/addcorpus/stopword_data/supplementary_data/czech new file mode 100644 index 000000000..0e993ff6f --- /dev/null +++ b/backend/addcorpus/stopword_data/supplementary_data/czech @@ -0,0 +1,138 @@ +dnes +cz +timto +budes +budem +byli +jses +muj +svym +ta +tomto +tohle +tuto +tyto +jej +zda +proc +mate +tato +kam +tohoto +kdo +kteri +mi +nam +tom +tomuto +mit +nic +proto +kterou +byla +toho +protoze +asi +ho +nasi +napiste +re +coz +tim +takze +svych +jeji +svymi +jste +aj +tu +tedy +teto +bylo +kde +ke +prave +ji +nad +nejsou +ci +pod +tema +mezi +pres +ty +pak +vam +ani +kdyz +vsak +ne +jsem +tento +clanku +clanky +aby +jsme +pred +pta +jejich +byl +jeste +az +bez +take +pouze +prvni +vase +ktera +nas +novy +tipy +pokud +muze +design +strana +jeho +sve +jine +zpravy +nove +neni +vas +jen +podle +zde +clanek +uz +email +byt +vice +bude +jiz +nez +ktery +by +ktere +co +nebo +ten +tak +ma +pri +od +po +jsou +jak +dalsi +ale +si +ve +to +jako +za +zpet +ze +do +pro +je +na diff --git a/backend/addcorpus/stopword_data/supplementary_data/estonian b/backend/addcorpus/stopword_data/supplementary_data/estonian new file mode 100644 index 000000000..091409454 --- /dev/null +++ b/backend/addcorpus/stopword_data/supplementary_data/estonian @@ -0,0 +1,35 @@ +aga +ei +et +ja +jah +kas +kui +kõik +ma +me +mida +midagi +mind +minu +mis +mu +mul +mulle +nad +nii +oled +olen +oli +oma +on +pole +sa +seda +see +selle +siin +siis +ta +te +ära \ No newline at end 
of file diff --git a/backend/addcorpus/stopword_data/supplementary_data/galician b/backend/addcorpus/stopword_data/supplementary_data/galician new file mode 100644 index 000000000..49d3b0277 --- /dev/null +++ b/backend/addcorpus/stopword_data/supplementary_data/galician @@ -0,0 +1,160 @@ +a +aínda +alí +aquel +aquela +aquelas +aqueles +aquilo +aquí +ao +aos +as +así +á +ben +cando +che +co +coa +comigo +con +connosco +contigo +convosco +coas +cos +cun +cuns +cunha +cunhas +da +dalgunha +dalgunhas +dalgún +dalgúns +das +de +del +dela +delas +deles +desde +deste +do +dos +dun +duns +dunha +dunhas +e +el +ela +elas +eles +en +era +eran +esa +esas +ese +eses +esta +estar +estaba +está +están +este +estes +estiven +estou +eu +é +facer +foi +foron +fun +había +hai +iso +isto +la +las +lle +lles +lo +los +mais +me +meu +meus +min +miña +miñas +moi +na +nas +neste +nin +no +non +nos +nosa +nosas +noso +nosos +nós +nun +nunha +nuns +nunhas +o +os +ou +ó +ós +para +pero +pode +pois +pola +polas +polo +polos +por +que +se +senón +ser +seu +seus +sexa +sido +sobre +súa +súas +tamén +tan +te +ten +teñen +teño +ter +teu +teus +ti +tido +tiña +tiven +túa +túas +un +unha +unhas +uns +vos +vosa +vosas +voso +vosos +vós diff --git a/backend/addcorpus/stopword_data/supplementary_data/icelandic b/backend/addcorpus/stopword_data/supplementary_data/icelandic new file mode 100644 index 000000000..e4e300376 --- /dev/null +++ b/backend/addcorpus/stopword_data/supplementary_data/icelandic @@ -0,0 +1,58 @@ + +að +af +allur +annaðhvort +annar +á +eða +ef +eins +en +enda +enginn +ég +frá +hafa +hann +hinn +hjá +hún +hvaða +hver +hvor +hvorki +hvort +hvorugur +í +minn +munu +nálægt +neinn +nema +né +nokkur +og +ó +sá +sem +sinn +sjálfur +svo +til +undir +vegna +vera +verða +yfir +ýmis +það +þar +þegar +þess +þessi +þinn +þó +þótt +þú +æ diff --git a/backend/addcorpus/stopword_data/supplementary_data/latvian b/backend/addcorpus/stopword_data/supplementary_data/latvian new file mode 100644 index 
000000000..39f427800 --- /dev/null +++ b/backend/addcorpus/stopword_data/supplementary_data/latvian @@ -0,0 +1,163 @@ +ārpus +šaipus +aiz +ap +apakš +apakšpus +arī +ar +ar +augšpus +būšu +būs +būsi +būsiet +būsim +būt +bet +bez +bijām +bijāt +bija +biji +biju +caur +dēļ +diemžēl +diezin +droši +esam +esat +esi +esmu +gan +gar +iekām +iekāms +iekš +iekšpus +iekam +iekams +ik +ir +it +itin +iz +jā +ja +jau +jebšu +jeb +jel +jo +kā +kļūšu +kļūs +kļūsi +kļūsiet +kļūsim +kļūst +kļūstam +kļūstat +kļūsti +kļūstu +kļūt +kļuvām +kļuvāt +kļuva +kļuvi +kļuvu +ka +kamēr +kaut +kolīdz +kopš +līdz +līdzko +labad +lai +lejpus +nē +ne +nebūt +nedz +nekā +nevis +nezin +no +nu +otrpus +pār +pēc +pa +par +pat +pie +pirms +pret +priekš +starp +tā +tādēļ +tālab +tāpēc +taču +tad +tak +tapāt +tapšu +tapi +taps +tapsi +tapsiet +tapsim +tapt +te +tiec +tiek +tiekam +tiekat +tieku +tikām +tikāt +tikšu +tik +tika +tikai +tiki +tikko +tiklīdz +tiklab +tiks +tiksiet +tiksim +tikt +tiku +tikvien +tomēr +topat +turpretī +turpretim +un +uz +vai +varēšu +varējām +varējāt +varēja +varēji +varēju +varēs +varēsi +varēsiet +varēsim +varēt +var +varat +viņpus +vien +vien +virs +virspus +vis +zem diff --git a/backend/addcorpus/stopword_data/supplementary_data/serbian b/backend/addcorpus/stopword_data/supplementary_data/serbian new file mode 100644 index 000000000..6139e746b --- /dev/null +++ b/backend/addcorpus/stopword_data/supplementary_data/serbian @@ -0,0 +1,133 @@ +baš +bez +biće +bio +biti +blizu +broj +dana +danas +doći +dobar +dobiti +dok +dole +došao +drugi +duž +dva +često +čiji +gde +gore +hvala +ići +iako +ide +ima +imam +imao +ispod +između +iznad +izvan +izvoli +jedan +jedini +jednom +jeste +još +juče +kad +kako +kao +koga +koja +koje +koji +kroz +mali +manji +misli +mnogo +moći +mogu +mora +morao +naći +naš +negde +nego +nekad +neki +nemam +nešto +nije +nijedan +nikada +nismo +ništa +njega +njegov +njen +njih +njihov +oko +okolo +ona +onaj +oni +ono +osim +ostali +otišao +ovako +ovamo
+ovde +ove +ovo +pitati +početak +pojedini +posle +povodom +praviti +pre +preko +prema +prvi +put +radije +sada +smeti +šta +stvar +stvarno +sutra +svaki +sve +svim +svugde +tačno +tada +taj +takođe +tamo +tim +učinio +učiniti +umalo +unutra +upotrebiti +uzeti +vaš +većina +veoma +video +više +zahvaliti +zašto +zbog +želeo +želi +znati diff --git a/backend/addcorpus/stopword_data/supplementary_data/slovenian b/backend/addcorpus/stopword_data/supplementary_data/slovenian new file mode 100644 index 000000000..7135ed3ab --- /dev/null +++ b/backend/addcorpus/stopword_data/supplementary_data/slovenian @@ -0,0 +1,446 @@ +a +ali +april +avgust +b +bi +bil +bila +bile +bili +bilo +biti +blizu +bo +bodo +bojo +bolj +bom +bomo +boste +bova +boš +brez +c +cel +cela +celi +celo +d +da +daleč +dan +danes +datum +december +deset +deseta +deseti +deseto +devet +deveta +deveti +deveto +do +dober +dobra +dobri +dobro +dokler +dol +dolg +dolga +dolgi +dovolj +drug +druga +drugi +drugo +dva +dve +e +eden +en +ena +ene +eni +enkrat +eno +etc. +f +februar +g +g. +ga +ga. +gor +gospa +gospod +h +halo +i +idr.
+ii +iii +in +iv +ix +iz +j +januar +jaz +je +ji +jih +jim +jo +julij +junij +jutri +k +kadarkoli +kaj +kajti +kako +kakor +kamor +kamorkoli +kar +karkoli +katerikoli +kdaj +kdo +kdorkoli +ker +ki +kje +kjer +kjerkoli +ko +koder +koderkoli +koga +komu +kot +kratek +kratka +kratke +kratki +l +lahka +lahke +lahki +lahko +le +lep +lepa +lepe +lepi +lepo +leto +m +maj +majhen +majhna +majhni +malce +malo +manj +marec +me +med +medtem +mene +mesec +mi +midva +midve +mnogo +moj +moja +moje +mora +morajo +moram +moramo +morate +moraš +morem +mu +n +na +nad +naj +najina +najino +najmanj +naju +največ +nam +narobe +nas +nato +nazaj +naš +naša +naše +ne +nedavno +nedelja +nek +neka +nekaj +nekatere +nekateri +nekatero +nekdo +neke +nekega +neki +nekje +neko +nekoga +nekoč +ni +nikamor +nikdar +nikjer +nikoli +nič +nje +njega +njegov +njegova +njegovo +njej +njemu +njen +njena +njeno +nji +njih +njihov +njihova +njihovo +njiju +njim +njo +njun +njuna +njuno +no +nocoj +november +npr. +o +ob +oba +obe +oboje +od +odprt +odprta +odprti +okoli +oktober +on +onadva +one +oni +onidve +osem +osma +osmi +osmo +oz. +p +pa +pet +peta +petek +peti +peto +po +pod +pogosto +poleg +poln +polna +polni +polno +ponavadi +ponedeljek +ponovno +potem +povsod +pozdravljen +pozdravljeni +prav +prava +prave +pravi +pravo +prazen +prazna +prazno +prbl. +precej +pred +prej +preko +pri +pribl. +približno +primer +pripravljen +pripravljena +pripravljeni +proti +prva +prvi +prvo +r +ravno +redko +res +reč +s +saj +sam +sama +same +sami +samo +se +sebe +sebi +sedaj +sedem +sedma +sedmi +sedmo +sem +september +seveda +si +sicer +skoraj +skozi +slab +smo +so +sobota +spet +sreda +srednja +srednji +sta +ste +stran +stvar +sva +t +ta +tak +taka +take +taki +tako +takoj +tam +te +tebe +tebi +tega +težak +težka +težki +težko +ti +tista +tiste +tisti +tisto +tj. 
+tja +to +toda +torek +tretja +tretje +tretji +tri +tu +tudi +tukaj +tvoj +tvoja +tvoje +u +v +vaju +vam +vas +vaš +vaša +vaše +ve +vedno +velik +velika +veliki +veliko +vendar +ves +več +vi +vidva +vii +viii +visok +visoka +visoke +visoki +vsa +vsaj +vsak +vsaka +vsakdo +vsake +vsaki +vsakomur +vse +vsega +vsi +vso +včasih +včeraj +x +z +za +zadaj +zadnji +zakaj +zaprta +zaprti +zaprto +zdaj +zelo +zunaj +č +če +često +četrta +četrtek +četrti +četrto +čez +čigav +š +šest +šesta +šesti +šesto +štiri +ž +že \ No newline at end of file diff --git a/backend/addcorpus/stopword_data/supplementary_data/ukrainian b/backend/addcorpus/stopword_data/supplementary_data/ukrainian new file mode 100644 index 000000000..1f6a6bb7f --- /dev/null +++ b/backend/addcorpus/stopword_data/supplementary_data/ukrainian @@ -0,0 +1,28 @@ +з +й +що +та +із +але +цей +коли +як +чого +хоча +нам +якої +чи +це +від +про +і +їх +є +інших +ти +він +вона +воно +ми +ви +вони diff --git a/backend/corpora/parliament/clarin_parlamint/description/parlamint_all.md b/backend/corpora/parliament/clarin_parlamint/description/parlamint_all.md new file mode 100644 index 000000000..4b8d44592 --- /dev/null +++ b/backend/corpora/parliament/clarin_parlamint/description/parlamint_all.md @@ -0,0 +1,19 @@ +# ParlaMint: Comparable and Interoperable Parliamentary Corpora +The CLARIN ERIC flagship project ParlaMint enables transnational and multidisciplinary analyses of parliamentary debates across 29 countries and autonomous regions in Europe. + +Overcoming the obstacles of multilinguality and diversity of data formats, the project created interoperable and comparable corpora that facilitate transnational comparisons and enhance the understanding of parliamentary discourse and its societal impact locally and globally.
+ +The corpora are available in open access and are a valuable source of information for researchers in a broad range of SSH disciplines, such as political and social sciences, media and communication studies, history and language studies, and are also relevant to policy makers. + +The ParlaMint project is now being further developed in the OSCARS project [ParlaCAP](https://clarinsi.github.io/parlacap/), which will provide a robust dataset for tracking political agenda-setting across European parliaments. + +The latest versions of the corpora are available under the CC BY license: + +Erjavec, Tomaž; et al., 2025, Multilingual comparable corpora of parliamentary debates ParlaMint 5.0, Slovenian language resource repository CLARIN.SI, ISSN 2820-4042, http://hdl.handle.net/11356/2004. + +Erjavec, Tomaž; et al., 2025, Linguistically annotated multilingual comparable corpora of parliamentary debates ParlaMint.ana 5.0, Slovenian language resource repository CLARIN.SI, ISSN 2820-4042, http://hdl.handle.net/11356/2005. + +Kuzman Pungeršek, Taja; et al., 2025, Linguistically annotated multilingual comparable corpora of parliamentary debates in English ParlaMint-en.ana 5.0, Slovenian language resource repository CLARIN.SI, ISSN 2820-4042, http://hdl.handle.net/11356/2006. 
+ +[GitHub repository](https://github.com/clarin-eric/ParlaMint/) including corpus samples, encoding guidelines, XML schema, and scripts for corpus processing and validation + diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint.png b/backend/corpora/parliament/clarin_parlamint/images/parlamint.png new file mode 100644 index 000000000..8e37f11bb Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint.png differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_AT.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_AT.jpg new file mode 100644 index 000000000..320ca097d Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_AT.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_BA.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_BA.jpg new file mode 100644 index 000000000..e0eb03cc6 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_BA.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_BE.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_BE.jpg new file mode 100644 index 000000000..06bc79cc6 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_BE.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_BG.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_BG.jpg new file mode 100644 index 000000000..51d1bbfe7 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_BG.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_CZ.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_CZ.jpg new file mode 100644 index 000000000..7647801e8 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_CZ.jpg differ diff --git 
a/backend/corpora/parliament/clarin_parlamint/images/parlamint_DK.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_DK.jpg new file mode 100644 index 000000000..7761f22a0 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_DK.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_EE.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_EE.jpg new file mode 100644 index 000000000..7ed79af4c Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_EE.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_ES-CT.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_ES-CT.jpg new file mode 100644 index 000000000..bf2a0bf20 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_ES-CT.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_ES-GA.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_ES-GA.jpg new file mode 100644 index 000000000..b8958fd1a Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_ES-GA.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_ES-PV.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_ES-PV.jpg new file mode 100644 index 000000000..b1a3a5456 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_ES-PV.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_ES.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_ES.jpg new file mode 100644 index 000000000..759a16989 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_ES.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_FI.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_FI.jpg new file mode 
100644 index 000000000..501a224cf Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_FI.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_FR.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_FR.jpg new file mode 100644 index 000000000..f2e393a07 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_FR.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_GB.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_GB.jpg new file mode 100644 index 000000000..64e3a9d5b Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_GB.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_GR.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_GR.jpg new file mode 100644 index 000000000..82ed02482 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_GR.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_HR.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_HR.jpg new file mode 100644 index 000000000..2e00f6684 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_HR.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_HU.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_HU.jpg new file mode 100644 index 000000000..8c92a16da Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_HU.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_IS.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_IS.jpg new file mode 100644 index 000000000..7772c1238 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_IS.jpg differ diff --git 
a/backend/corpora/parliament/clarin_parlamint/images/parlamint_IT.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_IT.jpg new file mode 100644 index 000000000..82e2fd195 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_IT.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_LV.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_LV.jpg new file mode 100644 index 000000000..c2344fbee Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_LV.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_NL.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_NL.jpg new file mode 100644 index 000000000..5182403a4 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_NL.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_NO.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_NO.jpg new file mode 100644 index 000000000..ee56adfd1 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_NO.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_PL.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_PL.jpg new file mode 100644 index 000000000..07eae6bab Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_PL.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_PT.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_PT.jpg new file mode 100644 index 000000000..a620d5bab Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_PT.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_RS.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_RS.jpg new file mode 100644 index 
000000000..d67d4f99b Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_RS.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_SE.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_SE.jpg new file mode 100644 index 000000000..c99638c7c Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_SE.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_SI.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_SI.jpg new file mode 100644 index 000000000..1a4636fc0 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_SI.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_TR.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_TR.jpg new file mode 100644 index 000000000..e5b06c0d8 Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_TR.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/images/parlamint_UA.jpg b/backend/corpora/parliament/clarin_parlamint/images/parlamint_UA.jpg new file mode 100644 index 000000000..b3eeb77da Binary files /dev/null and b/backend/corpora/parliament/clarin_parlamint/images/parlamint_UA.jpg differ diff --git a/backend/corpora/parliament/clarin_parlamint/parlamint_all.py b/backend/corpora/parliament/clarin_parlamint/parlamint_all.py new file mode 100644 index 000000000..ccf9043cc --- /dev/null +++ b/backend/corpora/parliament/clarin_parlamint/parlamint_all.py @@ -0,0 +1,336 @@ +import os +import re +from glob import glob +from bs4 import BeautifulSoup +from datetime import datetime + +from django.conf import settings + +from addcorpus.python_corpora.corpus import XMLCorpusDefinition, FieldDefinition +from addcorpus.python_corpora.filters import MultipleChoiceFilter +from addcorpus.es_settings import es_settings +from addcorpus.es_mappings import 
keyword_mapping, main_content_mapping, non_indexed_text_mapping +from corpora.utils.constants import document_context +import corpora.parliament.utils.field_defaults as field_defaults +from corpora.parliament.utils.parlamint_v4 import ner_keyword_field, speech_ner + + +from corpora.parliament.clarin_parlamint.parlamint_utils.parlamint_constants import COUNTRY_CODES, COUNTRY_CODE_TO_NAME, DATE_RANGES, PARLIAMENT_NAMES +from corpora.parliament.clarin_parlamint.parlamint_utils.parlamint_extract import get_orgs_metadata, get_persons_metadata, extract_named_entities, person_attribute_extractor, extract_speech, party_attribute_extractor, current_party_id_extractor, get_party_list +from corpora.parliament.clarin_parlamint.parlamint_utils.parlamint_transform import transform_xml_filename, transform_ministerial_role, transform_parliamentary_role, transform_political_orientation, transform_government + +from ianalyzer_readers.extract import XML, Combined, Order, Metadata, Pass +from ianalyzer_readers.xml_tag import Tag + +def open_xml_as_soup(filepath): + with open(filepath, 'rb') as f: + soup = BeautifulSoup(f, features="xml") + return soup + +class ParlaMintAll(XMLCorpusDefinition): + title = "All ParlaMint Corpora (version 5.0)" + description = "All corpora from the ParlaMint dataset, including 27 countries" + category = "parliament" + image = "parlamint.png" + description_page = "parlamint_all.md" + languages = ['en'] + data_directory = settings.PARLAMINT_ALL_DATA + visualize = [] + + es_index = getattr(settings, "PARLAMINT_ALL_INDEX", 'parlamint-all') + @property + def es_settings(self): + return es_settings(self.languages, stopword_analysis=True, stemming_analysis=True) + + default_sort = {'field': 'date', 'ascending': False} + + min_date = datetime(year=1996, month=1, day=1) + max_date = datetime(year=2022, month=12, day=31) + + document_context = document_context() + tag_toplevel = Tag('TEI') + tag_entry = Tag('u') + + country_codes = COUNTRY_CODES + country_code = 
None + + def sources(self, *args, **kwargs): + for country_code in self.country_codes: + print("STARTING COUNTRY: ", country_code) + country_data_directory = os.path.join(self.data_directory, "ParlaMint-{}".format(country_code), "ParlaMint-{}.TEI.ana".format(country_code)) + country_translated_data_directory = os.path.join(self.data_directory, "ParlaMint-{}".format(country_code), "ParlaMint-{}-en.TEI.ana".format(country_code)) + persons_metadata = get_persons_metadata(country_data_directory, country_code) + all_orgs_metadata = get_orgs_metadata(country_data_directory, country_code) + party_list = get_party_list(all_orgs_metadata) + metadata = { + 'persons': persons_metadata, + 'organisations': all_orgs_metadata, + 'party_list': party_list, + 'country': country_code + } + for year in range(DATE_RANGES[country_code]['min_year'], DATE_RANGES[country_code]['max_year']): + for xml_file in glob('{}/ParlaMint-{}/ParlaMint-{}.TEI.ana/{}/*.xml'.format(self.data_directory, country_code, country_code, year)): + metadata['date'] = re.search(r"\d{4}-\d{2}-\d{2}", xml_file).group() + metadata["ner"] = extract_named_entities(xml_file) + translated_file_path = os.path.join( + country_translated_data_directory, + str(year), + transform_xml_filename(xml_file, country_code) + ) if country_code != 'GB' else xml_file # the UK can reuse the original path + if os.path.exists(translated_file_path): + metadata['translated_soup'] = open_xml_as_soup(translated_file_path) + yield xml_file, metadata + + chamber = FieldDefinition( + name='chamber', + display_name='Chamber', + description='Chamber or house of parliament in which the speeches took place', + searchable=False, + es_mapping=keyword_mapping(), + search_filter=MultipleChoiceFilter( + description='Search for speeches from the selected chamber', + option_count=5 + ), + extractor=XML( + Tag('teiHeader'), + Tag('fileDesc'), + Tag('titleStmt'), + Tag('meeting'), + toplevel=True, + attribute='corresp', + transform= lambda x: 
PARLIAMENT_NAMES[x] if x else None + ), + visualizations=['resultscount', 'termfrequency'] + ) + + country = FieldDefinition( + name='country', + display_name='Country', + description='Country in which the debate took place', + searchable=False, + es_mapping=keyword_mapping(), + search_filter = MultipleChoiceFilter( + description='Search for speeches from the selected countries', + option_count=len(COUNTRY_CODES) + ), + results_overview = True, + extractor = Metadata('country', transform=lambda country_code: COUNTRY_CODE_TO_NAME[country_code]), + visualizations=["resultscount", "termfrequency"] + ) + + date = field_defaults.date(min_date=min_date, max_date=max_date) + date.extractor = Metadata('date') + + debate_id = field_defaults.debate_id() + debate_id.extractor = XML( + attribute='xml:id', + toplevel=True, + ) + + speech_id = field_defaults.speech_id() + speech_id.extractor = XML( + attribute='xml:id' + ) + + speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + # stemming or stopword analysis is set by the child corpus + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=False, + stemming_analysis=False, + language=None, + ), + results_overview=False, + search_field_core=True, + display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=None, + ) + speech.extractor = XML( + Tag('s'), + multiple=True, + extract_soup_func = extract_speech, + transform=' '.join) + + def lookup_translated_speech(tuple): + element = tuple[1].find(attrs={'xml:id': tuple[0]}) + return extract_speech(element) if element else None + + speech_translated = FieldDefinition( + name='speech_translated', + display_name='Speech (machine-translated)', + description='The machine-translated speech', + # Translated field does have stemming and stopword analysis + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + 
stemming_analysis=True, + language='en', + ), + results_overview=True, + search_field_core=True, + display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language='en' + ) + speech_translated.extractor = Combined( + XML(attribute='xml:id'), + Metadata('translated_soup'), + transform=lookup_translated_speech + ) + + speech_ner = speech_ner() + + ner_per = ner_keyword_field("person") + ner_loc = ner_keyword_field("location") + ner_org = ner_keyword_field("organization") + ner_misc = ner_keyword_field("miscellaneous") + + sequence = field_defaults.sequence() + sequence.extractor = Order(transform=lambda value: value + 1) + + speaker = field_defaults.speaker() + speaker.extractor = person_attribute_extractor('name') + + speaker_id = field_defaults.speaker_id() + speaker_id.extractor = person_attribute_extractor('id') + + speaker_gender = field_defaults.speaker_gender() + speaker_gender.extractor = person_attribute_extractor('gender') + + speaker_birth_year = field_defaults.speaker_birth_year() + speaker_birth_year.extractor = person_attribute_extractor('birth_year') + + speaker_birthplace = field_defaults.speaker_birthplace() + speaker_birthplace.extractor = person_attribute_extractor('birthplace') + + speaker_wikimedia = FieldDefinition( + name = 'speaker_wikimedia', + display_name= 'Speaker Wikipedia', + display_type='url', + description='URL to Wikimedia page of the speaker', + es_mapping=keyword_mapping(), + searchable=False, + ) + + speaker_twitter = FieldDefinition( + name = 'speaker_twitter', + display_name= 'Speaker Twitter', + display_type='url', + description='URL to Twitter page of the speaker', + es_mapping=keyword_mapping(), + searchable=False, + ) + + government = FieldDefinition( + name='government', + display_name='Government', + description='Whether speaker is part of the government', + es_mapping=keyword_mapping(), + searchable=False, + search_filter = MultipleChoiceFilter( + description='Search speeches by members of 
government', + option_count=2 + ), + extractor = Combined( + person_attribute_extractor('org_nodes'), + Metadata('date'), + Metadata('country'), + transform=transform_government + ) + ) + + parliamentary_role = field_defaults.parliamentary_role() + parliamentary_role.extractor = Combined( + person_attribute_extractor('org_nodes'), + Metadata('date'), + Metadata('country'), + transform=transform_parliamentary_role + ) + ministerial_role = field_defaults.ministerial_role() + ministerial_role.extractor = Combined( + person_attribute_extractor('org_nodes'), + Metadata('date'), + Metadata('country'), + transform=transform_ministerial_role + ) + + current_party_id = field_defaults.party_id() + current_party_id.extractor = current_party_id_extractor() + + current_party = field_defaults.party() + current_party.extractor = party_attribute_extractor('name') + + current_party_full = field_defaults.party_full() + current_party_full.extractor = party_attribute_extractor('full_name') + + current_party_wiki = FieldDefinition( + name='party_wiki_url', + display_name='Wikimedia URL', + display_type='url', + description='URL to Wikimedia page of the party', + es_mapping=keyword_mapping(), + searchable=False, + ) + current_party_wiki.extractor = party_attribute_extractor('wikimedia') + + current_party_political_orientation = FieldDefinition( + name='political_orientation', + display_name='Political Orientation', + description="Political leaning according to the ParlaMint team", + es_mapping=keyword_mapping(), + searchable=False, + search_filter = MultipleChoiceFilter( + description='Search for speeches from selected political leanings', + ), + ) + current_party_political_orientation.extractor = Pass( + party_attribute_extractor('political_orientation'), + transform=transform_political_orientation + ) + + @property + def fields(self): + return self._fields + + @fields.setter + def fields(self, value): + self._fields = value + + def __init__(self): + self.fields = [ + 
self.speech_translated, + self.speech, + self.chamber, + self.country, + self.date, + self.debate_id, + self.speech_id, + self.speech_ner, + self.sequence, + self.speaker, + self.speaker_id, + self.speaker_gender, + self.speaker_birth_year, + self.speaker_birthplace, + self.speaker_wikimedia, + self.speaker_twitter, + self.government, + self.parliamentary_role, + self.ministerial_role, + self.current_party_id, + self.current_party, + self.current_party_full, + self.current_party_wiki, + self.current_party_political_orientation, + self.ner_per, + self.ner_loc, + self.ner_org, + self.ner_misc + ] + diff --git a/backend/corpora/parliament/clarin_parlamint/parlamint_subcorpora.py b/backend/corpora/parliament/clarin_parlamint/parlamint_subcorpora.py new file mode 100644 index 000000000..2b522d0f4 --- /dev/null +++ b/backend/corpora/parliament/clarin_parlamint/parlamint_subcorpora.py @@ -0,0 +1,1047 @@ +from datetime import datetime +import os +import re +from glob import glob + +from addcorpus.python_corpora.corpus import FieldDefinition +from addcorpus.es_mappings import main_content_mapping +from corpora.parliament.clarin_parlamint.parlamint_all import ParlaMintAll, open_xml_as_soup +from corpora.parliament.clarin_parlamint.parlamint_utils.parlamint_constants import DATE_RANGES +from corpora.parliament.clarin_parlamint.parlamint_utils.parlamint_extract import get_orgs_metadata, get_persons_metadata, extract_named_entities, extract_speech, get_party_list + +from ianalyzer_readers.extract import XML +from ianalyzer_readers.xml_tag import Tag + + +def speech_extractor(): + return XML( + Tag('s'), + multiple=True, + extract_soup_func = extract_speech, + transform=' '.join + ) + +class _ParlaMint(ParlaMintAll): + ''' + Parent class for country-specific ParlaMint corpora + ''' + country_code = None + description_page = None + + @property + def es_index(self): + ''' + this property expects the full parlamint corpus to be named + 'parlamint-all', and will produce 
'parlamint-at' for Austria for example + ''' + return super().es_index.replace('all', self.country_code.lower()) + + @property + def image(self): + return f'parlamint_{self.country_code}.jpg' + + def __init__(self): + super().__init__() + + +class ParlaMintAT(_ParlaMint): + title = "Austria" + description = 'Speeches and debates from the national and federal councils of Austria' + country_code = 'AT' + country_codes = ['AT'] + languages = ['de', 'en'] + min_date = datetime(year=DATE_RANGES['AT']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['AT']['max_year'], month=12, day=31) + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + results_overview=True, + search_field_core=True, + display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + + +class ParlaMintBA(_ParlaMint): + title = "Bosnia" + description = 'Speeches and debates from the Bosnian Parliament' + country_code = 'BA' + country_codes = ['BA'] + languages = ['bs', 'en'] + min_date = datetime(year=DATE_RANGES['BA']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['BA']['max_year'], month=12, day=31) + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + results_overview=True, + search_field_core=True, + 
display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + +class ParlaMintBE(_ParlaMint): + title = "Belgium" + description = 'Speeches and debates from the Belgian Parliament' + country_code = 'BE' + country_codes = ['BE'] + languages = ['nl', 'fr', 'en'] + min_date = datetime(year=DATE_RANGES['BE']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['BE']['max_year'], month=12, day=31) + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + results_overview=True, + search_field_core=True, + display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + + +class ParlaMintBG(_ParlaMint): + title = "Bulgaria" + description = 'Speeches and debates from the Bulgarian Parliament' + country_code = 'BG' + country_codes = ['BG'] + languages = ['bg', 'en'] + min_date = datetime(year=DATE_RANGES['BG']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['BG']['max_year'], month=12, day=31) + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + results_overview=True, + search_field_core=True, + 
display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + +class ParlaMintCZ(_ParlaMint): + title = "The Czech Republic" + description = 'Speeches and debates from the chambers of the Czech Parliament' + country_code = 'CZ' + country_codes = ['CZ'] + languages = ['cs', 'en'] + min_date = datetime(year=DATE_RANGES['CZ']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['CZ']['max_year'], month=12, day=31) + + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + results_overview=True, + search_field_core=True, + display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + +class ParlaMintDK(_ParlaMint): + title = "Denmark" + description = 'Speeches and debates from the Danish Parliament' + country_code = 'DK' + country_codes = ['DK'] + languages = ['da', 'en'] + min_date = datetime(year=DATE_RANGES['DK']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['DK']['max_year'], month=12, day=31) + + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + results_overview=True, + search_field_core=True, + 
display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + +class ParlaMintEE(_ParlaMint): + title = "Estonia" + description = 'Speeches and debates from the Estonian Parliament' + country_code = 'EE' + country_codes = ['EE'] + languages = ['et', 'en'] + min_date = datetime(year=DATE_RANGES['EE']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['EE']['max_year'], month=12, day=31) + + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + results_overview=True, + search_field_core=True, + display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + +class ParlaMintES(_ParlaMint): + title = "Spain" + description = 'Speeches and debates from the Spanish Parliament' + country_code = 'ES' + country_codes = ['ES'] + languages = ['es', 'en'] + min_date = datetime(year=DATE_RANGES['ES']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['ES']['max_year'], month=12, day=31) + + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + results_overview=True, + search_field_core=True, + 
display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + +class ParlaMintESCT(_ParlaMint): + title = "Catalonia (Spain)" + description = 'Speeches and debates from the Catalan Parliament' + country_code = 'ES-CT' + country_codes = ['ES-CT'] + languages = ['ca', 'en'] + min_date = datetime(year=DATE_RANGES['ES-CT']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['ES-CT']['max_year'], month=12, day=31) + + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + results_overview=True, + search_field_core=True, + display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + + +class ParlaMintESGA(_ParlaMint): + title = "Galicia (Spain)" + description = 'Speeches and debates from the Galician Parliament' + country_code = 'ES-GA' + country_codes = ['ES-GA'] + languages = ['gl', 'en'] + min_date = datetime(year=DATE_RANGES['ES-GA']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['ES-GA']['max_year'], month=12, day=31) + + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + results_overview=True, + 
search_field_core=True, + display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + +class ParlaMintESPV(_ParlaMint): + title = "Basque Country (Spain)" + description = 'Speeches and debates from the Basque Parliament' + country_code = 'ES-PV' + country_codes = ['ES-PV'] + languages = ['eu', 'es', 'en'] + min_date = datetime(year=DATE_RANGES['ES-PV']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['ES-PV']['max_year'], month=12, day=31) + + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + results_overview=True, + search_field_core=True, + display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + + +class ParlaMintFI(_ParlaMint): + title = "Finland" + description = 'Speeches and debates from the Finnish Parliament' + country_code = 'FI' + country_codes = ['FI'] + languages = ['fi', 'en'] + min_date = datetime(year=DATE_RANGES['FI']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['FI']['max_year'], month=12, day=31) + + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + 
results_overview=True, + search_field_core=True, + display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + + +class ParlaMintFR(_ParlaMint): + title = "France" + description = 'Speeches and debates from the French Parliament' + country_code = 'FR' + country_codes = ['FR'] + languages = ['fr', 'en'] + min_date = datetime(year=DATE_RANGES['FR']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['FR']['max_year'], month=12, day=31) + + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + results_overview=True, + search_field_core=True, + display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + + +class ParlaMintGB(_ParlaMint): + title = "United Kingdom" + description = 'Speeches and debates from the British houses of Parliament' + country_code = 'GB' + country_codes = ['GB'] + languages = ['en'] + min_date = datetime(year=DATE_RANGES['GB']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['GB']['max_year'], month=12, day=31) + + def sources(self, *args, **kwargs): + ''' + UK-specific sources function to simply reuse the original data for the machine-translated speech field + ''' + country_code = self.country_code + print("STARTING COUNTRY: GB") + country_data_directory = os.path.join(self.data_directory, "ParlaMint-{}".format(country_code), 
"ParlaMint-{}.TEI.ana".format(country_code)) + persons_metadata = get_persons_metadata(country_data_directory, country_code) + all_orgs_metadata = get_orgs_metadata(country_data_directory, country_code) + party_list = get_party_list(all_orgs_metadata) + metadata = { + 'persons': persons_metadata, + 'organisations': all_orgs_metadata, + 'party_list': party_list, + 'country': country_code + } + for year in range(DATE_RANGES[country_code]['min_year'], DATE_RANGES[country_code]['max_year']): + for xml_file in glob('{}/ParlaMint-{}/ParlaMint-{}.TEI.ana/{}/*.xml'.format(self.data_directory, country_code, country_code, year)): + metadata['date'] = re.search(r"\d{4}-\d{2}-\d{2}", xml_file).group() + metadata["ner"] = extract_named_entities(xml_file) + if os.path.exists(xml_file): + metadata['translated_soup'] = open_xml_as_soup(xml_file) + yield xml_file, metadata + + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + results_overview=True, + search_field_core=True, + display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + + +class ParlaMintGR(_ParlaMint): + title = "Greece" + description = 'Speeches and debates from the Greek Parliament' + country_code = 'GR' + country_codes = ['GR'] + languages = ['el', 'en'] + min_date = datetime(year=DATE_RANGES['GR']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['GR']['max_year'], month=12, day=31) + + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The 
transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + results_overview=True, + search_field_core=True, + display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + + +class ParlaMintHR(_ParlaMint): + title = "Croatia" + description = 'Speeches and debates from the Croatian Parliament' + country_code = 'HR' + country_codes = ['HR'] + languages = ['hr', 'en'] + min_date = datetime(year=DATE_RANGES['HR']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['HR']['max_year'], month=12, day=31) + + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + results_overview=True, + search_field_core=True, + display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + + +class ParlaMintHU(_ParlaMint): + title = "Hungary" + description = 'Speeches and debates from the Hungarian Parliament' + country_code = 'HU' + country_codes = ['HU'] + languages = ['hu', 'en'] + min_date = datetime(year=DATE_RANGES['HU']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['HU']['max_year'], month=12, day=31) + + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech 
in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=True, + language=self.languages[0], + ), + results_overview=True, + search_field_core=True, + display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + + +class ParlaMintIS(_ParlaMint): + title = "Iceland" + description = 'Speeches and debates from the Icelandic Parliament' + country_code = 'IS' + country_codes = ['IS'] + languages = ['is', 'en'] + min_date = datetime(year=DATE_RANGES['IS']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['IS']['max_year'], month=12, day=31) + + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The transcribed speech in the original language', + es_mapping = main_content_mapping( + token_counts=True, + stopword_analysis=True, + stemming_analysis=False, # no icelandic stemming for ES + language=self.languages[0], + ), + results_overview=True, + search_field_core=True, + display_type='text_content', + visualizations=['wordcloud', 'ngram'], + csv_core=True, + language=self.languages[0], + ) + self.speech.extractor = speech_extractor() + self.fields = [self.speech] + [field for field in self.fields if field.name != 'speech'] + + +class ParlaMintIT(_ParlaMint): + title = "Italy" + description = 'Speeches and debates from the Italian Parliament' + country_code = 'IT' + country_codes = ['IT'] + languages = ['it', 'en'] + min_date = datetime(year=DATE_RANGES['IT']['min_year'], month=1, day=1) + max_date = datetime(year=DATE_RANGES['IT']['max_year'], month=12, day=31) + + + def __init__(self): + super().__init__() + self.speech = FieldDefinition( + name='speech', + display_name='Speech', + description='The 
def _install_original_speech_field(corpus, stemming_analysis=True):
    """
    Replace the corpus' 'speech' field with one analysed in the corpus language.

    The first entry of `corpus.languages` is used both for the Elasticsearch
    analyzers and as the field's language. The new field is stored on
    `corpus.speech` and moved to the front of `corpus.fields`; any field
    already named 'speech' is dropped from the list.

    Parameters:
        corpus: a corpus definition whose `fields` list is already populated
            (i.e. the parent `__init__` has run).
        stemming_analysis: passed on to `main_content_mapping`; the country
            classes below pass False where the original code disabled stemming.
    """
    language = corpus.languages[0]
    corpus.speech = FieldDefinition(
        name='speech',
        display_name='Speech',
        description='The transcribed speech in the original language',
        es_mapping=main_content_mapping(
            token_counts=True,
            stopword_analysis=True,
            stemming_analysis=stemming_analysis,
            language=language,
        ),
        results_overview=True,
        search_field_core=True,
        display_type='text_content',
        visualizations=['wordcloud', 'ngram'],
        csv_core=True,
        language=language,
    )
    corpus.speech.extractor = speech_extractor()
    corpus.fields = [corpus.speech] + [
        field for field in corpus.fields if field.name != 'speech'
    ]


class ParlaMintLV(_ParlaMint):
    """ParlaMint corpus for Latvia."""
    title = "Latvia"
    description = 'Speeches and debates from the Latvian Parliament'
    country_code = 'LV'
    country_codes = ['LV']
    languages = ['lv', 'en']
    min_date = datetime(year=DATE_RANGES['LV']['min_year'], month=1, day=1)
    max_date = datetime(year=DATE_RANGES['LV']['max_year'], month=12, day=31)

    def __init__(self):
        super().__init__()
        _install_original_speech_field(self)


class ParlaMintNL(_ParlaMint):
    """ParlaMint corpus for the Netherlands."""
    title = "The Netherlands"
    description = 'Speeches and debates from the two chambers of Dutch Parliament'
    country_code = 'NL'
    country_codes = ['NL']
    languages = ['nl', 'en']
    min_date = datetime(year=DATE_RANGES['NL']['min_year'], month=1, day=1)
    max_date = datetime(year=DATE_RANGES['NL']['max_year'], month=12, day=31)

    def __init__(self):
        super().__init__()
        _install_original_speech_field(self)


class ParlaMintNO(_ParlaMint):
    """ParlaMint corpus for Norway."""
    title = "Norway"
    description = 'Speeches and debates from the Norwegian Parliament'
    country_code = 'NO'
    country_codes = ['NO']
    languages = ['no', 'en']
    min_date = datetime(year=DATE_RANGES['NO']['min_year'], month=1, day=1)
    max_date = datetime(year=DATE_RANGES['NO']['max_year'], month=12, day=31)

    def __init__(self):
        super().__init__()
        _install_original_speech_field(self)


class ParlaMintPL(_ParlaMint):
    """ParlaMint corpus for Poland."""
    title = "Poland"
    description = 'Speeches and debates from the two chambers of the Polish Parliament'
    country_code = 'PL'
    country_codes = ['PL']
    languages = ['pl', 'en']
    min_date = datetime(year=DATE_RANGES['PL']['min_year'], month=1, day=1)
    max_date = datetime(year=DATE_RANGES['PL']['max_year'], month=12, day=31)

    def __init__(self):
        super().__init__()
        _install_original_speech_field(self)


class ParlaMintPT(_ParlaMint):
    """ParlaMint corpus for Portugal."""
    title = "Portugal"
    description = 'Speeches and debates from the two chambers of the Portuguese Parliament'
    country_code = 'PT'
    country_codes = ['PT']
    languages = ['pt', 'en']
    min_date = datetime(year=DATE_RANGES['PT']['min_year'], month=1, day=1)
    max_date = datetime(year=DATE_RANGES['PT']['max_year'], month=12, day=31)

    def __init__(self):
        super().__init__()
        _install_original_speech_field(self)


# NOTE: this class used to be defined twice in a row with identical bodies;
# the redundant second definition has been removed.
class ParlaMintRS(_ParlaMint):
    """ParlaMint corpus for Serbia."""
    title = "Serbia"
    description = 'Speeches and debates from the two chambers of the Serbian Parliament'
    country_code = 'RS'
    country_codes = ['RS']
    languages = ['sr', 'en']
    min_date = datetime(year=DATE_RANGES['RS']['min_year'], month=1, day=1)
    max_date = datetime(year=DATE_RANGES['RS']['max_year'], month=12, day=31)

    def __init__(self):
        super().__init__()
        # no stemmer for Serbian in ES
        _install_original_speech_field(self, stemming_analysis=False)


class ParlaMintSE(_ParlaMint):
    """ParlaMint corpus for Sweden."""
    title = "Sweden"
    description = 'Speeches and debates from the Swedish Parliament'
    country_code = 'SE'
    country_codes = ['SE']
    languages = ['sv', 'en']
    min_date = datetime(year=DATE_RANGES['SE']['min_year'], month=1, day=1)
    max_date = datetime(year=DATE_RANGES['SE']['max_year'], month=12, day=31)

    def __init__(self):
        super().__init__()
        _install_original_speech_field(self)


class ParlaMintSI(_ParlaMint):
    """ParlaMint corpus for Slovenia."""
    title = "Slovenia"
    description = 'Speeches and debates from the Slovenian Parliament'
    country_code = 'SI'
    country_codes = ['SI']
    languages = ['sl', 'en']
    min_date = datetime(year=DATE_RANGES['SI']['min_year'], month=1, day=1)
    max_date = datetime(year=DATE_RANGES['SI']['max_year'], month=12, day=31)

    def __init__(self):
        super().__init__()
        # no stemming for Slovenian in ES (the original note said "SE",
        # presumably a typo for Elasticsearch)
        _install_original_speech_field(self, stemming_analysis=False)


class ParlaMintTR(_ParlaMint):
    """ParlaMint corpus for Türkiye."""
    title = "Türkiye"
    description = 'Speeches and debates from the Turkish Parliament'
    country_code = 'TR'
    country_codes = ['TR']
    languages = ['tr', 'en']
    min_date = datetime(year=DATE_RANGES['TR']['min_year'], month=1, day=1)
    max_date = datetime(year=DATE_RANGES['TR']['max_year'], month=12, day=31)

    def __init__(self):
        super().__init__()
        # Stemming is disabled for Turkish.
        # NOTE(review): the original inline note here was copied from the
        # Slovenian class ("no stemming for Slovenian"); confirm whether
        # stemming is really unavailable for Turkish in this setup.
        _install_original_speech_field(self, stemming_analysis=False)


class ParlaMintUA(_ParlaMint):
    """ParlaMint corpus for Ukraine."""
    title = "Ukraine"
    description = 'Speeches and debates from the Ukrainian Parliament'
    country_code = 'UA'
    country_codes = ['UA']
    languages = ['uk', 'en']
    min_date = datetime(year=DATE_RANGES['UA']['min_year'], month=1, day=1)
    max_date = datetime(year=DATE_RANGES['UA']['max_year'], month=12, day=31)

    def __init__(self):
        super().__init__()
        # no stemming for Ukrainian in ES (the original note said "SE",
        # presumably a typo for Elasticsearch)
        _install_original_speech_field(self, stemming_analysis=False)
# ---------------------------------------------------------------------------
# Constants (parlamint_constants)
# ---------------------------------------------------------------------------

# Country / region codes for which a ParlaMint corpus is defined.
COUNTRY_CODES = [
    'AT', 'BA', 'BE', 'BG', 'CZ', 'DK', 'EE', 'ES', 'ES-CT', 'ES-GA',
    'ES-PV', 'FI', 'FR', 'GB', 'GR', 'HR', 'HU', 'IS', 'IT', 'LV',
    'NL', 'NO', 'PL', 'PT', 'RS', 'SE', 'SI', 'TR', 'UA',
]

# Human-readable name per country code.
COUNTRY_CODE_TO_NAME = {
    'AT': 'Austria',
    'BA': 'Bosnia',
    'BE': 'Belgium',
    'BG': 'Bulgaria',
    'CZ': 'the Czech Republic',
    'DK': 'Denmark',
    'EE': 'Estonia',
    'ES': 'Spain',
    'ES-CT': 'Spain - Catalonia',
    'ES-GA': 'Spain - Galicia',
    'ES-PV': 'Spain - Basque Country',
    'FI': 'Finland',
    'FR': 'France',
    'GB': 'the United Kingdom',
    'GR': 'Greece',
    'HR': 'Croatia',
    'HU': 'Hungary',
    'IS': 'Iceland',
    'IT': 'Italy',
    'LV': 'Latvia',
    'NL': 'the Netherlands',
    'NO': 'Norway',
    'PL': 'Poland',
    'PT': 'Portugal',
    'RS': 'Serbia',
    'SE': 'Sweden',
    'SI': 'Slovenia',
    'TR': 'Türkiye',
    'UA': 'Ukraine',
}

# Organisation reference that identifies the government of each country.
# NOTE(review): the entries from 'LV' onwards lack the leading '#' that the
# earlier entries carry. Code that compares these values to a node's `ref`
# with `==` can only match if the source data omits the '#' as well -
# confirm against the listOrg files.
COUNTRY_GOVERNMENTS = {
    'AT': '#GOV',
    'BA': '#government.BA',
    'BE': '#government.BE',
    'BG': '#gov.BGGov',
    'CZ': '#government.VladaCR',
    'DK': '#ParlaMint-DK-GOV',
    'EE': '#government.Valitsus',
    'ES': '#GOV',
    'ES-CT': '#GOV',
    'ES-GA': '#GOV',
    'ES-PV': '#government.PV',
    'FI': '#GOV',
    'FR': '#ParlaMint-FR-GOV',
    'GB': '#GoGB',
    'GR': '#GOV',
    'HR': '#government.HR',
    'HU': '#GOV',
    'IS': '#GOV_LV',
    'IT': '#GOV',
    'LV': 'GOV',
    'NL': 'GOV',
    'NO': 'government.NO',
    'PL': 'government.PL',
    'PT': 'Government',
    'RS': 'government.RS',
    'SE': 'GOV',
    'SI': 'parliament',
    'TR': 'GOV_TR',
    'UA': 'GOV.UA',
}

# Organisation references of the parliamentary bodies of each country.
COUNTRY_PARLIAMENTS = {
    'AT': ['#NR', '#BR'],
    'BA': ['#PS'],
    'BE': ['#be_federal_parliament'],
    'BG': ['#BGNS'],
    'CZ': ['#chamberOfNations.FSSN', '#chamberOfPeople.FSSL', '#nationalCouncil.CNR',
           '#parliament.PSP', '#senate.SE', '#senate.SE14'],
    'DK': ['#FT'],
    'EE': ['#ee_parliament'],
    'ES': ['#CD'],
    'ES-CT': ['#PC'],
    'ES-GA': ['#PG'],
    'ES-PV': ['#ES-PV'],
    'FI': ['#fi_parliament'],
    'FR': ['#ParlaMint-FR-LOWER'],
    'GB': ['#parliament.HC', '#parliament.HL'],
    'GR': ['#PoGR'],
    'HR': ['#HS'],
    'HU': ['#OGY'],
    'IS': ['#LV'],
    'IT': ['#LEG'],
    'LV': ['#PT'],
    'NL': ['#EK', '#TK'],
    'NO': ['#ST', '#LT', '#OT'],
    'PL': ['#parliament.Sejm', '#parliament.Senat'],
    'PT': ['#Parliament'],
    'RS': ['#NS'],
    'SE': ['#Riksdagen'],
    'SI': ['#DZ'],
    'TR': ['#TBMM'],
    'UA': ['#ВРУ'],
}

# Display name for every parliamentary body listed in COUNTRY_PARLIAMENTS.
PARLIAMENT_NAMES = {
    '#NR': 'Nationalrat (Austria)',
    '#BR': 'Bundesrat (Austria)',
    '#PS': 'Parliamentary Assembly of Bosnia and Herzegovina',
    '#be_federal_parliament': 'Federaal parlament (Belgium)',
    '#BGNS': 'Народно събрание (Bulgaria)',
    '#chamberOfNations.FSSN': 'Chamber of Nations (Czech Republic)',
    '#chamberOfPeople.FSSL': 'Chamber of People (Czech Republic)',
    '#nationalCouncil.CNR': 'National Council (Czech Republic)',
    '#parliament.PSP': 'Poslanecká sněmovna Parlamentu (Czech Republic)',
    '#senate.SE': 'Senát Parlamentu (Czech Republic)',
    '#senate.SE14': 'Senát Parlamentu (Czech Republic)',
    '#FT': 'Folketing (Denmark)',
    '#ee_parliament': 'Riigikogu (Estonia)',
    '#CD': 'Cortes Generales (Spain)',
    '#PC': 'Parlament de Catalunya',
    '#PG': 'Parlamento de Galicia',
    '#ES-PV': 'Eusko Legebiltzarra (Basque country)',
    '#fi_parliament': 'Suomen eduskunta (Finland)',
    '#ParlaMint-FR-LOWER': 'Assemblée nationale (France)',
    '#parliament.HC': 'House of Commons (UK)',
    '#parliament.HL': 'House of Lords (UK)',
    '#PoGR': 'Βουλή των Ελλήνων (Greece)',
    '#HS': 'Hrvatski sabor (Croatia)',
    '#OGY': 'Országgyűlés (Hungary)',
    '#LV': 'Alþingi (Iceland)',
    '#LEG': 'Parlamento italiano',
    '#PT': 'Saeima (Latvia)',
    '#EK': 'Eerste kamer (Netherlands)',
    '#TK': 'Tweede kamer (Netherlands)',
    '#ST': 'Storting (Norway)',
    '#LT': 'Lagting (Norway)',
    '#OT': 'Odelsting (Norway)',
    '#parliament.Sejm': 'Sejm (Poland)',
    '#parliament.Senat': 'Senat (Poland)',
    '#Parliament': 'Assembleia da República (Portugal)',
    '#NS': 'Narodna skupština (Serbia)',
    '#Riksdagen': 'Riksdagen (Sweden)',
    '#DZ': 'Državni zbor Republike Slovenije',
    '#TBMM': 'Türkiye Büyük Millet Meclisi',
    '#ВРУ': 'Верховна Рада України',
}

# First and last year covered per country; used for the corpus date bounds.
DATE_RANGES = {
    'AT': {'min_year': 1996, 'max_year': 2022},
    'BA': {'min_year': 1998, 'max_year': 2022},
    'BE': {'min_year': 2014, 'max_year': 2022},
    'BG': {'min_year': 2014, 'max_year': 2022},
    'CZ': {'min_year': 2013, 'max_year': 2022},
    'DK': {'min_year': 2014, 'max_year': 2022},
    'EE': {'min_year': 2011, 'max_year': 2022},
    'ES': {'min_year': 2014, 'max_year': 2022},
    'ES-CT': {'min_year': 2014, 'max_year': 2022},
    'ES-GA': {'min_year': 2015, 'max_year': 2022},
    'ES-PV': {'min_year': 2015, 'max_year': 2022},
    'FI': {'min_year': 2015, 'max_year': 2022},
    'FR': {'min_year': 2017, 'max_year': 2022},
    'GB': {'min_year': 2015, 'max_year': 2022},
    'GR': {'min_year': 2015, 'max_year': 2022},
    'HR': {'min_year': 2003, 'max_year': 2022},
    'HU': {'min_year': 2014, 'max_year': 2022},
    'IS': {'min_year': 2015, 'max_year': 2022},
    'IT': {'min_year': 2013, 'max_year': 2022},
    'LV': {'min_year': 2014, 'max_year': 2022},
    'NL': {'min_year': 2014, 'max_year': 2022},
    'NO': {'min_year': 1998, 'max_year': 2022},
    'PL': {'min_year': 2015, 'max_year': 2022},
    'PT': {'min_year': 2015, 'max_year': 2022},
    'RS': {'min_year': 1997, 'max_year': 2022},
    'SE': {'min_year': 1998, 'max_year': 2022},
    'SI': {'min_year': 2000, 'max_year': 2022},
    'TR': {'min_year': 2011, 'max_year': 2022},
    'UA': {'min_year': 2002, 'max_year': 2022},
}

# Language codes per country. Only the first three entries are filled in;
# the remaining countries are still empty lists.
LANGUAGES = {
    'AT': ['de', 'en'],
    'BA': ['bs', 'en'],
    'BE': ['nl', 'fr', 'en'],
    'BG': [],
    'CZ': [],
    'DK': [],
    'EE': [],
    'ES': [],
    'ES-CT': [],
    'ES-GA': [],
    'ES-PV': [],
    'FI': [],
    'FR': [],
    'GB': [],
    'GR': [],
    'HR': [],
    'HU': [],
    'IS': [],
    'IT': [],
    'LV': [],
    'NL': [],
    'NO': [],
    'PL': [],
    'PT': [],
    'RS': [],
    'SE': [],
    'SI': [],
    'TR': [],
    'UA': [],
}

# Display label per political-orientation code.
POLITICAL_ORIENTATIONS = {
    'L': 'Left',
    'C': 'Centre',
    'R': 'Right',
    'FL': 'Far Left',
    'FR': 'Far Right',
    'CL': 'Centre Left',
    'CR': 'Centre Right',
    'CCL': 'Centre to Centre Left',
    'CCR': 'Centre to Centre Right',
    'CLL': 'Centre Left to Left',
    'CRR': 'Centre Right to Right',
    'LLF': 'Left to Far Left',
    'RRF': 'Right to Far Right',
    'BT': 'Big Tent',
    'NP': 'Nonpartisan',
    'PP': 'Pirate Party',
    'SI': 'Single Issue',
    'SY': 'Syncretic',
    'NA': 'Unknown',
}


# ---------------------------------------------------------------------------
# Metadata extraction (parlamint_extract)
# ---------------------------------------------------------------------------

def _child_text(parent, tag_name):
    """Return the stripped text of the first `tag_name` node under `parent`, or None."""
    child = parent.find(tag_name) if parent is not None else None
    return child.text.strip() if child is not None else None


def extract_person_data(node):
    """
    Collect the metadata of one <person> node into a plain dictionary.

    Parameters:
        node: a parsed <person> element carrying an `xml:id` attribute.

    Returns:
        a dict with the keys `id` ('#' + xml:id), `name`, `role`, `gender`,
        `org_ids`, `org_nodes`, `birth_year`, `birthplace`, `wikimedia` and
        `twitter`. Values that are absent in the node are None, except
        `name`, which is '' when neither forename nor surname is present.
    """
    person_id = '#' + node['xml:id']

    pers_name = node.persName
    forename = _child_text(pers_name, 'forename')
    surname = _child_text(pers_name, 'surname')
    # only join the parts that exist, so a missing forename or surname
    # does not leave a stray space in the name
    name = ' '.join(part for part in (forename, surname) if part)
    # too simple, needs to be able to have different values and be gotten from the org_node
    role = _child_text(pers_name, 'roleName')

    sex_node = node.sex
    gender = sex_node.get('value') if sex_node is not None else None
    gender = gender.strip() if gender else None

    # Affiliation nodes that point to an organisation. The misspelt tag name
    # 'affliation' is accepted too, presumably because it occurs in some
    # source files.
    is_org_node = lambda child: child.name in ['affliation', 'affiliation'] and child.has_attr('ref')
    org_nodes = node.find_all(is_org_node)
    org_ids = [org_node['ref'] for org_node in org_nodes]

    birth_node = node.birth
    birth_date = birth_node.get('when') if birth_node is not None else None
    has_year = bool(birth_date) and birth_date[:4].isdigit()
    birth_year = int(birth_date[:4]) if has_year else None
    birthplace = _child_text(birth_node, 'placeName')

    wikimedia_uri_node = node.find('idno', type='URI', subtype='wikimedia')
    wikimedia_uri = wikimedia_uri_node.text if wikimedia_uri_node else None

    twitter_uri_node = node.find('idno', type='URI', subtype='twitter')
    twitter_uri = twitter_uri_node.text if twitter_uri_node else None

    return {
        'id': person_id,
        'name': name,
        'role': role,
        'gender': gender,
        'org_ids': org_ids,
        'org_nodes': org_nodes,
        'birth_year': birth_year,
        'birthplace': birthplace,
        'wikimedia': wikimedia_uri,
        'twitter': twitter_uri,
    }


def extract_people_data(soup):
    '''
    Takes a soup and searches it for person nodes and adds all metadata
    to a dictionary.

    Returns: a dictionary with a metadata dict for each person ID.
    '''
    person_data = map(extract_person_data, soup.find_all('person'))
    return {person['id']: person for person in person_data}


def get_persons_metadata(directory, country_code):
    """Parse `ParlaMint-<country_code>-listPerson.xml` in `directory` into a dict keyed by person ID."""
    path = os.path.join(directory, 'ParlaMint-{}-listPerson.xml'.format(country_code))
    with open(path, 'rb') as f:
        soup = BeautifulSoup(f.read(), 'xml')
    return extract_people_data(soup)


def extract_org_data(node):
    """
    Collect the metadata of one <org> node into a plain dictionary.

    Returns:
        a dict with the keys `name` (abbreviation, else full name, else the
        id), `full_name`, `org_role` (the node's `role` attribute, None when
        absent), `id`, `wikimedia` and `political_orientation` (the `ana`
        attribute of the nested <state>, None when absent).
    """
    org_id = node['xml:id']

    full_name_node = node.find('orgName', full='yes')
    full_name = full_name_node.text if full_name_node else None

    abbreviation_node = node.find('orgName', full='abb')
    name = abbreviation_node.text if abbreviation_node else full_name or org_id

    wikimedia_uri_node = node.find('idno', type='URI', subtype='wikimedia')
    wikimedia_uri = wikimedia_uri_node.text if wikimedia_uri_node else None

    # the orientation code sits on a <state> nested inside the
    # politicalOrientation <state>; either level may be missing
    orientation_node = node.find('state', type='politicalOrientation')
    inner_state = orientation_node.find('state') if orientation_node else None
    political_orientation = inner_state.get('ana') if inner_state else None

    return {
        'name': name,
        'full_name': full_name,
        'org_role': node.get('role'),
        'id': org_id,
        'wikimedia': wikimedia_uri,
        'political_orientation': political_orientation,
    }


def extract_all_org_data(soup):
    """
    Extract every <org> under the soup's <listOrg>.

    Returns: a dict mapping '#' + org id to the org's metadata dict; empty
    when the soup has no <listOrg>.
    """
    orgs_list = soup.find('listOrg')
    if orgs_list is None:
        return {}
    org_data = map(extract_org_data, orgs_list.find_all('org'))
    return {'#' + org['id']: org for org in org_data}


def get_orgs_metadata(directory, country_code):
    """Parse `ParlaMint-<country_code>-listOrg.xml` in `directory` into a dict keyed by '#' + org id."""
    path = os.path.join(directory, 'ParlaMint-{}-listOrg.xml'.format(country_code))
    with open(path, 'rb') as f:
        soup = BeautifulSoup(f.read(), 'xml')
    return extract_all_org_data(soup)


def get_party_list(org_metadata: dict) -> list:
    '''runs through organisational metadata to find the political parties'''
    party_roles = ('parliamentaryGroup', 'politicalParty')
    return [
        org_id for org_id, org in org_metadata.items()
        if org.get('org_role') in party_roles
    ]


def extract_named_entities(xml_file: str) -> dict:
    '''Extract the named entities from the xml file, and save them, ordered by speech id,
    in a dictionary, which will be used to populate the NER keyword fields.

    Each speech maps to a dict with the keys LOC, MISC, ORG and PER.
    Annotations with another type, or without a type, are skipped.'''
    # read as bytes, like the other loaders in this module, so the parser
    # handles the encoding instead of the platform default
    with open(xml_file, 'rb') as f:
        soup = BeautifulSoup(f, 'xml')
    output = dict()
    for speech in soup.find_all("u"):
        annotations_dict = {"LOC": list(), "MISC": list(), "ORG": list(), "PER": list()}
        for annotation in speech.find_all("name"):
            entity_type = annotation.get("type")
            if entity_type not in annotations_dict:
                continue
            annotated = " ".join([word.string for word in annotation.find_all("w") if word.string])
            annotations_dict[entity_type].append(annotated)
        output[speech["xml:id"]] = annotations_dict
    return output


def person_attribute_extractor(attribute, id_attribute = 'who'):
    """Extractor that finds the speaker ID and returns one of the person's
    attributes defined in extract_person_data()"""
    return Combined(
        XML(attribute=id_attribute),
        Metadata('persons'),
        transform = metadata_attribute_transform_func(attribute),
    )


def current_party_id_extractor():
    """Extractor that combines the speaker ID with the `persons`,
    `party_list` and `date` metadata and hands them to
    transform_current_party_id, which resolves the party."""
    return Combined(
        person_attribute_extractor('id'),
        Metadata('persons'),
        Metadata('party_list'),
        Metadata('date'),
        transform=transform_current_party_id
    )


def party_attribute_extractor(attribute):
    """Extractor that finds the speaker's party and returns one of the
    party's attributes defined in extract_org_data()"""
    return Combined(
        current_party_id_extractor(),
        Metadata('organisations'),
        transform = metadata_attribute_transform_func(attribute),
    )


# tokens that attach to the preceding word / to the following word
_CLOSING_PUNCTUATION = frozenset([',', '.', '!', '?', ';', ':', ')', ']', '}'])
_OPENING_PUNCTUATION = frozenset(['(', '[', '{'])


def extract_speech(element):
    """
    Extracts all string values from the given BeautifulSoup element (in this case
    one sentence from a speech) and joins them into a single string.
    Preserves spaces between words but not between words and punctuation.

    Args:
        element (bs4.PageElement): The BeautifulSoup element to process.

    Returns:
        str: A single string containing a sentence, without a trailing space.
    """
    parts = []
    for string in element.stripped_strings:
        is_hyphen = string == '-'  # hyphenated-words
        # contractions are stored as "'s", "'m"
        attaches_left = string in _CLOSING_PUNCTUATION or is_hyphen or string.startswith("'")
        if attaches_left and parts and parts[-1].endswith(' '):
            # drop the space that was added after the previous token
            parts[-1] = parts[-1][:-1]
        # hyphens and opening brackets attach to the token that follows
        attaches_right = is_hyphen or string in _OPENING_PUNCTUATION
        parts.append(string if attaches_right else string + ' ')
    sentence = ''.join(parts)
    # also trims one-token sentences, which previously kept their trailing space
    return sentence[:-1] if sentence.endswith(' ') else sentence
def transform_xml_filename(filepath, country_extension):
    '''Return the file name (without directory) of the machine-translated
    counterpart of an original-version xml file: every occurrence of
    "ParlaMint-<country_extension>" in the base name gets "-en" appended.'''
    filename = os.path.basename(filepath)
    return filename.replace(
        f"ParlaMint-{country_extension}", f"ParlaMint-{country_extension}-en"
    )


def metadata_attribute_transform_func(attribute):
    """
    Creates a transformation function that extracts and cleans a specific
    attribute from a collection.

    The returned function takes a `(key, collection)` pair and returns
    `clean_value(collection[key][attribute])`, or None when the key or the
    collection is empty or the key is not in the collection.
    """
    def get_attribute(which, collection):
        if which and collection and which in collection:
            return clean_value(collection[which][attribute])
        return None

    return lambda values: get_attribute(*values)


def clean_value(value):
    """
    Strip surrounding whitespace from string values; return anything else as is.

    NOTE(review): a string that is empty after stripping is returned
    unchanged (whitespace included), as before - confirm whether None was
    intended for that case.
    """
    # isinstance also covers str subclasses such as bs4's NavigableString
    if isinstance(value, str):
        stripped = value.strip()
        if stripped:
            return stripped
    return value


def transform_political_orientation(full_string):
    """Map a reference like '#orientation.CL' to its display label.

    Returns None when the string is empty, has no '#orientation.' marker,
    or carries a code that is not in POLITICAL_ORIENTATIONS."""
    if full_string and '#orientation.' in full_string:
        return POLITICAL_ORIENTATIONS.get(full_string.split('.')[1])
    return None


def node_is_current(node, date):
    """Checks if the node is current at the given date
    i.e. if the date is between the from and to dates of the node.

    A missing `from` or `to` leaves that side open-ended. A node without
    either bound is not considered current. Dates are compared as strings,
    so node and `date` must use the same (ISO) format.
    """
    if not node or not date:
        return False
    start_date = node.get('from', None)
    end_date = node.get('to', None)
    if not start_date and not end_date:
        return False
    # Both bounds must hold; previously a passed `to` date was ignored as
    # soon as `from` was satisfied (and vice versa).
    if start_date and date < start_date:
        return False
    if end_date and date > end_date:
        return False
    return True


def transform_parliamentary_role(data):
    """
    Decide whether a speaker is a member of parliament at a given date.

    Data contain:
    - org_nodes: list of affiliation nodes of the speaker
    - date: string of date (YYYY-MM-DD)
    - country: string of country code (e.g. 'NL')

    Returns 'MP' when any node is a current 'member' affiliation with one of
    the country's parliaments, 'non-MP' otherwise, and None when the date,
    the country or the node list is missing.
    """
    org_nodes, date, country = data
    if not date or not country or org_nodes is None:
        return None
    parliaments = COUNTRY_PARLIAMENTS.get(country, [])
    is_mp = any(
        node['ref'] in parliaments
        and node.get('role') == 'member'
        and node_is_current(node, date)
        for node in org_nodes
    )
    return 'MP' if is_mp else 'non-MP'


def transform_ministerial_role(data):
    """
    Look up the English name of the speaker's current ministerial role.

    Data contain the same (org_nodes, date, country) triple as
    transform_parliamentary_role. Returns the text of the first English
    <roleName> child of a current 'minister' affiliation whose `ref`
    contains the country's government reference; None when there is none.
    """
    org_nodes, date, country = data
    if not org_nodes:
        return None
    government = COUNTRY_GOVERNMENTS.get(country)
    if not government:
        return None
    for node in org_nodes:
        if government in node['ref'] and node.get('role') == 'minister' and node_is_current(node, date):
            for child_node in node.children:
                if child_node.name == 'roleName' and child_node.get('xml:lang') == 'en':
                    return child_node.text.strip()
    return None


def transform_government(data):
    '''
    Checks a person's org_nodes for membership to a government

    Data contain:
    - org_nodes: list of BS4 nodes
    - date: string of date (YYYY-MM-DD)
    - country: string of country code (e.g. 'NL')

    Returns 'Government' when any node refers to the country's government
    and is current at the date, 'Non-government' otherwise.
    '''
    org_nodes, date, country = data
    if not org_nodes:
        return 'Non-government'
    government = COUNTRY_GOVERNMENTS.get(country)
    # every node is inspected before concluding 'Non-government'
    in_government = government is not None and any(
        node['ref'] == government and node_is_current(node, date)
        for node in org_nodes
    )
    return 'Government' if in_government else 'Non-government'


def transform_current_party_id(data):
    '''
    looks up party affiliation for a person at a given date.
    The input data contain:
    - id (string)
    - persons (dictionary: {id (string): person_metadata (dict)})
    - party_list [str], list of party references
    - date (string YYYY-MM-DD)

    returns:
        str: a single party reference. When several affiliations are current,
        the last one listed wins; when none is current, the last party
        affiliation of the person is used. 'NA' is returned when any input
        is missing, the person is unknown, or the person has no party
        affiliation.
    '''
    person_id, persons, party_list, date = data
    if not person_id or not persons or not party_list or not date:
        return 'NA'
    person = persons.get(person_id)
    if not person:
        return 'NA'

    parties = set(party_list)
    party_nodes = [
        node for node in person['org_nodes']
        if node and node['ref'] in parties
    ]
    if not party_nodes:
        return 'NA'

    current_parties = [
        node['ref'] for node in party_nodes if node_is_current(node, date)
    ]
    if current_parties:
        # the last org in the list is usually the most recent one
        return current_parties[-1]
    return party_nodes[-1].get('ref', 'NA')
diff --git a/backend/corpora/parliament/ireland.py b/backend/corpora/parliament/ireland.py index ec8a37517..991769144 100644 --- a/backend/corpora/parliament/ireland.py +++ b/backend/corpora/parliament/ireland.py @@ -15,7 +15,8 @@ import corpora.parliament.utils.field_defaults as field_defaults import corpora.utils.formatting as formatting from corpora.utils.filter_sources import in_date_range -import corpora.parliament.utils.parlamint as parlamint +from corpora.parliament.clarin_parlamint.parlamint_utils.parlamint_transform import metadata_attribute_transform_func +from corpora.parliament.clarin_parlamint.parlamint_utils.parlamint_extract import person_attribute_extractor def format_mininster_role(position, department): @@ -297,7 +298,7 @@ def role_extractor(role_type): Metadata('roles'), transform = compose( role_type_filter(role_type), - parlamint.metadata_attribute_transform_func('name') + metadata_attribute_transform_func('name') ), ) @@ -371,7 +372,7 @@ def sources(self, start, end): party_id = field_defaults.party_id() speaker = field_defaults.speaker() - speaker.extractor = parlamint.person_attribute_extractor( + speaker.extractor = person_attribute_extractor( 'name', id_attribute = 'by' ) diff --git a/backend/corpora/parliament/netherlands.py b/backend/corpora/parliament/netherlands.py index 788ad6fc2..abf5b7943 100644 --- a/backend/corpora/parliament/netherlands.py +++ b/backend/corpora/parliament/netherlands.py @@ -9,9 +9,6 @@ from addcorpus.python_corpora.corpus import XMLCorpusDefinition from ianalyzer_readers.extract import XML, Constant, Combined, Order -from corpora.parliament.utils.parlamint import ( - party_attribute_extractor, -) from corpora.parliament.utils.parlamint_v4 import ( current_party_id_extractor, extract_named_entities, diff --git a/backend/corpora/parliament/parliament.py b/backend/corpora/parliament/parliament.py index 1612270b3..811fa35a2 100644 --- a/backend/corpora/parliament/parliament.py +++ 
b/backend/corpora/parliament/parliament.py @@ -39,7 +39,7 @@ class Parliament(CorpusDefinition): @property def es_settings(self): - return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True) + return es_settings(self.languages, stopword_analysis=True, stemming_analysis=True) diff --git a/backend/corpora/parliament/tests/test_es_settings.py b/backend/corpora/parliament/tests/test_es_settings.py index f554fae15..f25965a94 100644 --- a/backend/corpora/parliament/tests/test_es_settings.py +++ b/backend/corpora/parliament/tests/test_es_settings.py @@ -3,7 +3,7 @@ import os import shutil -def test_stopwords(clean_nltk_data_directory, settings, connected_to_internet): +def test_stopwords(clean_nltk_data_directory, settings): """ Check that stopwords results are valid and all languages are included """ @@ -44,7 +44,7 @@ def test_stopwords(clean_nltk_data_directory, settings, connected_to_internet): ] for case in cases: - stopwords = es_settings.get_nltk_stopwords(case['language']) + stopwords = es_settings.get_stopwords(case['language']) for word in case['stopwords']: assert word in stopwords diff --git a/backend/corpora/parliament/utils/field_defaults.py b/backend/corpora/parliament/utils/field_defaults.py index 7514ee0ef..979dcbcbc 100644 --- a/backend/corpora/parliament/utils/field_defaults.py +++ b/backend/corpora/parliament/utils/field_defaults.py @@ -67,6 +67,7 @@ def country(): name='country', display_name='Country', description='Country in which the debate took place', + searchable=False, es_mapping=keyword_mapping(), ) @@ -77,6 +78,7 @@ def date(min_date: datetime = MIN_DATE, max_date: datetime = MAX_DATE): name="date", display_name="Date", description="The date on which the debate took place.", + searchable=False, es_mapping=date_mapping(), results_overview=True, search_filter=DateFilter( @@ -186,6 +188,7 @@ def debate_id(): name='debate_id', display_name='Debate ID', description='Unique identifier of the debate in which the speech was 
held', + searchable=False, es_mapping=keyword_mapping(), ) @@ -284,12 +287,37 @@ def speech(language=None): language=language, ) +def speech_translated(): + """ + speech is a multifield with subfields clean (lowercase, stopwords, no numbers) and stemmed (as clean, but also stemmed) + stopword and stemmer filter need to be defined for each language + """ + return FieldDefinition( + name='speech_translated', + display_name='Speech (machine-translated)', + description='The transcribed speech, machine-translated to English', + # each index has its own definition of the 'clean' and 'stemmed' analyzer, based on language + es_mapping = main_content_mapping( + token_counts=False, + stopword_analysis=True, + stemming_analysis=True, + language='en', + ), + results_overview=False, + search_field_core=True, + display_type='text_content', + visualizations=[], + csv_core=True, + language='en', + ) + def speech_id(): "unique (corpus-level) ID for the speech" return FieldDefinition( name='id', display_name='Speech ID', description='Unique identifier of the speech', + searchable=False, es_mapping=keyword_mapping(), csv_core=True, ) @@ -313,6 +341,7 @@ def speech_type(): name='speech_type', display_name='Speech type', description='The type of speech', + searchable=False, es_mapping=keyword_mapping(), ) @@ -322,6 +351,7 @@ def speaker_id(): name='speaker_id', display_name='Speaker ID', description='Unique identifier of the speaker', + searchable=False, es_mapping=keyword_mapping(), ) @@ -331,6 +361,7 @@ def speaker_constituency(): name='speaker_constituency', display_name='Speaker constituency', description='Constituency represented by the speaker', + searchable=False, es_mapping=keyword_mapping(), visualizations=['resultscount', 'termfrequency'] ) @@ -342,6 +373,7 @@ def speaker_birthplace(): display_name='Speaker place of birth', description='Birthplace of the speaker', es_mapping=keyword_mapping(), + searchable=False, ) def speaker_birth_country(): @@ -350,6 +382,7 @@ def 
speaker_birth_country(): name='speaker_birth_country', display_name='Speaker country of birth', description='Country in which the speaker was born', + searchable=False, es_mapping=keyword_mapping(), ) @@ -377,6 +410,7 @@ def speaker_gender(): name='speaker_gender', display_name='Speaker gender', description='Gender of the speaker', + searchable=False, es_mapping=keyword_mapping(), visualizations=['resultscount', 'termfrequency'], ) @@ -387,6 +421,7 @@ def speaker_profession(): name='speaker_profession', display_name='Speaker profession', description='Profession of the speaker', + searchable=False, es_mapping=keyword_mapping(), ) @@ -396,6 +431,7 @@ def speaker_aristocracy(): name='speaker_aristocracy', display_name='Speaker aristocracy', description='Aristocratic title of the speaker', + searchable=False, es_mapping=keyword_mapping(), ) @@ -405,6 +441,7 @@ def speaker_academic_title(): name='speaker_academic_title', display_name='Speaker academic title', description='Academic title of the speaker', + searchable=False, es_mapping=keyword_mapping(), ) @@ -415,6 +452,7 @@ def parliamentary_role(): name='role', display_name='Parliamentary role', description='Role of the speaker in parliament', + searchable=False, es_mapping=keyword_mapping(), search_filter=MultipleChoiceFilter( description='Search for speeches by speakers with the selected roles', @@ -428,6 +466,7 @@ def ministerial_role(): name='ministerial_role', display_name='Ministerial role', description='Ministerial role of the speaker', + searchable=False, es_mapping=keyword_mapping(), search_filter=MultipleChoiceFilter( description='Search for speeches by speakers with the selected ministerial roles', @@ -450,6 +489,7 @@ def party(): name='party', display_name='Party', description='Political party that the speaker belongs to', + searchable=False, es_mapping=keyword_mapping(), search_filter= MultipleChoiceFilter( description='Search in speeches from the selected parties', @@ -464,6 +504,7 @@ def party_id(): 
name='party_id', display_name='Party ID', description='Unique identifier of the political party the speaker belongs to', + searchable=False, es_mapping=keyword_mapping(), ) @@ -473,6 +514,7 @@ def party_full(): name='party_full', display_name='Party (full name)', description='Full name of the political party that the speaker belongs to', + searchable=False, es_mapping=text_mapping(), ) @@ -484,6 +526,7 @@ def party_role(): name='party_role', display_name='Party role', description='Role of the speaker\'s political party in parliament at the time of speaking', + searchable=False, es_mapping=keyword_mapping(), search_filter= MultipleChoiceFilter( description='Search in speeches from the selected parties', @@ -508,7 +551,8 @@ def page_source(): name='page_source', display_name='Source page number', description='Page number in source document', - es_mapping=keyword_mapping() + es_mapping=keyword_mapping(), + searchable=False, ) diff --git a/backend/corpora/parliament/utils/parlamint.py b/backend/corpora/parliament/utils/parlamint.py index 87fa7ba81..53a3ec24f 100644 --- a/backend/corpora/parliament/utils/parlamint.py +++ b/backend/corpora/parliament/utils/parlamint.py @@ -1,5 +1,20 @@ from ianalyzer_readers.extract import XML, Combined, Metadata from bs4.element import NavigableString +from bs4.element import NavigableString, Tag as Node +from string import punctuation +from typing import Iterable + +from addcorpus.es_mappings import non_indexed_text_mapping, keyword_mapping +from ianalyzer_readers.xml_tag import Tag +from addcorpus.python_corpora.filters import MultipleChoiceFilter +from addcorpus.python_corpora.corpus import FieldDefinition + +""" +This file remains in place, but it is outdated. More recent parlamint logic can be found in +backend/corpora/parliament/clarin_parlamint/parlamint_utils. This file is kept for backwards +compatibility for the Finland corpus. 
+""" + def clean_value(value): if type(value) == str or type(value) == NavigableString: @@ -108,3 +123,110 @@ def party_attribute_extractor(attribute): Metadata('parties'), transform = metadata_attribute_transform_func(attribute), ) + + +def get_entity_shorthand(entity: str): + if entity == "location": + return "LOC" + elif entity == "miscellaneous": + return "MISC" + elif entity == "organization": + return "ORG" + else: + return "PER" + + +def get_entity_list(extracted_data: tuple[str, dict], entity: str) -> list[str]: + '''collect all named entities for the processed speech and this category + + Parameters: + extracted_data: tuple of the speech id and the metadata dictionary + entity: string of the entity class (location / miscellaneous / organization / person) + + ''' + speech_id, metadata = extracted_data + shorthand = get_entity_shorthand(entity) + return list(set(metadata.get(speech_id).get(shorthand))) + + +def ner_keyword_field(entity: str): + return FieldDefinition( + name=f"{entity}:ner-kw", + display_name=f"Named Entity: {entity.capitalize()}", + searchable=False, + es_mapping=keyword_mapping(enable_full_text_search=False), + search_filter=MultipleChoiceFilter( + description=f"Select only speeches which contain this {entity} entity", + option_count=100, + ), + extractor=Combined( + XML(attribute="xml:id"), + Metadata("ner"), + transform=lambda x: get_entity_list(x, entity), + ), + ) + + +def detokenize_parlamint(tokens: Iterable[str]) -> str: + """Detokenize the content of `w` and `pc` tags in the ParlaMint XML + The `join="right"` attribute indicates that there should not be whitespace after the word + """ + output = "" + for token in tokens: + if token.get("join") != "right": + output += f"{token.string} " + else: + output += token.string if token.string else '' + # do not include the last character (always whitespace) in the output + return output[:-1] + + +def format_annotated_segment(element: Node) -> str: + """For each `seg` tag, extract the annotations indicated 
by `name` tags """ + annotations = element.find_all("name") + formatted_annotations = [format_annotated_text(anno) for anno in annotations] + return "".join(formatted_annotations) + + +def format_annotated_text(element: Node) -> str: + """For each `name` tag, format the annotation in Elasticsearch's annotated_text format, + and embed it in the text extracted from adjoining `w` and `pc` tags + """ + output = "" + tokens = [el.extract() for el in element.find_previous_siblings(["w", "pc"])] + output += detokenize_parlamint(reversed(tokens)) + annotated = element.find_all("w") + formatted = " ".join([a.string for a in annotated if a.string]) + if output: + # if there is preceding text, add whitespace prior to annotation + output += " " + output += f"[{formatted}]({element['type']})" + if not element.find_next_sibling("name"): + # after last annotation, add remaining text + remaining_text = detokenize_parlamint(element.find_next_siblings(["w", "pc"])) + if remaining_text and remaining_text[0] not in punctuation: + # remaining text does not start with punctuation: add whitespace + output += " " + output += remaining_text + return output + + +def speech_ner(): + return FieldDefinition( + name="speech:ner", + hidden=True, + es_mapping=non_indexed_text_mapping(), + display_type="text_content", + searchable=False, + extractor=XML( + Tag("seg"), + multiple=True, + extract_soup_func=format_annotated_segment, + transform=lambda x: "\n".join(x), + ), + ) + + +def extract_speech(segment: Node) -> str: + text_nodes = segment.find_all(["w", "pc"]) + return detokenize_parlamint(text_nodes) diff --git a/backend/corpora/parliament/utils/parlamint_v4.py b/backend/corpora/parliament/utils/parlamint_v4.py index 66e7988bb..ac95e60b1 100644 --- a/backend/corpora/parliament/utils/parlamint_v4.py +++ b/backend/corpora/parliament/utils/parlamint_v4.py @@ -1,3 +1,5 @@ +from ianalyzer_readers.extract import XML, Combined, Metadata +from bs4.element import NavigableString from glob import glob from string import 
punctuation from typing import Iterable @@ -10,15 +12,14 @@ from addcorpus.es_mappings import non_indexed_text_mapping, keyword_mapping from addcorpus.python_corpora.corpus import FieldDefinition from addcorpus.python_corpora.filters import MultipleChoiceFilter -from corpora.parliament.utils.parlamint import ( - metadata_attribute_transform_func, - person_attribute_extractor, -) +from corpora.parliament.clarin_parlamint.parlamint_utils.parlamint_transform import metadata_attribute_transform_func +from corpora.parliament.clarin_parlamint.parlamint_utils.parlamint_extract import person_attribute_extractor """ This file was created as an updated utils file for the ParlaMint dataset, version 4.0. The previous utils file -is based on version 2.0. +is based on version 2.0. A more recent version of the logic in this file can be found in +backend/corpora/parliament/clarin_parlamint/. This file is only kept for its use in People & Parliament. """ POLITICAL_ORIENTATIONS = { @@ -144,6 +145,37 @@ def extract_people_data(soup): person['id']: person for person in person_data } +def extract_role_data(soup): + role_nodes = soup.find('encodingDesc').find_all('category') + # return dict that maps IDs to terms. The data contains duplicate role IDs: + # go through data in reverse order so earlier (more general) terms + # overwrite later (more specific) ones + return { + node['xml:id']: node.find('term').text.strip() + for node in reversed(role_nodes) + } + +def metadata_attribute_transform_func(attribute): + """ + Creates a transformation function that extracts and cleans a specific + attribute from a collection. 
+ """ + def get_attribute(which, collection): + if which and collection and which in collection: + value = collection[which][attribute] + return clean_value(value) + + return lambda values: get_attribute(*values) + +def person_attribute_extractor(attribute, id_attribute = 'who'): + """Extractor that finds the speaker ID and returns one of the person's + attributes defined in extract_person_data()""" + return Combined( + XML(attribute=id_attribute), + Metadata('persons'), + transform = metadata_attribute_transform_func(attribute), + ) + def current_party_id_extractor(): """Extractor that finds the current party, given a date if no date is given, it return the last party in the node""" @@ -202,8 +234,8 @@ def ner_keyword_field(entity: str): return FieldDefinition( name=f"{entity}:ner-kw", display_name=f"Named Entity: {entity.capitalize()}", - searchable=True, - es_mapping=keyword_mapping(enable_full_text_search=True), + searchable=False, + es_mapping=keyword_mapping(enable_full_text_search=False), search_filter=MultipleChoiceFilter( description=f"Select only speeches which contain this {entity} entity", option_count=100, @@ -266,7 +298,7 @@ def speech_ner(): hidden=True, es_mapping=non_indexed_text_mapping(), display_type="text_content", - searchable=True, + searchable=False, extractor=XML( Tag("seg"), multiple=True, @@ -320,3 +352,28 @@ def extract_named_entities(xml_file: str) -> dict: annotations_dict[annotation["type"]].append(annotated) output[speech["xml:id"]] = annotations_dict return output + +def extract_party_data(node): + id = node['xml:id'] + + full_name_node = node.find('orgName', full='yes') + full_name = full_name_node.text if full_name_node else None + + abbreviation_node = node.find('orgName', full='init') + name = abbreviation_node.text if abbreviation_node else full_name or id + + return { + 'name': name, + 'full_name': full_name, + 'role': node['role'], + 'id': id + } + +def extract_all_party_data(soup): + parties_list = soup.find('listOrg') + 
party_data = map(extract_party_data, parties_list.find_all('org')) + make_id = lambda name: '#party.' + name if not name.startswith('party.') else '#' + name + + return { + make_id(party['id']): party for party in party_data + } \ No newline at end of file diff --git a/backend/ianalyzer/common_settings.py b/backend/ianalyzer/common_settings.py index f5ad6eaba..ed9390c16 100644 --- a/backend/ianalyzer/common_settings.py +++ b/backend/ianalyzer/common_settings.py @@ -136,7 +136,7 @@ LOGO_LINK = 'https://dhstatic.hum.uu.nl/logo-cdh/png/UU_CDH_logo_EN_whiteFC.png' -NLTK_DATA_PATH = os.path.join(BASE_DIR, 'addcorpus', 'nltk_data') +NLTK_DATA_PATH = os.path.join(BASE_DIR, 'addcorpus', 'stopword_data', 'nltk_data') DEFAULT_CORPUS_IMAGE = os.path.join(BASE_DIR, 'addcorpus', 'images', 'default.png') diff --git a/backend/visualization/wordcloud.py b/backend/visualization/wordcloud.py index b467d4e14..40b179edd 100644 --- a/backend/visualization/wordcloud.py +++ b/backend/visualization/wordcloud.py @@ -2,14 +2,14 @@ from sklearn.feature_extraction.text import CountVectorizer from addcorpus.models import Corpus -from addcorpus.es_settings import get_nltk_stopwords +from addcorpus.es_settings import get_stopwords from es import download as download def field_stopwords(corpus_name, field_name): corpus = Corpus.objects.get(name=corpus_name) field = corpus.configuration.fields.get(name=field_name) if field.language and field.language != 'dynamic': - return get_nltk_stopwords(field.language) + return get_stopwords(field.language) else: return [] diff --git a/documentation/Corpus-definitions.md b/documentation/Corpus-definitions.md index e104bfcf1..371533cf9 100644 --- a/documentation/Corpus-definitions.md +++ b/documentation/Corpus-definitions.md @@ -2,7 +2,7 @@ Corpus definitions are the way that we configure each corpus in Textcavator. -This documents gives a basic explanation of how corpus definitions work in the backend. It introduces the core concepts and mechanics. 
+This document gives a basic explanation of how corpus definitions work in the backend. It introduces the core concepts and mechanics. ## Corpus definitions diff --git a/documentation/Writing-a-corpus-definition-in-Python.md b/documentation/Writing-a-corpus-definition-in-Python.md index d711959a1..019e31ff6 100644 --- a/documentation/Writing-a-corpus-definition-in-Python.md +++ b/documentation/Writing-a-corpus-definition-in-Python.md @@ -105,7 +105,7 @@ Documentation pages must be markdown files. See [corpus documentation](/document ## Definining fields -The `fields` property lists the configuration for each field in the corpus. Each of these defines how that field should be extracted from the source file, how it should be stored in elasticsearch, and how it should appear in the interface. See [corpus.py](../backend/addcorpus/corpus.py) for the class definition. +The `fields` property lists the configuration for each field in the corpus. Each of these defines how that field should be extracted from the source file, how it should be stored in elasticsearch, and how it should appear in the interface. See [corpus.py](../backend/addcorpus/python_corpora/corpus.py) for the class definition. Note that unlike with `CorpusDefinition`, fields are not defined as _classes_ but as _objects_. Rather than creating a custom subclass of `FieldDefinition`, you can just call the `FieldDefinition()` constructor with appropriate parameters.