From f0d0fad5a9c72d6b9c94ccf368bb735d476cb8f3 Mon Sep 17 00:00:00 2001 From: Vishal Venkat Raghavan Date: Thu, 8 Feb 2024 14:00:25 +0100 Subject: [PATCH 01/78] Citation extractions by Vishal for code review --- cellar/cellar_extractor/citations.py | 39 ++++++++++++++++++++++++++++ cellar/cellar_extractor/para.py | 39 ++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 cellar/cellar_extractor/citations.py create mode 100644 cellar/cellar_extractor/para.py diff --git a/cellar/cellar_extractor/citations.py b/cellar/cellar_extractor/citations.py new file mode 100644 index 0000000..bfe43b1 --- /dev/null +++ b/cellar/cellar_extractor/citations.py @@ -0,0 +1,39 @@ +import requests +from bs4 import BeautifulSoup + +def get_citations_from_celex_id(celex)->list:#Get citations(Celex ID) from a website by providing celex ID in the function upon calling and reutrns a list of citations if exists else it returns an empty list + website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:{celex}").text + parser=BeautifulSoup(website,'lxml') + div=parser.find_all('div',class_="panel-body") + citations=[] + for divs in div: + if divs!=None: + dl=divs.find('dl',class_="NMetadata") + if dl!=None: + dt=dl.find_all('dt') + for dls in dl: + if "cited" in dls.text.lower(): + + + temp=dls.find_all_next('dd') + for dd in temp: + if dd!=None: + li=dd.find_all('li') + for mentions in li: + if mentions!=None: + a=mentions.find('a') + if a!=None: + + citations.append(a.text) + # print(a.text) + # print(citations) + filtered=[] + for splits in citations: + if len(splits.split(" "))<2: + filtered.append(splits) + + return filtered + + +sample=get_citations_from_celex_id("61962CJ0026") +print(sample) \ No newline at end of file diff --git a/cellar/cellar_extractor/para.py b/cellar/cellar_extractor/para.py new file mode 100644 index 0000000..5324f28 --- /dev/null +++ b/cellar/cellar_extractor/para.py @@ -0,0 +1,39 @@ +import requests +from bs4 
import BeautifulSoup + +def get_para_citations_from_celex_id(celex)->list:#Get paragraph citations from a website by providing celex ID in the function upon calling and reutrns a list of citations if exists else it returns an empty list + website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:{celex}").text + parser=BeautifulSoup(website,'lxml') + div=parser.find_all('div',class_="panel-body") + citations=[] + for divs in div: + if divs!=None: + dl=divs.find('dl',class_="NMetadata") + if dl!=None: + dt=dl.find_all('dt') + for dls in dl: + if "cited" in dls.text.lower(): + + + temp=dls.find_all_next('dd') + for dd in temp: + if dd!=None: + li=dd.find_all('li') + for mentions in li: + if mentions!=None: + if "p" in mentions.text.lower().split(" "): + + + # print(mentions.text) + citations.append(mentions.text) + # print(a.text) + # print(citations) + filtered=[] + for splits in citations: + + filtered.append(splits.split(":")[1]) + + return filtered + +sample=get_para_citations_from_celex_id("61962CJ0026") +print(sample) \ No newline at end of file From 7868c30a4c0b6db610129db65ea2acddff4db166 Mon Sep 17 00:00:00 2001 From: Vishal Venkat Raghavan Date: Thu, 8 Feb 2024 14:05:46 +0100 Subject: [PATCH 02/78] cellar --- cellar/cellar_extractor/citations.py | 39 ---------------------------- cellar/cellar_extractor/para.py | 39 ---------------------------- 2 files changed, 78 deletions(-) delete mode 100644 cellar/cellar_extractor/citations.py delete mode 100644 cellar/cellar_extractor/para.py diff --git a/cellar/cellar_extractor/citations.py b/cellar/cellar_extractor/citations.py deleted file mode 100644 index bfe43b1..0000000 --- a/cellar/cellar_extractor/citations.py +++ /dev/null @@ -1,39 +0,0 @@ -import requests -from bs4 import BeautifulSoup - -def get_citations_from_celex_id(celex)->list:#Get citations(Celex ID) from a website by providing celex ID in the function upon calling and reutrns a list of citations if exists else it returns an 
empty list - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:{celex}").text - parser=BeautifulSoup(website,'lxml') - div=parser.find_all('div',class_="panel-body") - citations=[] - for divs in div: - if divs!=None: - dl=divs.find('dl',class_="NMetadata") - if dl!=None: - dt=dl.find_all('dt') - for dls in dl: - if "cited" in dls.text.lower(): - - - temp=dls.find_all_next('dd') - for dd in temp: - if dd!=None: - li=dd.find_all('li') - for mentions in li: - if mentions!=None: - a=mentions.find('a') - if a!=None: - - citations.append(a.text) - # print(a.text) - # print(citations) - filtered=[] - for splits in citations: - if len(splits.split(" "))<2: - filtered.append(splits) - - return filtered - - -sample=get_citations_from_celex_id("61962CJ0026") -print(sample) \ No newline at end of file diff --git a/cellar/cellar_extractor/para.py b/cellar/cellar_extractor/para.py deleted file mode 100644 index 5324f28..0000000 --- a/cellar/cellar_extractor/para.py +++ /dev/null @@ -1,39 +0,0 @@ -import requests -from bs4 import BeautifulSoup - -def get_para_citations_from_celex_id(celex)->list:#Get paragraph citations from a website by providing celex ID in the function upon calling and reutrns a list of citations if exists else it returns an empty list - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:{celex}").text - parser=BeautifulSoup(website,'lxml') - div=parser.find_all('div',class_="panel-body") - citations=[] - for divs in div: - if divs!=None: - dl=divs.find('dl',class_="NMetadata") - if dl!=None: - dt=dl.find_all('dt') - for dls in dl: - if "cited" in dls.text.lower(): - - - temp=dls.find_all_next('dd') - for dd in temp: - if dd!=None: - li=dd.find_all('li') - for mentions in li: - if mentions!=None: - if "p" in mentions.text.lower().split(" "): - - - # print(mentions.text) - citations.append(mentions.text) - # print(a.text) - # print(citations) - filtered=[] - for splits in citations: - - 
filtered.append(splits.split(":")[1]) - - return filtered - -sample=get_para_citations_from_celex_id("61962CJ0026") -print(sample) \ No newline at end of file From 905d223a8e32f1437415a655ac9ab66f77022da6 Mon Sep 17 00:00:00 2001 From: Vishal Venkat Raghavan Date: Tue, 27 Feb 2024 13:46:27 +0100 Subject: [PATCH 03/78] Updated code for extraction --- .../cellar_extractor/operative_extraction.py | 270 ++++++++++++++++++ cellar/cellar_extractor/output.py | 50 ++++ cellar/cellar_extractor/test_output.py | 14 + 3 files changed, 334 insertions(+) create mode 100644 cellar/cellar_extractor/operative_extraction.py create mode 100644 cellar/cellar_extractor/output.py create mode 100644 cellar/cellar_extractor/test_output.py diff --git a/cellar/cellar_extractor/operative_extraction.py b/cellar/cellar_extractor/operative_extraction.py new file mode 100644 index 0000000..c2247e2 --- /dev/null +++ b/cellar/cellar_extractor/operative_extraction.py @@ -0,0 +1,270 @@ +import requests +from bs4 import BeautifulSoup +import unittest +# class ECLI(): +# ecli:str +# def __init__(self,ecli): +# self.ecli=ecli +class Analyzer(): + celex:str + def __init__(self,celex): + self.celex=celex + + + def html_page_structure_one(self)->list: + website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser=BeautifulSoup(website,'lxml') + div=parser.find_all('table') + one=[] + for divs in div: + table=divs.find('table') + if table!=None: + p=table.find_all('p',class_="coj-normal") + for x in p: + span=x.find_all('span',class_="coj-bold") + for y in span: + if x!=None and y!=None: + # print(span.text) + one.append(y.text) + return one + + + def html_page_structure_two(self)->list: + website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser=BeautifulSoup(website,'lxml') + p=parser.find_all('p') + two=[] + for para in p: + # print(para) + span=para.find('span') + if span!=None: + # 
print(span.text) + if "operative" in span.text.lower(): + normal=span.find_all_next('p',class_="normal") + for op in normal: + # print(op.text) + two.append(op.text) + return two + + def structure_three(self)->list: + website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser=BeautifulSoup(website,'lxml') + table=parser.find_all('table') + three=[] + for tables in table: + interior=tables.find_all('table') + for interiors in interior: + if interiors!=None: + p=interiors.find_all('p',class_="coj-normal") + for x in p: + span=x.find_all('span',class_="coj-bold") + for y in span: + if x!=None and y!=None: + # print(span.text) + three.append(y.text) + return three + + + + def structure_four(self)->list: + website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser=BeautifulSoup(website,'lxml') + p=parser.find_all('p') + four=[] + for para in p: + # print(para) + span=para.find('span') + if span!=None: + # print(span.text) + if "operative" in span.text.lower(): + normal=span.find_all_next('table') + for op in normal: + tbody=op.find('tbody') + new_p=tbody.find_all('p',class_="oj-normal") + + + for subsequent in new_p: + if subsequent!=None: + # print(subsequent.text) + four.append(subsequent.text) + + + return four + + def structure_five(self)->list: + website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser=BeautifulSoup(website,'lxml') + p=parser.find_all('p') + five=[] + for para in p: + # print(para) + span=para.find('span') + if span!=None: + # print(span.text) + if "operative" in span.text.lower(): + normal=span.find_all_next('table') + for op in normal: + tbody=op.find('tbody') + new_p=tbody.find_all('p',class_="normal") + + + for subsequent in new_p: + if subsequent!=None: + # print(subsequent.text) + five.append(subsequent.text) + + + return five + def structure_six(self)->list: + 
website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser=BeautifulSoup(website,'lxml') + div=parser.find_all('h2') + six=[] + for h2 in div: + # print(h2.text) + if h2.text=="Operative part": + operatives=h2.find_all_next('p') + for operative in operatives: + # print(operative.text) + six.append(operative.text) + return six + def structure_seven(self)->list: + website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser=BeautifulSoup(website,'lxml') + div=parser.find_all('table') + seven=[] + for divs in div: + table=divs.find_all('tbody') + for tables in table: + if tables!=None: + p=tables.find_all('tr') + for x in p: + if x!=None: + td=x.find_all('td') + for y in td: + if y!=None: + p=y.find_all('p',class_="normal") + for all in p: + if all!=None: + span=all.find_all('span',class_="bold") + for spans in span: + # print(spans.text) + seven.append(spans.text) + return seven + def structure_eight(self)->list: + website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser=BeautifulSoup(website,'lxml') + tbody=parser.find_all('tbody') + eight=[] + for all in tbody: + if all!=None: + tr=all.find_all('tr') + for trs in tr: + if trs!=None: + # print(trs) + + p=parser.find_all('p',class_="normal") + for paras in p: + if paras!=None: + if "on those grounds" in paras.text.lower(): + + span=paras.find_all_next('span',class_="bold") + for spans in span: + if spans!=None: + eight.append(spans.text) + # print(spans.text) + + return eight + def structure_nine(self)->list: + website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser=BeautifulSoup(website,'lxml') + nine=[] + div=parser.find_all('p') + for divs in div: + if divs!=None: + if "on those grounds" in divs.text.lower(): + b=divs.find_all_next('b') + for bolds in b: + # 
print(bolds.text) + nine.append(bolds.text) + return nine + def structure_eleven(self)->list: + website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser=BeautifulSoup(website,'lxml') + bold = parser.find_all('b') + + eleven=[] + + # print(website) + + for b in bold: + if b!=None: + if "operative part" in b.text.lower(): + table=b.find_all_next('p') + for tables in table: + if tables!=None: + eleven.append(tables.text) + # print(tables.text) + + + + return eleven + def structure_ten(self): + website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser=BeautifulSoup(website,'lxml') + appender=[] + for string in parser.stripped_strings: + # print(string) + appender.append(string) + # print(appender) + + found = False + afterGrounds = [] + + for x in appender: + + if "on those grounds" in x.lower(): + found = True + # print("True") + + if found: + if len(x.split(" "))>3: + afterGrounds.append(x) + return afterGrounds + def __call__(self)->list: + one:list + one=self.html_page_structure_one() + if len(one)==0 or len(one)=="\n": + one=self.html_page_structure_two() + if len(one)==0 or one[0]=="\n": + one=self.structure_three() + if len(one)==0 or one[0]=="\n": + one=self.structure_four() + if len(one)==0 or one[0]=="\n": + one=self.structure_five() + if len(one)==0 or one[0]=="\n": + one=self.structure_six() + if len(one)==0 or one[0]=="\n": + one=self.structure_seven() + if len(one)==0 or one[0]=="\n": + one=self.structure_eight() + if len(one)==0 or one[0]=="\n": + one=self.structure_nine() + if len(one)==0 or one[0]=="\n": + one=self.structure_ten() + if len(one)==0 or one[0]=="\n": + one=self.structure_eleven() + return one + + + + + + +instance=Analyzer("61980CJ0027") +x=instance() +if x!=None: + print(x) + + + diff --git a/cellar/cellar_extractor/output.py b/cellar/cellar_extractor/output.py new file mode 100644 index 0000000..2d5e07e --- 
/dev/null +++ b/cellar/cellar_extractor/output.py @@ -0,0 +1,50 @@ + +# from typing import Any +from operative_extractions import Analyzer +import csv +import json + +class Writing(): + + instance:str + x:str + def __init__(self, celex:str): + self.celex = celex + self.instance = Analyzer(self.celex) + self.x = self.instance() + + + def to_csv(self): + file=open("csv/output.csv","a+") + writer=csv.writer(file) + + if self.x!=None: + writer.writerow([self.celex,self.x]) + + def to_json(self): + if self.x!=None: + data={'Celex':self.celex,"Operative part":self.x} + file=open('json/data.json', 'a+') + json.dump(data,file) + file.close() + def to_txt(self): + + + if self.x!=None: + file=open(f"txt/{self.celex}.txt","a") + for w in self.x: + + file.write(w+"\n") + file.close() + def __call__(self): + self.to_csv() + # self.to_json() + # self.to_txt() + + + + + +# example=Writing("62018CA0390") +# example() + diff --git a/cellar/cellar_extractor/test_output.py b/cellar/cellar_extractor/test_output.py new file mode 100644 index 0000000..7e836bd --- /dev/null +++ b/cellar/cellar_extractor/test_output.py @@ -0,0 +1,14 @@ +import csv +file=open("gijs_202310_node_list.tsv","r") +reader=csv.reader(file) +from output import Writing +testing=[] +for row in reader: + for rows in row: + if "Id" not in rows: + testing.append(rows.split("\t")[0]) + +for all in testing: + instance=Writing(all) + instance() + # print(all) \ No newline at end of file From 36cc2394da9aae5b43d73f4bde45ee6ce0a8c5ae Mon Sep 17 00:00:00 2001 From: Vishal Venkat Raghavan Date: Wed, 6 Mar 2024 11:08:21 +0100 Subject: [PATCH 04/78] Updated code --- cellar/cellar_extractor/Testing_file.py | 50 ++++ .../cellar_extractor/operative_extraction.py | 243 ++++++++++++++---- cellar/cellar_extractor/output.py | 50 ---- cellar/cellar_extractor/test_output.py | 14 - 4 files changed, 236 insertions(+), 121 deletions(-) delete mode 100644 cellar/cellar_extractor/output.py delete mode 100644 
cellar/cellar_extractor/test_output.py diff --git a/cellar/cellar_extractor/Testing_file.py b/cellar/cellar_extractor/Testing_file.py index d2d81b3..a8a0805 100644 --- a/cellar/cellar_extractor/Testing_file.py +++ b/cellar/cellar_extractor/Testing_file.py @@ -14,6 +14,56 @@ from eurlex_scraping import * from cellar import * from sparql import * +import unittest +from operative_extraction import Analyzer +# from test import testing +import random +import csv +file=open("gijs_202310_node_list.tsv","r") +reader=csv.reader(file) +no_of_test_cases=30 +testing=[] +for row in reader: + for rows in row: + if "Id" not in rows: + testing.append(rows.split("\t")[0]) +class Test(unittest.TestCase): + """ + class for unittesing operative part , it checks whether the list returns null value or has some value. + """ + ids:list + def __init__(self,ids): + self.ids=ids + + def test_for_celex_id(self): + """ + Main function which runs the unittest Testcase . + """ + count_fail:int + count_pass=0 + for id in self.ids: + test_output=Analyzer(id) + test_instance=test_output() + + # self.assertFalse(len(test_instance)<=1) + + try: + self.assertTrue(test_instance[0],f"{id} is not empty and has operative part") + count_pass+=1 + print(f"{id} ---> PASSED.") + except: + print(f"{id} ---> FAILED.") + print(f"Passed {count_pass}/{len(self.ids)} times") + # print(len(self.ids)-count,"were passed successfully") + +new_list=[] +for w in range(no_of_test_cases): + randomized=random.randint(0,len(testing)-1) + new_list.append(testing[randomized]) + + +instance=Test(new_list) +instance.test_for_celex_id() diff --git a/cellar/cellar_extractor/operative_extraction.py b/cellar/cellar_extractor/operative_extraction.py index c2247e2..72f471c 100644 --- a/cellar/cellar_extractor/operative_extraction.py +++ b/cellar/cellar_extractor/operative_extraction.py @@ -1,52 +1,66 @@ import requests from bs4 import BeautifulSoup import unittest -# class ECLI(): -# ecli:str -# def __init__(self,ecli): -# 
self.ecli=ecli +from operative_extraction import Analyzer +import csv +import json class Analyzer(): - celex:str - def __init__(self,celex): + """ + This class returns a list of the operative part for a given celex id . Celex id is initialized through a constructor. + """ + celex:str # declare celex as a string + def __init__(self,celex):# Initialize Celex id as a constructor , passed when calling the class self.celex=celex def html_page_structure_one(self)->list: + """ + This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a nested + table structure . The relevant text lies inside the coj-bold class of the span tag. + """ website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text parser=BeautifulSoup(website,'lxml') - div=parser.find_all('table') + div=parser.find_all('table') # Find all tables tag from the website one=[] for divs in div: - table=divs.find('table') + table=divs.find('table') # Find each nested table within the table if table!=None: - p=table.find_all('p',class_="coj-normal") + p=table.find_all('p',class_="coj-normal") # Find all p under the nested table with the coj-normal class for x in p: - span=x.find_all('span',class_="coj-bold") + span=x.find_all('span',class_="coj-bold")# Span class of coj-bold under the p tag for y in span: if x!=None and y!=None: - # print(span.text) - one.append(y.text) + + one.append(y.text)#append text from span onto a list return one def html_page_structure_two(self)->list: + """ + This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph + (p) structure . The relevant text lies inside the normal class of the p tag which comes after the keyword operative of the previous span tag. 
+ """ website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text parser=BeautifulSoup(website,'lxml') p=parser.find_all('p') two=[] for para in p: - # print(para) + span=para.find('span') if span!=None: - # print(span.text) + if "operative" in span.text.lower(): normal=span.find_all_next('p',class_="normal") for op in normal: - # print(op.text) + two.append(op.text) return two def structure_three(self)->list: + """ + This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a nested + table structure . The relevant text lies inside the coj-bold class of the span tag. + """ website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text parser=BeautifulSoup(website,'lxml') table=parser.find_all('table') @@ -60,22 +74,26 @@ def structure_three(self)->list: span=x.find_all('span',class_="coj-bold") for y in span: if x!=None and y!=None: - # print(span.text) + three.append(y.text) return three def structure_four(self)->list: + """ + This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph + (p) structure . The relevant text lies inside the p tag which comes after the keyword operative of the previous span tag. 
+ """ website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text parser=BeautifulSoup(website,'lxml') p=parser.find_all('p') four=[] for para in p: - # print(para) + span=para.find('span') if span!=None: - # print(span.text) + if "operative" in span.text.lower(): normal=span.find_all_next('table') for op in normal: @@ -85,22 +103,27 @@ def structure_four(self)->list: for subsequent in new_p: if subsequent!=None: - # print(subsequent.text) + four.append(subsequent.text) return four def structure_five(self)->list: + + """ + This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph + (p) structure . The relevant text lies inside the normal class of the p tag which comes after the keyword operative of the previous span tag. + """ website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text parser=BeautifulSoup(website,'lxml') p=parser.find_all('p') five=[] for para in p: - # print(para) + span=para.find('span') if span!=None: - # print(span.text) + if "operative" in span.text.lower(): normal=span.find_all_next('table') for op in normal: @@ -110,12 +133,17 @@ def structure_five(self)->list: for subsequent in new_p: if subsequent!=None: - # print(subsequent.text) + five.append(subsequent.text) return five def structure_six(self)->list: + """ + This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a h2 + (header) structure . The relevant text lies inside thee p tag which comes after the keyword operative part of the respective h2 tag. 
+ """ + website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text parser=BeautifulSoup(website,'lxml') div=parser.find_all('h2') @@ -125,35 +153,48 @@ def structure_six(self)->list: if h2.text=="Operative part": operatives=h2.find_all_next('p') for operative in operatives: - # print(operative.text) + six.append(operative.text) return six def structure_seven(self)->list: + """ + This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a table + (table) structure . The relevant text lies inside the span tag which comes after the p tag , with the class name=normal. + """ website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text parser=BeautifulSoup(website,'lxml') div=parser.find_all('table') seven=[] for divs in div: + # find tbody within the table table=divs.find_all('tbody') for tables in table: if tables!=None: + # find tr within the tbody p=tables.find_all('tr') for x in p: if x!=None: + # find td within the tr td=x.find_all('td') for y in td: if y!=None: p=y.find_all('p',class_="normal") for all in p: if all!=None: + # find operative part within the span span=all.find_all('span',class_="bold") for spans in span: - # print(spans.text) + # APpend it into a list and return the list when the function is called seven.append(spans.text) return seven def structure_eight(self)->list: + """ + This function retreives operative part from documents of the respected celex id's .The text is extracted from the span tag nested inside + the tbody tag.Returns a list as output. 
+ """ website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text parser=BeautifulSoup(website,'lxml') + tbody=parser.find_all('tbody') eight=[] for all in tbody: @@ -161,7 +202,7 @@ def structure_eight(self)->list: tr=all.find_all('tr') for trs in tr: if trs!=None: - # print(trs) + p=parser.find_all('p',class_="normal") for paras in p: @@ -172,10 +213,14 @@ def structure_eight(self)->list: for spans in span: if spans!=None: eight.append(spans.text) - # print(spans.text) + return eight def structure_nine(self)->list: + """ + This function retreives operative part from documents of the respected celex id's .The operative part is under the bold(b) + tag after the p tag where the keywords "on those grounds" exist. + """ website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text parser=BeautifulSoup(website,'lxml') nine=[] @@ -189,13 +234,17 @@ def structure_nine(self)->list: nine.append(bolds.text) return nine def structure_eleven(self)->list: + """ + This function retreives operative part from documents of the respected celex id's .The operative part is under the paragraph(p) + tag after the b tag where the keywords "operative part" exist. + """ website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text parser=BeautifulSoup(website,'lxml') bold = parser.find_all('b') eleven=[] - # print(website) + for b in bold: if b!=None: @@ -204,19 +253,24 @@ def structure_eleven(self)->list: for tables in table: if tables!=None: eleven.append(tables.text) - # print(tables.text) + return eleven def structure_ten(self): + """ + This function retreives operative part from documents of the respected celex id's Since the ocntent is preloaded using js/client s + server side functions , the text from the current page is retrieved and the operative part is scraped after the occurence of the phrase + "On those grounds". 
+ """ website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text parser=BeautifulSoup(website,'lxml') appender=[] for string in parser.stripped_strings: - # print(string) + appender.append(string) - # print(appender) + found = False afterGrounds = [] @@ -225,46 +279,121 @@ def structure_ten(self): if "on those grounds" in x.lower(): found = True - # print("True") + if found: if len(x.split(" "))>3: afterGrounds.append(x) return afterGrounds def __call__(self)->list: + """ + This inbuilt __call__ function loops through all the methods in the class `Analyzer` and returns the list , with values of the operative part . + """ + + container=[self.html_page_structure_one(),self.html_page_structure_two(),self.structure_three(),self.structure_four(),self.structure_five(), + self.structure_six(),self.structure_seven(),self.structure_eight(),self.structure_nine(),self.structure_ten(),self.structure_eleven()] + + + one:list - one=self.html_page_structure_one() - if len(one)==0 or len(one)=="\n": - one=self.html_page_structure_two() - if len(one)==0 or one[0]=="\n": - one=self.structure_three() - if len(one)==0 or one[0]=="\n": - one=self.structure_four() - if len(one)==0 or one[0]=="\n": - one=self.structure_five() - if len(one)==0 or one[0]=="\n": - one=self.structure_six() - if len(one)==0 or one[0]=="\n": - one=self.structure_seven() - if len(one)==0 or one[0]=="\n": - one=self.structure_eight() - if len(one)==0 or one[0]=="\n": - one=self.structure_nine() - if len(one)==0 or one[0]=="\n": - one=self.structure_ten() - if len(one)==0 or one[0]=="\n": - one=self.structure_eleven() - return one + for funcs in range(len(container)): + + one=container[funcs] + + if one: + if (len(one)!=0 or one[0]!="\n"): + print("here") + return one + + + + + + + # one=self.html_page_structure_one() + # if len(one)==0 or len(one)=="\n": + # one=self.html_page_structure_two() + # if len(one)==0 or one[0]=="\n": + # one=self.structure_three() + 
# if len(one)==0 or one[0]=="\n": + # one=self.structure_four() + # if len(one)==0 or one[0]=="\n": + # one=self.structure_five() + # if len(one)==0 or one[0]=="\n": + # one=self.structure_six() + # if len(one)==0 or one[0]=="\n": + # one=self.structure_seven() + # if len(one)==0 or one[0]=="\n": + # one=self.structure_eight() + # if len(one)==0 or one[0]=="\n": + # one=self.structure_nine() + # if len(one)==0 or one[0]=="\n": + # one=self.structure_ten() + # if len(one)==0 or one[0]=="\n": + # one=self.structure_eleven() + -instance=Analyzer("61980CJ0027") -x=instance() -if x!=None: - print(x) +# instance=Analyzer("61962CJ0026") +# x=instance() +# if x!=None: +# print(x) + + +class Writing(): + """ + This class has different methods , for the purpose of writing the operative part into different file formats.(Csv,txt,json) + """ + + instance:str + x:str + parameter:str + def __init__(self, celex:str): + self.celex = celex + self.instance = Analyzer(self.celex) + self.x = self.instance() + + + + def to_csv(self): + file=open("csv/output.csv","a+") + writer=csv.writer(file) + + if self.x!=None: + writer.writerow([self.celex,self.x]) + + def to_json(self): + if self.x!=None: + data={'Celex':self.celex,"Operative part":self.x} + file=open('json/data.json', 'a+') + json.dump(data,file) + file.close() + def to_txt(self): + + + if self.x!=None: + file=open(f"txt/{self.celex}.txt","a") + for w in self.x: + + file.write(w+"\n") + file.close() +#Sample code for reading celex id's froma tsv file +file=open("gijs_202310_node_list.tsv","r") +reader=csv.reader(file) +from output import Writing +testing=[] +for row in reader: + for rows in row: + if "Id" not in rows: + testing.append(rows.split("\t")[0]) +for all in testing: + instance=Writing(all) + instance.to_csv() + print(all) diff --git a/cellar/cellar_extractor/output.py b/cellar/cellar_extractor/output.py deleted file mode 100644 index 2d5e07e..0000000 --- a/cellar/cellar_extractor/output.py +++ /dev/null @@ -1,50 +0,0 @@ 
- -# from typing import Any -from operative_extractions import Analyzer -import csv -import json - -class Writing(): - - instance:str - x:str - def __init__(self, celex:str): - self.celex = celex - self.instance = Analyzer(self.celex) - self.x = self.instance() - - - def to_csv(self): - file=open("csv/output.csv","a+") - writer=csv.writer(file) - - if self.x!=None: - writer.writerow([self.celex,self.x]) - - def to_json(self): - if self.x!=None: - data={'Celex':self.celex,"Operative part":self.x} - file=open('json/data.json', 'a+') - json.dump(data,file) - file.close() - def to_txt(self): - - - if self.x!=None: - file=open(f"txt/{self.celex}.txt","a") - for w in self.x: - - file.write(w+"\n") - file.close() - def __call__(self): - self.to_csv() - # self.to_json() - # self.to_txt() - - - - - -# example=Writing("62018CA0390") -# example() - diff --git a/cellar/cellar_extractor/test_output.py b/cellar/cellar_extractor/test_output.py deleted file mode 100644 index 7e836bd..0000000 --- a/cellar/cellar_extractor/test_output.py +++ /dev/null @@ -1,14 +0,0 @@ -import csv -file=open("gijs_202310_node_list.tsv","r") -reader=csv.reader(file) -from output import Writing -testing=[] -for row in reader: - for rows in row: - if "Id" not in rows: - testing.append(rows.split("\t")[0]) - -for all in testing: - instance=Writing(all) - instance() - # print(all) \ No newline at end of file From c42ca982a7cdcd43e32704be8c19a989e21fc78a Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 21 Mar 2024 12:48:33 +0400 Subject: [PATCH 05/78] Update Testing_file.py --- cellar/cellar_extractor/Testing_file.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cellar/cellar_extractor/Testing_file.py b/cellar/cellar_extractor/Testing_file.py index a8a0805..f743aee 100644 --- a/cellar/cellar_extractor/Testing_file.py +++ b/cellar/cellar_extractor/Testing_file.py @@ -61,19 +61,20 @@ def test_for_celex_id(self): 
randomized=random.randint(0,len(testing)-1) new_list.append(testing[randomized]) - -instance=Test(new_list) -instance.test_for_celex_id() + if __name__ == '__main__': celex = "62004CJ0292" + + instance=Test([celex]) + instance.test_for_celex_id() site = get_entire_page(celex) text = get_full_text_from_html(site) cits = get_citations_with_extra_info(text) print(cits) data,d2 = get_cellar_extra(sd='2023-01-01',max_ecli=100,save_file='n') nodes_edges = get_nodes_and_edges_lists(data) - pass \ No newline at end of file + pass From 7cb83e4c55e0b2b1f3fedcec4b3292043c6d05ee Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 21 Mar 2024 12:49:38 +0400 Subject: [PATCH 06/78] Delete cellar/cellar_extractor/operative_extraction.py --- .../cellar_extractor/operative_extraction.py | 399 ------------------ 1 file changed, 399 deletions(-) delete mode 100644 cellar/cellar_extractor/operative_extraction.py diff --git a/cellar/cellar_extractor/operative_extraction.py b/cellar/cellar_extractor/operative_extraction.py deleted file mode 100644 index 72f471c..0000000 --- a/cellar/cellar_extractor/operative_extraction.py +++ /dev/null @@ -1,399 +0,0 @@ -import requests -from bs4 import BeautifulSoup -import unittest -from operative_extraction import Analyzer -import csv -import json -class Analyzer(): - """ - This class returns a list of the operative part for a given celex id . Celex id is initialized through a constructor. - """ - celex:str # declare celex as a string - def __init__(self,celex):# Initialize Celex id as a constructor , passed when calling the class - self.celex=celex - - - def html_page_structure_one(self)->list: - """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a nested - table structure . The relevant text lies inside the coj-bold class of the span tag. 
- """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - div=parser.find_all('table') # Find all tables tag from the website - one=[] - for divs in div: - table=divs.find('table') # Find each nested table within the table - if table!=None: - p=table.find_all('p',class_="coj-normal") # Find all p under the nested table with the coj-normal class - for x in p: - span=x.find_all('span',class_="coj-bold")# Span class of coj-bold under the p tag - for y in span: - if x!=None and y!=None: - - one.append(y.text)#append text from span onto a list - return one - - - def html_page_structure_two(self)->list: - """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph - (p) structure . The relevant text lies inside the normal class of the p tag which comes after the keyword operative of the previous span tag. - """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - p=parser.find_all('p') - two=[] - for para in p: - - span=para.find('span') - if span!=None: - - if "operative" in span.text.lower(): - normal=span.find_all_next('p',class_="normal") - for op in normal: - - two.append(op.text) - return two - - def structure_three(self)->list: - """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a nested - table structure . The relevant text lies inside the coj-bold class of the span tag. 
- """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - table=parser.find_all('table') - three=[] - for tables in table: - interior=tables.find_all('table') - for interiors in interior: - if interiors!=None: - p=interiors.find_all('p',class_="coj-normal") - for x in p: - span=x.find_all('span',class_="coj-bold") - for y in span: - if x!=None and y!=None: - - three.append(y.text) - return three - - - - def structure_four(self)->list: - """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph - (p) structure . The relevant text lies inside the p tag which comes after the keyword operative of the previous span tag. - """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - p=parser.find_all('p') - four=[] - for para in p: - - span=para.find('span') - if span!=None: - - if "operative" in span.text.lower(): - normal=span.find_all_next('table') - for op in normal: - tbody=op.find('tbody') - new_p=tbody.find_all('p',class_="oj-normal") - - - for subsequent in new_p: - if subsequent!=None: - - four.append(subsequent.text) - - - return four - - def structure_five(self)->list: - - """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph - (p) structure . The relevant text lies inside the normal class of the p tag which comes after the keyword operative of the previous span tag. 
- """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - p=parser.find_all('p') - five=[] - for para in p: - - span=para.find('span') - if span!=None: - - if "operative" in span.text.lower(): - normal=span.find_all_next('table') - for op in normal: - tbody=op.find('tbody') - new_p=tbody.find_all('p',class_="normal") - - - for subsequent in new_p: - if subsequent!=None: - - five.append(subsequent.text) - - - return five - def structure_six(self)->list: - """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a h2 - (header) structure . The relevant text lies inside thee p tag which comes after the keyword operative part of the respective h2 tag. - """ - - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - div=parser.find_all('h2') - six=[] - for h2 in div: - # print(h2.text) - if h2.text=="Operative part": - operatives=h2.find_all_next('p') - for operative in operatives: - - six.append(operative.text) - return six - def structure_seven(self)->list: - """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a table - (table) structure . The relevant text lies inside the span tag which comes after the p tag , with the class name=normal. 
- """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - div=parser.find_all('table') - seven=[] - for divs in div: - # find tbody within the table - table=divs.find_all('tbody') - for tables in table: - if tables!=None: - # find tr within the tbody - p=tables.find_all('tr') - for x in p: - if x!=None: - # find td within the tr - td=x.find_all('td') - for y in td: - if y!=None: - p=y.find_all('p',class_="normal") - for all in p: - if all!=None: - # find operative part within the span - span=all.find_all('span',class_="bold") - for spans in span: - # APpend it into a list and return the list when the function is called - seven.append(spans.text) - return seven - def structure_eight(self)->list: - """ - This function retreives operative part from documents of the respected celex id's .The text is extracted from the span tag nested inside - the tbody tag.Returns a list as output. - """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - - tbody=parser.find_all('tbody') - eight=[] - for all in tbody: - if all!=None: - tr=all.find_all('tr') - for trs in tr: - if trs!=None: - - - p=parser.find_all('p',class_="normal") - for paras in p: - if paras!=None: - if "on those grounds" in paras.text.lower(): - - span=paras.find_all_next('span',class_="bold") - for spans in span: - if spans!=None: - eight.append(spans.text) - - - return eight - def structure_nine(self)->list: - """ - This function retreives operative part from documents of the respected celex id's .The operative part is under the bold(b) - tag after the p tag where the keywords "on those grounds" exist. 
- """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - nine=[] - div=parser.find_all('p') - for divs in div: - if divs!=None: - if "on those grounds" in divs.text.lower(): - b=divs.find_all_next('b') - for bolds in b: - # print(bolds.text) - nine.append(bolds.text) - return nine - def structure_eleven(self)->list: - """ - This function retreives operative part from documents of the respected celex id's .The operative part is under the paragraph(p) - tag after the b tag where the keywords "operative part" exist. - """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - bold = parser.find_all('b') - - eleven=[] - - - - for b in bold: - if b!=None: - if "operative part" in b.text.lower(): - table=b.find_all_next('p') - for tables in table: - if tables!=None: - eleven.append(tables.text) - - - - - return eleven - def structure_ten(self): - """ - This function retreives operative part from documents of the respected celex id's Since the ocntent is preloaded using js/client s - server side functions , the text from the current page is retrieved and the operative part is scraped after the occurence of the phrase - "On those grounds". - """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - appender=[] - for string in parser.stripped_strings: - - appender.append(string) - - - found = False - afterGrounds = [] - - for x in appender: - - if "on those grounds" in x.lower(): - found = True - - - if found: - if len(x.split(" "))>3: - afterGrounds.append(x) - return afterGrounds - def __call__(self)->list: - """ - This inbuilt __call__ function loops through all the methods in the class `Analyzer` and returns the list , with values of the operative part . 
- """ - - container=[self.html_page_structure_one(),self.html_page_structure_two(),self.structure_three(),self.structure_four(),self.structure_five(), - self.structure_six(),self.structure_seven(),self.structure_eight(),self.structure_nine(),self.structure_ten(),self.structure_eleven()] - - - - one:list - for funcs in range(len(container)): - - one=container[funcs] - - if one: - if (len(one)!=0 or one[0]!="\n"): - print("here") - return one - - - - - - - # one=self.html_page_structure_one() - # if len(one)==0 or len(one)=="\n": - # one=self.html_page_structure_two() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_three() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_four() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_five() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_six() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_seven() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_eight() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_nine() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_ten() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_eleven() - - - - - - - -# instance=Analyzer("61962CJ0026") -# x=instance() -# if x!=None: -# print(x) - - -class Writing(): - """ - This class has different methods , for the purpose of writing the operative part into different file formats.(Csv,txt,json) - """ - - instance:str - x:str - parameter:str - def __init__(self, celex:str): - self.celex = celex - self.instance = Analyzer(self.celex) - self.x = self.instance() - - - - def to_csv(self): - file=open("csv/output.csv","a+") - writer=csv.writer(file) - - if self.x!=None: - writer.writerow([self.celex,self.x]) - - def to_json(self): - if self.x!=None: - data={'Celex':self.celex,"Operative part":self.x} - file=open('json/data.json', 'a+') - json.dump(data,file) - file.close() - def to_txt(self): - - - if self.x!=None: - file=open(f"txt/{self.celex}.txt","a") - for w in 
self.x: - - file.write(w+"\n") - file.close() -#Sample code for reading celex id's froma tsv file - -file=open("gijs_202310_node_list.tsv","r") -reader=csv.reader(file) -from output import Writing -testing=[] -for row in reader: - for rows in row: - if "Id" not in rows: - testing.append(rows.split("\t")[0]) -for all in testing: - instance=Writing(all) - instance.to_csv() - print(all) - - From d5e1aa376b6cf4762b5919ddf3d5a34b6edd5291 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 21 Mar 2024 12:49:55 +0400 Subject: [PATCH 07/78] Add files via upload --- .../cellar_extractor/operative_extractions.py | 367 ++++++++++++++++++ 1 file changed, 367 insertions(+) create mode 100644 cellar/cellar_extractor/operative_extractions.py diff --git a/cellar/cellar_extractor/operative_extractions.py b/cellar/cellar_extractor/operative_extractions.py new file mode 100644 index 0000000..deb0b22 --- /dev/null +++ b/cellar/cellar_extractor/operative_extractions.py @@ -0,0 +1,367 @@ + +import requests +from bs4 import BeautifulSoup + +import csv +import json + + +class Analyzer(): + """ + This class returns a list of the operative part for a given celex id . Celex id is initialized through a constructor. + """ + celex: str # declare celex as a string + + def __init__(self, celex): # Initialize Celex id as a constructor , passed when calling the class + self.celex = celex + + def html_page_structure_one(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a nested + table structure . The relevant text lies inside the coj-bold class of the span tag. 
+ """ + website = requests.get( + f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser = BeautifulSoup(website, 'lxml') + div = parser.find_all('table') # Find all tables tag from the website + one = [] + for divs in div: + # Find each nested table within the table + table = divs.find('table') + if table != None: + # Find all p under the nested table with the coj-normal class + p = table.find_all('p', class_="coj-normal") + for x in p: + # Span class of coj-bold under the p tag + span = x.find_all('span', class_="coj-bold") + for y in span: + if x != None and y != None: + + # append text from span onto a list + one.append(y.text) + return one + + def html_page_structure_two(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph + (p) structure . The relevant text lies inside the normal class of the p tag which comes after the keyword operative of the previous span tag. + """ + website = requests.get( + f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser = BeautifulSoup(website, 'lxml') + p = parser.find_all('p') + two = [] + for para in p: + + span = para.find('span') + if span != None: + + if "operative" in span.text.lower(): + normal = span.find_all_next('p', class_="normal") + for op in normal: + + two.append(op.text) + return two + + def structure_three(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a nested + table structure . The relevant text lies inside the coj-bold class of the span tag. 
+ """ + website = requests.get( + f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser = BeautifulSoup(website, 'lxml') + table = parser.find_all('table') + three = [] + for tables in table: + interior = tables.find_all('table') + for interiors in interior: + if interiors != None: + p = interiors.find_all('p', class_="coj-normal") + for x in p: + span = x.find_all('span', class_="coj-bold") + for y in span: + if x != None and y != None: + + three.append(y.text) + return three + + def structure_four(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph + (p) structure . The relevant text lies inside the p tag which comes after the keyword operative of the previous span tag. + """ + website = requests.get( + f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser = BeautifulSoup(website, 'lxml') + p = parser.find_all('p') + four = [] + for para in p: + + span = para.find('span') + if span != None: + + if "operative" in span.text.lower(): + normal = span.find_all_next('table') + for op in normal: + tbody = op.find('tbody') + new_p = tbody.find_all('p', class_="oj-normal") + + for subsequent in new_p: + if subsequent != None: + + four.append(subsequent.text) + + return four + + def structure_five(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph + (p) structure . The relevant text lies inside the normal class of the p tag which comes after the keyword operative of the previous span tag. 
+ """ + website = requests.get( + f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser = BeautifulSoup(website, 'lxml') + p = parser.find_all('p') + five = [] + for para in p: + + span = para.find('span') + if span != None: + + if "operative" in span.text.lower(): + normal = span.find_all_next('table') + for op in normal: + tbody = op.find('tbody') + new_p = tbody.find_all('p', class_="normal") + + for subsequent in new_p: + if subsequent != None: + + five.append(subsequent.text) + + return five + + def structure_six(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a h2 + (header) structure . The relevant text lies inside thee p tag which comes after the keyword operative part of the respective h2 tag. + """ + + website = requests.get( + f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser = BeautifulSoup(website, 'lxml') + div = parser.find_all('h2') + six = [] + for h2 in div: + # print(h2.text) + if h2.text == "Operative part": + operatives = h2.find_all_next('p') + for operative in operatives: + + six.append(operative.text) + return six + + def structure_seven(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a table + (table) structure . The relevant text lies inside the span tag which comes after the p tag , with the class name=normal. 
+ """ + website = requests.get( + f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser = BeautifulSoup(website, 'lxml') + div = parser.find_all('table') + seven = [] + for divs in div: + # find tbody within the table + table = divs.find_all('tbody') + for tables in table: + if tables != None: + # find tr within the tbody + p = tables.find_all('tr') + for x in p: + if x != None: + # find td within the tr + td = x.find_all('td') + for y in td: + if y != None: + p = y.find_all('p', class_="normal") + for all in p: + if all != None: + # find operative part within the span + span = all.find_all( + 'span', class_="bold") + for spans in span: + # APpend it into a list and return the list when the function is called + seven.append(spans.text) + return seven + + def structure_eight(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's .The text is extracted from the span tag nested inside + the tbody tag.Returns a list as output. + """ + website = requests.get( + f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser = BeautifulSoup(website, 'lxml') + + tbody = parser.find_all('tbody') + eight = [] + for all in tbody: + if all != None: + tr = all.find_all('tr') + for trs in tr: + if trs != None: + + p = parser.find_all('p', class_="normal") + for paras in p: + if paras != None: + if "on those grounds" in paras.text.lower(): + + span = paras.find_all_next( + 'span', class_="bold") + for spans in span: + if spans != None: + eight.append(spans.text) + + return eight + + def structure_nine(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's .The operative part is under the bold(b) + tag after the p tag where the keywords "on those grounds" exist. 
+ """ + website = requests.get( + f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser = BeautifulSoup(website, 'lxml') + nine = [] + div = parser.find_all('p') + for divs in div: + if divs != None: + if "on those grounds" in divs.text.lower(): + b = divs.find_all_next('b') + for bolds in b: + # print(bolds.text) + nine.append(bolds.text) + return nine + + def structure_eleven(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's .The operative part is under the paragraph(p) + tag after the b tag where the keywords "operative part" exist. + """ + website = requests.get( + f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser = BeautifulSoup(website, 'lxml') + bold = parser.find_all('b') + + eleven = [] + + for b in bold: + if b != None: + if "operative part" in b.text.lower(): + table = b.find_all_next('p') + for tables in table: + if tables != None: + eleven.append(tables.text) + + return eleven + + def structure_ten(self): + """ + This function retreives operative part from documents of the respected celex id's Since the ocntent is preloaded using js/client s + server side functions , the text from the current page is retrieved and the operative part is scraped after the occurence of the phrase + "On those grounds". 
+ """ + website = requests.get( + f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + parser = BeautifulSoup(website, 'lxml') + appender = [] + for string in parser.stripped_strings: + + appender.append(string) + + found = False + afterGrounds = [] + + for x in appender: + + if "on those grounds" in x.lower(): + found = True + + if found: + if len(x.split(" ")) > 3: + afterGrounds.append(x) + return afterGrounds + + def __call__(self) -> list: + """ + This inbuilt __call__ function loops through all the methods in the class `Analyzer` and returns the list , with values of the operative part . + """ + + container = [self.html_page_structure_one(), self.html_page_structure_two(), self.structure_three(), self.structure_four(), self.structure_five(), + self.structure_six(), self.structure_seven(), self.structure_eight(), self.structure_nine(), self.structure_ten(), self.structure_eleven()] + + one: list + for funcs in range(len(container)): + + one = container[funcs] + + if one: + if (len(one) != 0 or one[0] != "\n"): + print("here") + return one + + + + +class Writing(): + """ + This class has different methods , for the purpose of writing the operative part into different file formats.(Csv,txt,json) + """ + + instance: str + x: str + parameter: str + + def __init__(self, celex: str): + self.celex = celex + self.instance = Analyzer(self.celex) + self.x = self.instance() + + def to_csv(self): + file = open("csv/output.csv", "a+") + writer = csv.writer(file) + + if self.x != None: + writer.writerow([self.celex, self.x]) + + def to_json(self): + if self.x != None: + data = {'Celex': self.celex, "Operative part": self.x} + file = open('json/data.json', 'a+') + json.dump(data, file) + file.close() + + def to_txt(self): + + if self.x != None: + file = open(f"txt/{self.celex}.txt", "a") + for w in self.x: + + file.write(w+"\n") + file.close() +# Sample code for reading celex id's froma tsv file + + +file = 
open("gijs_202310_node_list.tsv", "r") +reader = csv.reader(file) +testing = [] +for row in reader: + for rows in row: + if "Id" not in rows: + testing.append(rows.split("\t")[0]) +for all in testing: + instance = Writing(all) + instance.to_csv() + print(all) From 8d754a774273cc96a871962c1386d73276775195 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 21 Mar 2024 12:51:18 +0400 Subject: [PATCH 08/78] Delete cellar/cellar_extractor/Testing_file.py --- cellar/cellar_extractor/Testing_file.py | 80 ------------------------- 1 file changed, 80 deletions(-) delete mode 100644 cellar/cellar_extractor/Testing_file.py diff --git a/cellar/cellar_extractor/Testing_file.py b/cellar/cellar_extractor/Testing_file.py deleted file mode 100644 index f743aee..0000000 --- a/cellar/cellar_extractor/Testing_file.py +++ /dev/null @@ -1,80 +0,0 @@ -""" - -This file is purely a testing file for trying out separate parts of code, testing if everything works and such. -Can be also used to develop future code. - - - -""" - -from nodes_and_edges import get_nodes_and_edges -from os.path import join -from json_to_csv import read_csv -import time -from eurlex_scraping import * -from cellar import * -from sparql import * -import unittest -from operative_extraction import Analyzer -# from test import testing -import random -import csv -file=open("gijs_202310_node_list.tsv","r") -reader=csv.reader(file) -no_of_test_cases=30 -testing=[] -for row in reader: - for rows in row: - if "Id" not in rows: - testing.append(rows.split("\t")[0]) -class Test(unittest.TestCase): - """ - class for unittesing operative part , it checks whether the list returns null value or has some value. - """ - ids:list - def __init__(self,ids): - self.ids=ids - - def test_for_celex_id(self): - """ - Main function which runs the unittest Testcase . 
- """ - count_fail:int - count_pass=0 - for id in self.ids: - test_output=Analyzer(id) - test_instance=test_output() - - # self.assertFalse(len(test_instance)<=1) - - try: - self.assertTrue(test_instance[0],f"{id} is not empty and has operative part") - count_pass+=1 - print(f"{id} ---> PASSED.") - except: - print(f"{id} ---> FAILED.") - print(f"Passed {count_pass}/{len(self.ids)} times") - # print(len(self.ids)-count,"were passed successfully") - -new_list=[] -for w in range(no_of_test_cases): - randomized=random.randint(0,len(testing)-1) - new_list.append(testing[randomized]) - - - - - - -if __name__ == '__main__': - celex = "62004CJ0292" - - instance=Test([celex]) - instance.test_for_celex_id() - site = get_entire_page(celex) - text = get_full_text_from_html(site) - cits = get_citations_with_extra_info(text) - print(cits) - data,d2 = get_cellar_extra(sd='2023-01-01',max_ecli=100,save_file='n') - nodes_edges = get_nodes_and_edges_lists(data) - pass From a6fd5b8a09092003c57ac4b1af5034c603f7e3e0 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 21 Mar 2024 12:51:31 +0400 Subject: [PATCH 09/78] Add files via upload --- cellar/cellar_extractor/Testing_file.py | 80 +++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 cellar/cellar_extractor/Testing_file.py diff --git a/cellar/cellar_extractor/Testing_file.py b/cellar/cellar_extractor/Testing_file.py new file mode 100644 index 0000000..9418533 --- /dev/null +++ b/cellar/cellar_extractor/Testing_file.py @@ -0,0 +1,80 @@ +""" + +This file is purely a testing file for trying out separate parts of code, testing if everything works and such. +Can be also used to develop future code. 
+ + + +""" + +from nodes_and_edges import get_nodes_and_edges +from os.path import join +from json_to_csv import read_csv +import time +from eurlex_scraping import * +from cellar import * +from sparql import * +import unittest +from operative_extraction import Analyzer +# from test import testing +import random +import csv +file=open("gijs_202310_node_list.tsv","r") +reader=csv.reader(file) +no_of_test_cases=30 +testing=[] +for row in reader: + for rows in row: + if "Id" not in rows: + testing.append(rows.split("\t")[0]) +class Test(unittest.TestCase): + """ + class for unittesing operative part , it checks whether the list returns null value or has some value. + """ + ids:list + def __init__(self,ids): + self.ids=ids + + def test_for_celex_id(self): + """ + Main function which runs the unittest Testcase . + """ + count_fail:int + count_pass=0 + for id in self.ids: + test_output=Analyzer(id) + test_instance=test_output() + + # self.assertFalse(len(test_instance)<=1) + + try: + self.assertTrue(test_instance[0],f"{id} is not empty and has operative part") + count_pass+=1 + print(f"{id} ---> PASSED.") + except: + print(f"{id} ---> FAILED.") + print(f"Passed {count_pass}/{len(self.ids)} times") + # print(len(self.ids)-count,"were passed successfully") + +new_list=[] +for w in range(no_of_test_cases): + randomized=random.randint(0,len(testing)-1) + new_list.append(testing[randomized]) + + + + + + +if __name__ == '__main__': + celex = "62004CJ0292" + + instance=Test([celex]) + instance.test_for_celex_id() + site = get_entire_page(celex) + text = get_full_text_from_html(site) + cits = get_citations_with_extra_info(text) + print(cits) + data,d2 = get_cellar_extra(sd='2023-01-01',max_ecli=100,save_file='n') + nodes_edges = get_nodes_and_edges_lists(data) + pass \ No newline at end of file From 3664a9c249033bdd0dd65907220c2e47c70b51b1 Mon Sep 17 00:00:00 2001 From: shashankmc Date: Thu, 28 Mar 2024 08:35:08 +0100 Subject: [PATCH 10/78] Added doc string to method extra_cellar 
--- .../cellar_extractor/cellar_extra_extract.py | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/cellar/cellar_extractor/cellar_extra_extract.py b/cellar/cellar_extractor/cellar_extra_extract.py index 4b6c0eb..81c2764 100644 --- a/cellar/cellar_extractor/cellar_extra_extract.py +++ b/cellar/cellar_extractor/cellar_extra_extract.py @@ -4,17 +4,44 @@ def extra_cellar(data=None, filepath=None, threads=10, username="", password=""): + """ + Extracts information from a cellar dataset. + + Args: + data (pandas.DataFrame, optional): The input dataset. If not provided, + it will be read from the specified filepath. + filepath (str, optional): The path to the input dataset file. If provided, + the data will be read from this file. + threads (int, optional): The number of threads to use for parallel + processing. Default is 10. + username (str, optional): The username for accessing a separate + webservice. Default is an empty string. + password (str, optional): The password for accessing a separate + webservice. Default is an empty string. + + Returns: + tuple: A tuple containing the modified dataset and a JSON object. + + If `data` is not provided, the dataset will be read from the specified + `filepath`. + + If `username` and `password` are provided, the function will add + citations using a separate webservice. + + The function will add sections to the dataset using the specified + number of `threads`. If `filepath` is provided, + the modified dataset will be saved to the same file. Otherwise, the + modified dataset and a JSON object will be returned. + """ if data is None: data = read_csv(filepath) if filepath: if username !="" and password !="": add_citations_separate_webservice(data, username, password) - #print("Citations successfully added. 
The rest of additional extraction will now happen.") add_sections(data, threads, filepath.replace(".csv", "_fulltext.json")) data.to_csv(filepath, index=False) else: if username != "" and password != "": add_citations_separate_webservice(data, username, password) - #print("Citations successfully added. The rest of additional extraction will now happen.") json = add_sections(data, threads) return data, json From 16af4bbd0aff80f80fc1f5eab094d065ccb5db70 Mon Sep 17 00:00:00 2001 From: shashankmc Date: Thu, 28 Mar 2024 08:36:05 +0100 Subject: [PATCH 11/78] Added doc string and linted code for cellar_queries file --- cellar/cellar_extractor/cellar_queries.py | 24 ++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/cellar/cellar_extractor/cellar_queries.py b/cellar/cellar_extractor/cellar_queries.py index 6c74d16..b0f9e95 100644 --- a/cellar/cellar_extractor/cellar_queries.py +++ b/cellar/cellar_extractor/cellar_queries.py @@ -48,18 +48,23 @@ def get_all_eclis(starting_date=None, ending_date=None): return eclis -def get_raw_cellar_metadata(eclis, get_labels=True, force_readable_cols=True, force_readable_vals=False): +def get_raw_cellar_metadata(eclis, get_labels=True, force_readable_cols=True, + force_readable_vals=False): """Gets cellar metadata :param eclis: The ECLIs for which to retrieve metadata :type eclis: list[str] - :param get_labels: Flag to get human-readable labels for the properties, defaults to True + :param get_labels: Flag to get human-readable labels for the properties, + defaults to True :type get_labels: bool, optional - :param force_readable_cols: Flag to remove any non-labelled properties from the resulting dict, defaults to True + :param force_readable_cols: Flag to remove any non-labelled properties + from the resulting dict, defaults to True :type force_readable_cols: bool, optional - :param force_readable_vals: Flag to remove any non-labelled values from the resulting dict, defaults to False + :param 
force_readable_vals: Flag to remove any non-labelled values from + the resulting dict, defaults to False :type force_readable_vals: bool, optional - :return: Dictionary containing metadata. Top-level keys are ECLIs, second level are property names + :return: Dictionary containing metadata. Top-level keys are ECLIs, second + level are property names :rtype: Dict[str, Dict[str, list[str]]] """ @@ -100,8 +105,8 @@ def get_raw_cellar_metadata(eclis, get_labels=True, force_readable_cols=True, fo for ecli in eclis: metadata[ecli] = {} - # Take each triple, check which source doc it belongs to, key/value pair into its dict derived from the p and o in - # the query + # Take each triple, check which source doc it belongs to, key/value pair + # into its dict derived from the p and o in the query for res in ret['results']['bindings']: ecli = res['ecli']['value'] # We only want cdm predicates @@ -125,8 +130,9 @@ def get_raw_cellar_metadata(eclis, get_labels=True, force_readable_cols=True, fo else: val = res['o']['value'] - # We store the values for each property in a list. For some properties this is not necessary, - # but if a property can be assigned multiple times, this is important. Notable, for example is citations.b + # We store the values for each property in a list. For some properties + # this is not necessary, but if a property can be assigned multiple + # times, this is important. Notable, for example is citations. 
if key in metadata[ecli]: metadata[ecli][key].append(val) else: From 4e07a793326c380785a7d2873f95a3cf9087b72f Mon Sep 17 00:00:00 2001 From: shashankmc Date: Thu, 28 Mar 2024 08:38:07 +0100 Subject: [PATCH 12/78] Linted code, added encoding for opening files, and removed unused libraries from cellar.py file --- cellar/cellar_extractor/cellar.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/cellar/cellar_extractor/cellar.py b/cellar/cellar_extractor/cellar.py index 24c4a67..354cd5f 100644 --- a/cellar/cellar_extractor/cellar.py +++ b/cellar/cellar_extractor/cellar.py @@ -1,14 +1,15 @@ import json import os -from os.path import join + from datetime import datetime from pathlib import Path + +import time from tqdm import tqdm from cellar_extractor.cellar_queries import get_all_eclis, get_raw_cellar_metadata from cellar_extractor.json_to_csv import json_to_csv_main, json_to_csv_returning from cellar_extractor.cellar_extra_extract import extra_cellar from cellar_extractor.nodes_and_edges import get_nodes_and_edges -import time def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_format='csv'): if not ed: @@ -38,18 +39,18 @@ def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_forma json_to_csv_main(all_eclis, file_path) else: file_path = os.path.join('data', file_name + '.json') - with open(file_path, "w") as f: + with open(file_path, "w", encoding="utf-8") as f: json.dump(all_eclis, f) else: if file_format == 'csv': df = json_to_csv_returning(all_eclis) return df - else: - return all_eclis + return all_eclis print("\n--- DONE ---") -def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", threads=10, username="", password=""): +def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", + threads=10, username="", password=""): if not ed: ed = datetime.now().isoformat(timespec='seconds') data = get_cellar(ed=ed, save_file='n', max_ecli=max_ecli, 
sd=sd, file_format='csv') @@ -62,14 +63,16 @@ def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", thre file_path = os.path.join('data', file_name + '.csv') if save_file == 'y': Path('data').mkdir(parents=True, exist_ok=True) - extra_cellar(data = data ,filepath=file_path, threads=threads, username=username, password=password) + extra_cellar(data = data ,filepath=file_path, threads=threads, + username=username, password=password) print("\n--- DONE ---") else: - data,json = extra_cellar(data= data, threads = threads, username= username,password=password) + data,json_data = extra_cellar(data= data, threads = threads, + username= username,password=password) print("\n--- DONE ---") - return data,json + return data,json_data def get_nodes_and_edges_lists(df = None): if df is None: From 52436730ca4803bad99f233757cdd674538edf69 Mon Sep 17 00:00:00 2001 From: shashankmc Date: Thu, 28 Mar 2024 08:39:16 +0100 Subject: [PATCH 13/78] Linted code and moved doc strings under methods rather than above for citations_adder.py --- cellar/cellar_extractor/citations_adder.py | 168 ++++++++++----------- 1 file changed, 77 insertions(+), 91 deletions(-) diff --git a/cellar/cellar_extractor/citations_adder.py b/cellar/cellar_extractor/citations_adder.py index 99de07d..bc49bb7 100644 --- a/cellar/cellar_extractor/citations_adder.py +++ b/cellar/cellar_extractor/citations_adder.py @@ -4,39 +4,39 @@ from io import StringIO from os.path import dirname, abspath import pandas as pd -from cellar_extractor.sparql import get_citations_csv, get_cited, get_citing, run_eurlex_webservice_query +from cellar_extractor.sparql import (get_citations_csv, get_cited, + get_citing, run_eurlex_webservice_query) from cellar_extractor.eurlex_scraping import extract_dictionary_from_webservice_query from tqdm import tqdm sys.path.append(dirname(dirname(dirname(dirname(abspath(__file__)))))) -""" -Method used by separate threads for the multi-threading method of adding citations to the 
dataframe -Sends a query which returns a csv file containing the the celex identifiers of cited works for each case. -Works with multi-case queries, at_once is the variable deciding for how many cases are used with each query. -""" - def execute_citations(csv_list, citations): + """ + Method used by separate threads for the multi-threading method of adding + citations to the dataframe. Sends a query which returns a csv file + containing the the celex identifiers of cited works for each case. Works + with multi-case queries, at_once is the variable deciding for how many + cases are used with each query. + """ at_once = 1000 for i in range(0, len(citations), at_once): new_csv = get_citations_csv(citations[i:(i + at_once)]) csv_list.append(StringIO(new_csv)) -""" -This method replaces replaces the column with citations. - -Old column -> links to cited works -New column -> celex identifiers of cited works - -It uses multithreading, which is very much recommended. -Uses a query to get the citations in a csv format from the endpoint. * +def add_citations(data, threads): + """ + This method replaces replaces the column with citations. -* More details in the query method. -""" + Old column -> links to cited works + New column -> celex identifiers of cited works + It uses multithreading, which is very much recommended. + Uses a query to get the citations in a csv format from the endpoint. * -def add_citations(data, threads): + * More details in the query method. + """ name = "WORK CITES WORK. CI / CJ" celex = data.loc[:, "CELEX IDENTIFIER"] @@ -67,15 +67,14 @@ def add_citations(data, threads): citations.sort_index(inplace=True) data.insert(1, name, citations) - -""" -Method used by separate threads for the multi-threading method of adding citations to the dataframe -Sends a query which returns a csv file containing the the celex identifiers of cited works for each case. 
-Works with multi-case queries, at_once is the variable deciding for how many cases are used with each query. -""" - - def execute_citations_separate(cited_list, citing_list, citations): + """ + Method used by separate threads for the multi-threading method of + adding citations to the dataframe. Sends a query which returns a csv + file containing the the celex identifiers of cited works for each case. + Works with multi-case queries, at_once is the variable deciding for + how many cases are used with each query. + """ at_once = 1000 for i in range(0, len(citations), at_once): new_cited = get_cited(citations[i:(i + at_once)], 1) @@ -83,17 +82,12 @@ def execute_citations_separate(cited_list, citing_list, citations): cited_list.append(StringIO(new_cited)) citing_list.append(StringIO(new_citing)) - -""" - -Method used by separate threads for the multi-threading method of adding citations to the dataframe -Uses the eurlex webservices. -Also used for the single-thread approach. - -""" - - def execute_citations_webservice(dictionary_list, celexes, username, password): + """ + Method used by separate threads for the multi-threading method of + adding citations to the dataframe. Uses the eurlex webservices. + Also used for the single-thread approach. 
+ """ at_once = 100 success=0 retry=0 @@ -102,7 +96,8 @@ def execute_citations_webservice(dictionary_list, celexes, username, password): normal_celex, contains_celex = clean_celex(celexes) def process_queries(link, celex): nonlocal success,retry - for i in tqdm(range(0, len(celex), at_once), colour="GREEN", position=0, leave=True, maxinterval=10000): + for i in tqdm(range(0, len(celex), at_once), colour="GREEN", + position=0, leave=True, maxinterval=10000): curr_celex = celex[i:(i + at_once)] input=" OR ".join(curr_celex) query = link % (str(input)) @@ -111,7 +106,8 @@ def process_queries(link, celex): response = run_eurlex_webservice_query(query, username, password) if response.status_code == 500 and "WS_WS_CALLS_IDLE_INTERVAL" not in response.text: perc=i*100/len(celexes) - print(f"Limit of web service usage reached! Citations collection will stop here at {perc} % of citations downloaded." + print(f"Limit of web service usage reached! Citations collection\ + will stop here at {perc} % of citations downloaded." f"\nThere were {success} successful queries and {retry} retries") return elif "0" in response.text: @@ -133,16 +129,13 @@ def process_queries(link, celex): process_queries(base_contains_query,contains_celex) -""" - -Method used to separate celex id's when there are multiple pointing to the same document. -On top of that, separates celex id's with '(' and ')', these brackets are keywords for the webservice query. -After separated, a different query is ran for the normal celexes, and those with brackets. - -""" - - def clean_celex(celex): + """ + Method used to separate celex id's when there are multiple pointing to the same document. + On top of that, separates celex id's with '(' and ')', these brackets are keywords for the + webservice query. After separated, a different query is ran for the normal celexes, and + those with brackets. 
+ """ normal_list = list() contains_list = list() for c1 in celex: @@ -162,16 +155,14 @@ def clean_celex(celex): normal_list.append(c1) return normal_list, contains_list - -""" - -Method used for creation of a dictionary of documents citing the document. -Uses the dictionary of documents cited by the document. -Output will more than likely be bigger than the input dictionary, as it will also include treaties and other documents, -which are not being extracted by the cellar extractor. - -""" def allowed_id(id): + """ + Method used for creation of a dictionary of documents citing the document. + Uses the dictionary of documents cited by the document. + Output will more than likely be bigger than the input dictionary, + as it will also include treaties and other documents, + which are not being extracted by the cellar extractor. + """ if id != "": return id[0] == 8 or id[0] == 6 else: @@ -189,17 +180,13 @@ def reverse_citing_dict(citing): cited[c] = k return cited - -""" - -Method used to add the dictionaries to the dataframe. -Used by the citations adding from the eurlex webservices. -Implements checks, for whether the document whose data we want to add exists in the original dataframe. - -""" - - def add_dictionary_to_df(df, dictionary, column_title): + """ + Method used to add the dictionaries to the dataframe. + Used by the citations adding from the eurlex webservices. + Implements checks, for whether the document whose data we want to add + exists in the original dataframe. + """ column = pd.Series([], dtype='string') celex = df.loc[:, "CELEX IDENTIFIER"] for k in dictionary: @@ -210,29 +197,28 @@ def add_dictionary_to_df(df, dictionary, column_title): df.insert(1, column_title, column) -""" -Main method for citations adding via eurlex webservices. 
- -Old column -> links to cited works -New columns -> celex identifiers of cited works and works citing current work - -""" - - def add_citations_separate_webservice(data, username, password): + """ + Main method for citations adding via eurlex webservices. + Old column -> links to cited works + New columns -> celex identifiers of cited works and works citing current work + """ celex = data.loc[:, "CELEX IDENTIFIER"] query = " SELECT CI, DN WHERE DN = 62019CJ0668" response = run_eurlex_webservice_query(query, username, password) if response.status_code == 500 : if "WS_MAXIMUM_NB_OF_WS_CALLS" in response.text: - print("Maximum number of calls to the eurlex webservices reached! The code will skip the citations download.") + print("Maximum number of calls to the eurlex webservices reached!\ + The code will skip the citations download.") return else: - print("Incorrect username and password for eurlex webservices! (The account login credentials and webservice) " + print("Incorrect username and password for eurlex webservices!\ + (The account login credentials and webservice) " "login credentials are different)") sys.exit(2) elif response.status_code == 403: - print("Webservice connection was blocked, eurlex might be going through maintenance right now.") + print("Webservice connection was blocked, eurlex might be going\ + through maintenance right now.") sys.exit(2) else: print("Webservice connection was successful!") @@ -243,27 +229,26 @@ def add_citations_separate_webservice(data, username, password): for d in dictionary_list: citing_dict.update(d) print("Webservice extraction finished, the rest of extraction will now happen.") - time.sleep(1) # It seemed to print out the length of dictionary wrong, even when it was equal to 1000. + time.sleep(1) # It seemed to print out the length of dictionary wrong, + # even when it was equal to 1000. 
cited_dict = reverse_citing_dict(citing_dict) add_dictionary_to_df(data, citing_dict, "citing") add_dictionary_to_df(data, cited_dict, "cited_by") +def add_citations_separate(data, threads): + """ + This method replaces replaces the column with citations. -""" -This method replaces replaces the column with citations. - -Old column -> links to cited works -New column -> celex identifiers of cited works - -It uses multithreading, which is very much recommended. -Uses a query to get the citations in a csv format from the endpoint. * + Old column -> links to cited works + New column -> celex identifiers of cited works -* More details in the query method. -""" + It uses multithreading, which is very much recommended. + Uses a query to get the citations in a csv format from the endpoint. * + * More details in the query method. + """ -def add_citations_separate(data, threads): celex = data.loc[:, "CELEX IDENTIFIER"] length = celex.size if length > 100: # to avoid getting problems with small files @@ -276,7 +261,8 @@ def add_citations_separate(data, threads): for i in range(0, length, at_once_threads): curr_celex = celex[i:(i + at_once_threads)] - t = threading.Thread(target=execute_citations_separate, args=(cited_csv, citing_csv, curr_celex)) + t = threading.Thread(target=execute_citations_separate, + args=(cited_csv, citing_csv, curr_celex)) threads.append(t) for t in threads: From 7db0f6adb680f2420c6921aa3625b067dd0b83cc Mon Sep 17 00:00:00 2001 From: shashankmc Date: Thu, 28 Mar 2024 08:40:06 +0100 Subject: [PATCH 14/78] Linted code for csv_extractor.py --- cellar/cellar_extractor/csv_extractor.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cellar/cellar_extractor/csv_extractor.py b/cellar/cellar_extractor/csv_extractor.py index 23ee71c..e4612ad 100644 --- a/cellar/cellar_extractor/csv_extractor.py +++ b/cellar/cellar_extractor/csv_extractor.py @@ -2,12 +2,11 @@ import argparse from cellar_extractor.json_to_csv import read_csv -""" 
-Method takes in a dataframe and returns a dataframe with only *number* of data rows. -""" - - def extract_rows(data, number): + """ + Method takes in a dataframe and returns a dataframe with only *number* of data rows. + """ + try: output = data[1:number] except Exception: @@ -24,7 +23,7 @@ def extract_rows(data, number): print("") print("EXTRACTION FROM CSV FILE IN DATA PROCESSED DIR STARTED") print("") - csv_files = (glob.glob(DIR_DATA_RAW + "/" + "*.csv")) + csv_files = glob.glob(DIR_DATA_RAW + "/" + "*.csv") print(f"FOUND {len(csv_files)} CSV FILES") for i in range(len(csv_files)): From 2ddf0a64b7b3346b00bcc581ee3427341c6f6f12 Mon Sep 17 00:00:00 2001 From: shashankmc Date: Thu, 28 Mar 2024 08:41:37 +0100 Subject: [PATCH 15/78] Linted code, changed conditional statements for PEP8 conformity, and changed variable names that were similar to inbuilt ones for eurlex_scraping --- cellar/cellar_extractor/eurlex_scraping.py | 231 ++++++++------------- 1 file changed, 86 insertions(+), 145 deletions(-) diff --git a/cellar/cellar_extractor/eurlex_scraping.py b/cellar/cellar_extractor/eurlex_scraping.py index 944097e..d825ccd 100644 --- a/cellar/cellar_extractor/eurlex_scraping.py +++ b/cellar/cellar_extractor/eurlex_scraping.py @@ -1,8 +1,9 @@ -from bs4 import BeautifulSoup -import requests import time -import xmltodict import re +import requests +import xmltodict + +from bs4 import BeautifulSoup LINK_SUMMARY_INF = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:cIdHere&from=EN' LINK_SUMJURE = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:cIdHere_SUM&from=EN' @@ -17,31 +18,25 @@ def is_code(word): return word.replace(".", "0").replace("-", "0")[1:].isdigit() - -""" -Wrapped method for requests.get(). -After 10 retries, it gives up and returns a "404" string. -""" - - def response_wrapper(link, num=1): + """ + Wrapped method for requests.get(). + After 10 retries, it gives up and returns a "404" string. 
+ """ if num == 10: return "404" try: - response = requests.get(link) + response = requests.get(link, timeout=60) return response except Exception: time.sleep(0.5 * num) return response_wrapper(link, num + 1) - -""" -This method returns the html of a summary page. -Cellar specific, works for celex id's starting a 6 and 8. -""" - - def get_summary_html(celex): + """ + This method returns the html of a summary page. + Cellar specific, works for celex id's starting a 6 and 8. + """ if celex == celex: # nan check if ";" in celex: idss = celex.split(";") @@ -82,18 +77,16 @@ def get_summary_html(celex): else: return "No summary available" - -""" -Method used to extract the summary from a html page. -Cellar specific, uses get_words_from_keywords. -Currently only walking for celex id's starting with a 6 ( EU cases). -""" - - def get_summary_from_html(html, starting): + """ + Method used to extract the summary from a html page. + Cellar specific, uses get_words_from_keywords. + Currently only walking for celex id's starting with a 6 ( EU cases). + # This method turns the html code from the summary page into text # It has different cases depending on the first character of the CELEX ID # Should only be used for summaries extraction + """ text = get_full_text_from_html(html) if starting == "8": return "No summary available" @@ -107,17 +100,14 @@ def get_summary_from_html(html, starting): return text return text - -""" -Method used to extract the keywords from a html page. -Cellar specific, uses get_words_from_keywords. -""" - - def get_keywords_from_html(html, starting): + """ + Method used to extract the keywords from a html page. + Cellar specific, uses get_words_from_keywords. 
# This method turns the html code from the summary page into text # It has different cases depending on the first character of the CELEX ID # Should only be used for summaries extraction + """ text = get_full_text_from_html(html) if starting == "8": text = "No keywords available" @@ -125,22 +115,18 @@ def get_keywords_from_html(html, starting): elif starting == "6": return get_words_from_keywords(text) - -""" - -Method used for citations extraction from eurlex webservices. -It reads the SOAP response from the webservices, and adds values to the dictionary based on the results. -Dictionary is using the celex id of a work as key and a list of celex id's of works cited as value. - -""" - - def extract_dictionary_from_webservice_query(response): + """ + Method used for citations extraction from eurlex webservices. + It reads the SOAP response from the webservices, and adds values to the + dictionary based on the results. Dictionary is using the celex id of a + work as key and a list of celex id's of works cited as value. + """ text = response.text read = xmltodict.parse(text) results = read['S:Envelope']['S:Body']['searchResults']['result'] dictionary = dict() - if type(results) == list: + if isinstance(results, list): for result in results: celex, citing = extract_citations_from_soap(result) dictionary[celex] = citing @@ -149,16 +135,11 @@ def extract_dictionary_from_webservice_query(response): dictionary[celex] = citing return dictionary - -""" - -Method used for citations extraction from eurlex webservices. -Reads the individual celex id and documents cited from a single result. - -""" - - def extract_citations_from_soap(results): + """ + Method used for citations extraction from eurlex webservices. + Reads the individual celex id and documents cited from a single result. 
+ """ main_content = results['content']['NOTICE']['WORK'] celex = main_content['ID_CELEX'].get('VALUE') try: @@ -166,7 +147,7 @@ def extract_citations_from_soap(results): except KeyError: return celex, "" citing_list = list() - if type(citing) == list: + if isinstance(citing, list): for cited in citing: celex_of_citation = get_citation_celex(cited) if celex_of_citation != "": @@ -175,20 +156,16 @@ def extract_citations_from_soap(results): else: return celex, get_citation_celex(citing) - -""" - -Method used for citations extraction from eurlex webservices. -Goes thru all of the different id's of the document cited, and returns the one that is a celex id. - -""" - - def get_citation_celex(cited): + """ + Method used for citations extraction from eurlex webservices. + Goes thru all of the different id's of the document cited, + and returns the one that is a celex id. + """ identifiers = cited['SAMEAS'] - if type(identifiers) == list: - for id in identifiers: - ident = id['URI']['IDENTIFIER'] + if isinstance(identifiers, list): + for _id in identifiers: + ident = _id['URI']['IDENTIFIER'] if is_celex_id(ident): return ident else: @@ -197,31 +174,22 @@ def get_citation_celex(cited): return ident return "" - -""" - -Method checking if the id passed is a celex id, using regex. - -""" - - -def is_celex_id(id): - if id is None: +def is_celex_id(_id): + """ + Method checking if the id passed is a celex id, using regex. + """ + if _id is None: return False - if prog.match(id): + if prog.match(_id): return True else: return False - -""" -This method tries to extract only they keywords from a part of html page containing it. -They keywords on the page are always separated by " - " or other types of dashes. - -""" - - def get_words_from_keywords_em(text): + """ + This method tries to extract only they keywords from a part of html page containing it. + They keywords on the page are always separated by " - " or other types of dashes. 
+ """ lines = text.split(sep="\n") returner = set() for line in lines: @@ -242,15 +210,10 @@ def get_words_from_keywords_em(text): returner.update(line.split(sep=" - ")) return ";".join(returner) - -""" - -One of the methods used to extract keywords from summary text. - -""" - - def get_words_from_keywords(text): + """ + One of the methods used to extract keywords from summary text. + """ if "Keywords" in text: try: index = text.find("Keywords") @@ -269,15 +232,12 @@ def get_words_from_keywords(text): text = text[:index] return get_words_from_keywords_em(text) - -""" - This method turns the html code from the summary page into text. - It has different cases depending on the first character of the CELEX ID. - Universal method, also replaces all "," with "_". -""" - - def get_full_text_from_html(html_text): + """ + This method turns the html code from the summary page into text. + It has different cases depending on the first character of the CELEX ID. + Universal method, also replaces all "," with "_". + """ # This method turns the html code from the summary page into text # It has different cases depending on the first character of the CELEX ID # Should only be used for summaries extraction @@ -294,15 +254,12 @@ def get_full_text_from_html(html_text): text = text.replace(",", "_") return text - -""" -This method is a wrapped for the get_html_by_celex_id method imported from eurlex. -Sometimes thew websites do not load because of too many connections at once, -this method waits a bit and tries again for up to 5 tries. -""" - - def get_html_text_by_celex_id(id): + """ + This method is a wrapped for the get_html_by_celex_id method imported from eurlex. + Sometimes thew websites do not load because of too many connections at once, + this method waits a bit and tries again for up to 5 tries. 
+ """ link = "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:cIdHere&from=EN" final = id if id == id: # nan check @@ -321,14 +278,12 @@ def get_html_text_by_celex_id(id): else: return html.text - -""" -This method gets the page containing all document details for extracting the subject matter and -the case law directory codes. Uses the celex identifier of a case. -""" - - def get_entire_page(celex): + """ + This method gets the page containing all document details for extracting + the subject matter and + the case law directory codes. Uses the celex identifier of a case. + """ link = 'https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:cIdHere' if celex == celex: # nan check if ";" in celex: @@ -353,14 +308,11 @@ def get_entire_page(celex): except Exception: return "No data available" - -""" -This Method gets the subject matter from a fragment of code containing them. -Used for extracting subject matter for cellar cases only. -""" - - def get_subject(text): + """ + This Method gets the subject matter from a fragment of code containing them. + Used for extracting subject matter for cellar cases only. + """ try: index_matter = text.index("Subject matter:") try: @@ -375,14 +327,11 @@ def get_subject(text): subject = "" return subject - -""" -This Method extracts all eurovocs, from a fragment containing them. -Used for extracting eurovoc for cellar cases. -""" - - def get_eurovoc(text): + """ + This Method extracts all eurovocs, from a fragment containing them. + Used for extracting eurovoc for cellar cases. + """ try: start = text.find("EUROVOC") try: @@ -408,14 +357,11 @@ def get_eurovoc(text): except Exception: return "" - -""" -Method for getting all of the case directory codes for each cellar case. -Extracts them from a string containing the eurlex website containing all document information. -""" - - def get_codes(text): + """ + Method for getting all of the case directory codes for each cellar case. 
+ Extracts them from a string containing the eurlex website containing all document information. + """ try: index_codes = text.index("Case law directory code:") index_end = text.index("Miscellaneous information") @@ -431,11 +377,8 @@ def get_codes(text): index_start = indexes[x] getting_ending = extracting[index_start:] words_here = getting_ending.split() - for words in words_here: - if words is not words_here[0]: - if is_code(words): ending = getting_ending[2:].find(words) done = True @@ -444,14 +387,12 @@ def get_codes(text): code_text = getting_ending[:ending] else: code_text = getting_ending - codes_result.append(code_text.replace("\n", "")) code = ";".join(codes_result) except Exception: code = "" return code - def get_advocate_or_judge(text, phrase): """ :param text: full text of the info page of a case from eur-lex website @@ -475,7 +416,8 @@ def get_advocate_or_judge(text, phrase): def get_case_affecting(text): """ :param text: full text of the info page of a case from eur-lex website - :return: The celex id's of case affecting listed + entire string data with more information about the case affecting + :return: The celex id's of case affecting listed + entire string data with + more information about the case affecting """ phrase = 'Case affecting:' try: @@ -523,6 +465,5 @@ def get_citations_with_extra_info(text): data_list.append(fixed_line) else: return ";".join(data_list) - except: - return '' + return '' \ No newline at end of file From c355d00a9c38cc31e3ee6dbf45b16bd53b6f65c5 Mon Sep 17 00:00:00 2001 From: shashankmc Date: Thu, 28 Mar 2024 08:42:40 +0100 Subject: [PATCH 16/78] Linted code, corrected variable names that are similar to inbuilt references for fulltext_saving --- cellar/cellar_extractor/fulltext_saving.py | 115 ++++++++++----------- 1 file changed, 57 insertions(+), 58 deletions(-) diff --git a/cellar/cellar_extractor/fulltext_saving.py b/cellar/cellar_extractor/fulltext_saving.py index 44af01c..1a9eaa8 100644 --- 
a/cellar/cellar_extractor/fulltext_saving.py +++ b/cellar/cellar_extractor/fulltext_saving.py @@ -1,22 +1,22 @@ -import pandas as pd +import json import threading +import time +import pandas as pd from cellar_extractor.eurlex_scraping import * -import json from tqdm import tqdm -import time - -""" -This is the method executed by individual threads by the add_sections method. - -The big dataset is divided in parts, each thread gets its portion of work to do. -They add their portions of columns to corresponding lists, -after all the threads are done the individual parts are put together. -""" -def execute_sections_threads(celex, eclis, start, list_sum, list_key, list_full, list_codes, list_eurovoc, list_adv, - list_judge, list_affecting_id, list_affecting_str,list_citations_extra, progress_bar): - sum = pd.Series([], dtype='string') +def execute_sections_threads(celex, eclis, start, list_sum, list_key, list_full, + list_codes, list_eurovoc, list_adv, list_judge, + list_affecting_id, list_affecting_str,list_citations_extra, + progress_bar): + """ + This is the method executed by individual threads by the add_sections method. + The big dataset is divided in parts, each thread gets its portion of work to do. + They add their portions of columns to corresponding lists, + after all the threads are done the individual parts are put together. 
+ """ + _sum = pd.Series([], dtype='string') key = pd.Series([], dtype='string') full = list() case_codes = pd.Series([], dtype='string') @@ -28,34 +28,34 @@ def execute_sections_threads(celex, eclis, start, list_sum, list_key, list_full, citations_extra = pd.Series([], dtype='string') for i in range(len(celex)): j = start + i - id = celex[j] + _id = celex[j] ecli = eclis[j] - html = get_html_text_by_celex_id(id) + html = get_html_text_by_celex_id(_id) if html != "404": text = get_full_text_from_html(html) json_text = { - 'celex': str(id), + 'celex': str(_id), 'ecli': ecli, 'text': text } full.append(json_text) else: json_text = { - 'celex': str(id), + 'celex': str(_id), 'ecli': ecli, 'text': "" } full.append(json_text) - summary = get_summary_html(id) + summary = get_summary_html(_id) if summary != "No summary available": text = get_keywords_from_html(summary, id[0]) text2 = get_summary_from_html(summary, id[0]) key[j] = text - sum[j] = text2 + _sum[j] = text2 else: key[j] = "" - sum[j] = "" - entire_page = get_entire_page(id) + _sum[j] = "" + entire_page = get_entire_page(_id) text = get_full_text_from_html(entire_page) if entire_page != "No data available": code = get_codes(text) @@ -82,7 +82,7 @@ def execute_sections_threads(celex, eclis, start, list_sum, list_key, list_full, citations_extra[j] = citation_extra progress_bar.update(1) - list_sum.append(sum) + list_sum.append(_sum) list_key.append(key) list_full.append(full) list_codes.append(case_codes) @@ -93,51 +93,53 @@ def execute_sections_threads(celex, eclis, start, list_sum, list_key, list_full, list_affecting_str.append(affecting_str) list_citations_extra.append(citations_extra) -""" -This method adds the following sections to a pandas dataframe, as separate columns: - -Full Text -Case law directory codes -Keywords -Summary -Advocate General -Judge Rapporteur -Case affecting (CELEX ID) -Case affecting string (entire str with more info) - -Method is cellar-specific, scraping html from 
https://eur-lex.europa.eu/homepage.html. -It operates with multiple threads, using that feature is recommended as it speeds up the entire process. -""" +def add_sections(data, threads, json_filepath=None): + """ + This method adds the following sections to a pandas dataframe, as separate columns: + Full Text + Case law directory codes + Keywords + Summary + Advocate General + Judge Rapporteur + Case affecting (CELEX ID) + Case affecting string (entire str with more info) -def add_sections(data, threads, json_filepath=None): + Method is cellar-specific, scraping html from https://eur-lex.europa.eu/homepage.html. + It operates with multiple threads, using that feature is recommended as it speeds up + the entire process. + """ celex = data.loc[:, 'CELEX IDENTIFIER'] eclis = data.loc[:, 'ECLI'] length = celex.size time.sleep(1) - bar = tqdm(total=length, colour="GREEN", miniters=int(length/100), position=0, leave=True, maxinterval=10000) + _bar = tqdm(total=length, colour="GREEN", miniters=int(length/100), + position=0, leave=True, maxinterval=10000) if length > threads: # to avoid getting problems with small files at_once_threads = int(length / threads) else: at_once_threads = length threads = [] - list_sum = list() - list_key = list() - list_full = list() - list_codes = list() - list_eurovoc = list() - list_adv = list() - list_judge = list() - list_affecting_id = list() - list_affecting_str = list() - list_citations_extra = list() + list_sum = [] + list_key = [] + list_full = [] + list_codes = [] + list_eurovoc = [] + list_adv = [] + list_judge = [] + list_affecting_id = [] + list_affecting_str = [] + list_citations_extra = [] for i in range(0, length, at_once_threads): curr_celex = celex[i:(i + at_once_threads)] curr_ecli = eclis[i:(i + at_once_threads)] t = threading.Thread(target=execute_sections_threads, args=( - curr_celex, curr_ecli, i, list_sum, list_key, list_full, list_codes, list_eurovoc, - list_adv, list_judge, list_affecting_id, 
list_affecting_str,list_citations_extra, bar)) + curr_celex, curr_ecli, i, list_sum, list_key, list_full, + list_codes, list_eurovoc, list_adv, list_judge, + list_affecting_id, list_affecting_str, + list_citations_extra, _bar)) threads.append(t) for t in threads: t.start() @@ -164,13 +166,10 @@ def add_sections(data, threads, json_filepath=None): json_file.extend(l) return json_file - -""" -Used for adding columns easier to a dataframe for add_sections(). -""" - - def add_column_frow_list(data, name, list): + """ + Used for adding columns easier to a dataframe for add_sections(). + """ column = pd.Series([], dtype='string') for l in list: column = column.append(l) From acd033e9aec63d2738ec1ba58434362067f5c5a1 Mon Sep 17 00:00:00 2001 From: shashankmc Date: Thu, 28 Mar 2024 08:43:12 +0100 Subject: [PATCH 17/78] Linted code for json_to_csv file --- cellar/cellar_extractor/json_to_csv.py | 55 +++++++++++++------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/cellar/cellar_extractor/json_to_csv.py b/cellar/cellar_extractor/json_to_csv.py index 1a1e62e..8b2e301 100644 --- a/cellar/cellar_extractor/json_to_csv.py +++ b/cellar/cellar_extractor/json_to_csv.py @@ -1,40 +1,44 @@ import csv import re +import sys import warnings +from io import StringIO from bs4 import BeautifulSoup -import sys import pandas as pd -from io import StringIO + warnings.filterwarnings("ignore") -X = ['WORK IS CREATED BY AGENT (AU)', 'CASE LAW COMMENTED BY AGENT', 'CASE LAW HAS A TYPE OF PROCEDURE', - 'LEGAL RESOURCE USES ORIGINALLY LANGUAGE', 'CASE LAW USES LANGUAGE OF PROCEDURE', - 'CASE LAW HAS A JUDICIAL PROCEDURE TYPE', 'WORK HAS RESOURCE TYPE', 'LEGAL RESOURCE BASED ON TREATY CONCEPT', +X = ['WORK IS CREATED BY AGENT (AU)', 'CASE LAW COMMENTED BY AGENT', + 'CASE LAW HAS A TYPE OF PROCEDURE', 'LEGAL RESOURCE USES ORIGINALLY LANGUAGE', + 'CASE LAW USES LANGUAGE OF PROCEDURE', 'CASE LAW HAS A JUDICIAL PROCEDURE TYPE', + 'WORK HAS RESOURCE TYPE', 'LEGAL RESOURCE BASED ON 
TREATY CONCEPT', 'CASE LAW ORIGINATES IN COUNTRY OR USES A ROLE QUALIFIER', 'CASE LAW ORIGINATES IN COUNTRY', - 'CASE LAW DELIVERED BY COURT FORMATION', 'LEGAL RESOURCE IS ABOUT SUBJECT MATTER', 'RELATED JOURNAL ARTICLE', - 'CASE LAW DELIVERED BY ADVOCATE GENERAL', 'CASE LAW DELIVERED BY JUDGE', 'ECLI', - 'CASE LAW INTERPRETS LEGAL RESOURCE', 'NATIONAL JUDGEMENT', 'DATE_CREATION_LEGACY', 'DATETIME NEGOTIATION', - 'SEQUENCE OF VALUES', 'DATE OF REQUEST FOR AN OPINION', 'CELEX IDENTIFIER', 'SECTOR IDENTIFIER', - 'NATURAL NUMBER (CELEX)', 'TYPE OF LEGAL RESOURCE', 'YEAR OF THE LEGAL RESOURCE', 'WORK CITES WORK. CI / CJ', - 'LEGACY DATE OF CREATION OF WORK', 'DATE OF DOCUMENT', 'IDENTIFIER OF DOCUMENT', 'WORK VERSION', + 'CASE LAW DELIVERED BY COURT FORMATION', 'LEGAL RESOURCE IS ABOUT SUBJECT MATTER', + 'RELATED JOURNAL ARTICLE', 'CASE LAW DELIVERED BY ADVOCATE GENERAL', + 'CASE LAW DELIVERED BY JUDGE', 'ECLI', 'CASE LAW INTERPRETS LEGAL RESOURCE', + 'NATIONAL JUDGEMENT', 'DATE_CREATION_LEGACY', 'DATETIME NEGOTIATION', + 'SEQUENCE OF VALUES', 'DATE OF REQUEST FOR AN OPINION', 'CELEX IDENTIFIER', + 'SECTOR IDENTIFIER', 'NATURAL NUMBER (CELEX)', 'TYPE OF LEGAL RESOURCE', + 'YEAR OF THE LEGAL RESOURCE', 'WORK CITES WORK. 
CI / CJ', 'LEGACY DATE OF CREATION OF WORK', + 'DATE OF DOCUMENT', 'IDENTIFIER OF DOCUMENT', 'WORK VERSION', 'LAST CMR MODIFICATION DATE', 'CASE LAW HAS CONCLUSIONS'] Y = ['LEGAL RESOURCE HAS TYPE OF ACT', 'WORK HAS RESOURCE TYPE', 'CASE LAW ORIGINATES IN COUNTRY', 'LEGAL RESOURCE IS ABOUT SUBJECT MATTER', 'ECLI', 'REFERENCE TO PROVISIONS OF NATIONAL LAW', - 'PUBLICATION REFERENCE OF COURT DECISION', 'CELEX IDENTIFIER', 'LOCAL IDENTIFIER', 'SECTOR IDENTIFIER', - 'TYPE OF LEGAL RESOURCE', 'YEAR OF THE LEGAL RESOURCE', 'WORK IS CREATED BY AGENT (AU)', - 'LEGACY DATE OF CREATION OF WORK', 'DATE OF DOCUMENT', 'IDENTIFIER OF DOCUMENT', 'WORK TITLE', 'CMR CREATION DATE', - 'LAST CMR MODIFICATION DATE', 'CASE LAW DELIVERED BY NATIONAL COURT', 'REFERENCE TO A EUROPEAN ACT IN FREE TEXT', + 'PUBLICATION REFERENCE OF COURT DECISION', 'CELEX IDENTIFIER', 'LOCAL IDENTIFIER', + 'SECTOR IDENTIFIER', 'TYPE OF LEGAL RESOURCE', 'YEAR OF THE LEGAL RESOURCE', + 'WORK IS CREATED BY AGENT (AU)', 'LEGACY DATE OF CREATION OF WORK', 'DATE OF DOCUMENT', + 'IDENTIFIER OF DOCUMENT', 'WORK TITLE', 'CMR CREATION DATE', + 'LAST CMR MODIFICATION DATE', 'CASE LAW DELIVERED BY NATIONAL COURT', + 'REFERENCE TO A EUROPEAN ACT IN FREE TEXT', 'CASE LAW BASED ON A LEGAL INSTRUMENT', 'PARTIES OF THE CASE LAW'] COLS = set(X + Y) COLS = sorted(COLS) -""" -Method used after the json to csv conversion, to save the file in the processed directory. -""" - - def create_csv(filepath, encoding="UTF8", data=None): + """ + Method used after the json to csv conversion, to save the file in the processed directory. + """ if data != "": csv_file = open(filepath, 'w', encoding=encoding) csv_writer = csv.writer(csv_file) @@ -42,14 +46,11 @@ def create_csv(filepath, encoding="UTF8", data=None): csv_writer.writerows(data) csv_file.close() - -""" -Method used to transform the json file received from cellar_extraction to a csv file. -Cellar specific, sets specific columns with names defined at the beginning of file as COLS. 
-""" - - def json_to_csv(json_data): + """ + Method used to transform the json file received from cellar_extraction to a csv file. + Cellar specific, sets specific columns with names defined at the beginning of file as COLS. + """ final_data = [] for data in json_data: ecli_data = json_data[data] From f541100e1a822b1450f163b637a0ec598568cc9e Mon Sep 17 00:00:00 2001 From: shashankmc Date: Thu, 28 Mar 2024 08:43:44 +0100 Subject: [PATCH 18/78] Linted code, changed for loop to use enumerate rather than range and len. --- cellar/cellar_extractor/nodes_and_edges.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cellar/cellar_extractor/nodes_and_edges.py b/cellar/cellar_extractor/nodes_and_edges.py index 9578c36..843883e 100644 --- a/cellar/cellar_extractor/nodes_and_edges.py +++ b/cellar/cellar_extractor/nodes_and_edges.py @@ -1,10 +1,13 @@ import pandas as pd + def extract_containing_subject_matter(df,phrase): returner = df[df["LEGAL RESOURCE IS ABOUT SUBJECT MATTER"].str.contains(phrase, na=False)] return returner + def get_df_with_celexes(df,celexes): returner = df[df['CELEX IDENTIFIER'].isin(celexes)] return returner + def get_edges_list(df): extraction = df[['CELEX IDENTIFIER','citing']] extraction.reset_index(inplace=True) @@ -12,7 +15,7 @@ def get_edges_list(df): vals = extraction['citing'] nodes = set() edges = list() - for i in range(len(keys)): + for i in enumerate(keys): k = keys[i] val = vals[i] if val == val: @@ -24,6 +27,7 @@ def get_edges_list(df): else: pass return edges, list(nodes) + def get_nodes_and_edges(df): edges, nodes = get_edges_list(df) #nodes = get_df_with_celexes(df,celexes) From 14ffecc3604982ab9465eb2377d8daae25aefd24 Mon Sep 17 00:00:00 2001 From: shashankmc Date: Thu, 28 Mar 2024 08:45:04 +0100 Subject: [PATCH 19/78] Linted code, changed conditions from != None to is not None for code conformity, changed variable names to avoid inbuilt references. 
--- .../cellar_extractor/operative_extractions.py | 238 +++++++++--------- 1 file changed, 112 insertions(+), 126 deletions(-) diff --git a/cellar/cellar_extractor/operative_extractions.py b/cellar/cellar_extractor/operative_extractions.py index deb0b22..1bb5f43 100644 --- a/cellar/cellar_extractor/operative_extractions.py +++ b/cellar/cellar_extractor/operative_extractions.py @@ -1,155 +1,145 @@ - -import requests -from bs4 import BeautifulSoup - import csv import json - +import requests +from bs4 import BeautifulSoup class Analyzer(): """ - This class returns a list of the operative part for a given celex id . Celex id is initialized through a constructor. + This class returns a list of the operative part for a given celex id. + Celex id is initialized through a constructor. """ celex: str # declare celex as a string - - def __init__(self, celex): # Initialize Celex id as a constructor , passed when calling the class + url: str # declare url as a string + def __init__(self, celex): + # Initialize Celex id as a constructor, passed when calling the class self.celex = celex - + self.url = "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A\ + {self.celex}&from=EN" def html_page_structure_one(self) -> list: """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a nested + This function retreives operative part from documents of the respected celex id's. + This function scrapes/parse the operative part from a nested table structure . The relevant text lies inside the coj-bold class of the span tag. 
""" - website = requests.get( - f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text parser = BeautifulSoup(website, 'lxml') div = parser.find_all('table') # Find all tables tag from the website one = [] for divs in div: # Find each nested table within the table table = divs.find('table') - if table != None: + if table is not None: # Find all p under the nested table with the coj-normal class p = table.find_all('p', class_="coj-normal") for x in p: # Span class of coj-bold under the p tag span = x.find_all('span', class_="coj-bold") for y in span: - if x != None and y != None: - + if x is not None and y is not None: # append text from span onto a list one.append(y.text) return one def html_page_structure_two(self) -> list: """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph - (p) structure . The relevant text lies inside the normal class of the p tag which comes after the keyword operative of the previous span tag. + This function retreives operative part from documents of the respected celex id's. + This function scrapes/parse the operative part from a paragraph + (p) structure . The relevant text lies inside the normal class of the p tag which + comes after the keyword operative of the previous span tag. 
""" - website = requests.get( - f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text parser = BeautifulSoup(website, 'lxml') p = parser.find_all('p') two = [] for para in p: - span = para.find('span') - if span != None: - + if span is not None: if "operative" in span.text.lower(): normal = span.find_all_next('p', class_="normal") for op in normal: - two.append(op.text) return two def structure_three(self) -> list: """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a nested - table structure . The relevant text lies inside the coj-bold class of the span tag. + This function retreives operative part from documents of the respected celex id's. + This function scrapes/parse the operative part from a nested + table structure. The relevant text lies inside the coj-bold class of the span tag. """ - website = requests.get( - f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text parser = BeautifulSoup(website, 'lxml') table = parser.find_all('table') three = [] for tables in table: interior = tables.find_all('table') for interiors in interior: - if interiors != None: + if interiors is not None: p = interiors.find_all('p', class_="coj-normal") for x in p: span = x.find_all('span', class_="coj-bold") for y in span: - if x != None and y != None: - + if x is not None and y is not None: three.append(y.text) return three def structure_four(self) -> list: """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph - (p) structure . The relevant text lies inside the p tag which comes after the keyword operative of the previous span tag. 
+ This function retreives operative part from documents of the respected celex id's. + This function scrapes/parse the operative part from a paragraph + (p) structure . The relevant text lies inside the p tag which comes after the + keyword operative of the previous span tag. """ - website = requests.get( - f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text parser = BeautifulSoup(website, 'lxml') p = parser.find_all('p') four = [] for para in p: - span = para.find('span') - if span != None: - + if span is not None: if "operative" in span.text.lower(): normal = span.find_all_next('table') for op in normal: tbody = op.find('tbody') new_p = tbody.find_all('p', class_="oj-normal") - for subsequent in new_p: - if subsequent != None: - + if subsequent is not None: four.append(subsequent.text) - return four def structure_five(self) -> list: """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph - (p) structure . The relevant text lies inside the normal class of the p tag which comes after the keyword operative of the previous span tag. + This function retreives operative part from documents of the respected celex id's. + This function scrapes/parse the operative part from a paragraph + (p) structure. The relevant text lies inside the normal class of the p tag which + comes after the keyword operative of the previous span tag. 
""" - website = requests.get( - f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text parser = BeautifulSoup(website, 'lxml') p = parser.find_all('p') five = [] for para in p: span = para.find('span') - if span != None: - + if span is not None: if "operative" in span.text.lower(): normal = span.find_all_next('table') for op in normal: tbody = op.find('tbody') new_p = tbody.find_all('p', class_="normal") - for subsequent in new_p: - if subsequent != None: - + if subsequent is not None: five.append(subsequent.text) return five def structure_six(self) -> list: """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a h2 - (header) structure . The relevant text lies inside thee p tag which comes after the keyword operative part of the respective h2 tag. + This function retreives operative part from documents of the respected celex id's. + This function scrapes/parse the operative part from a h2 (header) structure. + The relevant text lies inside thee p tag which comes after the keyword operative + part of the respective h2 tag. """ - - website = requests.get( - f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text parser = BeautifulSoup(website, 'lxml') div = parser.find_all('h2') six = [] @@ -164,11 +154,12 @@ def structure_six(self) -> list: def structure_seven(self) -> list: """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a table - (table) structure . The relevant text lies inside the span tag which comes after the p tag , with the class name=normal. + This function retreives operative part from documents of the respected celex id's. 
+ This function scrapes/parse the operative part from a table + (table) structure. The relevant text lies inside the span tag which comes after + the p tag , with the class name=normal. """ - website = requests.get( - f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text parser = BeautifulSoup(website, 'lxml') div = parser.find_all('table') seven = [] @@ -176,68 +167,66 @@ def structure_seven(self) -> list: # find tbody within the table table = divs.find_all('tbody') for tables in table: - if tables != None: + if tables is not None: # find tr within the tbody p = tables.find_all('tr') for x in p: - if x != None: + if x is not None: # find td within the tr td = x.find_all('td') for y in td: - if y != None: + if y is not None: p = y.find_all('p', class_="normal") - for all in p: - if all != None: + for _all in p: + if _all is not None: # find operative part within the span - span = all.find_all( + span = _all.find_all( 'span', class_="bold") for spans in span: - # APpend it into a list and return the list when the function is called + # Append it into a list and return the + # list when the function is called seven.append(spans.text) return seven def structure_eight(self) -> list: """ - This function retreives operative part from documents of the respected celex id's .The text is extracted from the span tag nested inside + This function retreives operative part from documents of the respected celex id's. + The text is extracted from the span tag nested inside the tbody tag.Returns a list as output. 
""" - website = requests.get( - f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text parser = BeautifulSoup(website, 'lxml') tbody = parser.find_all('tbody') eight = [] - for all in tbody: - if all != None: - tr = all.find_all('tr') + for _all in tbody: + if _all is not None: + tr = _all.find_all('tr') for trs in tr: - if trs != None: - + if trs is not None: p = parser.find_all('p', class_="normal") for paras in p: - if paras != None: + if paras is not None: if "on those grounds" in paras.text.lower(): - span = paras.find_all_next( 'span', class_="bold") for spans in span: - if spans != None: + if spans is not None: eight.append(spans.text) - return eight def structure_nine(self) -> list: """ - This function retreives operative part from documents of the respected celex id's .The operative part is under the bold(b) + This function retreives operative part from documents of the respected celex id's. + The operative part is under the bold(b) tag after the p tag where the keywords "on those grounds" exist. """ - website = requests.get( - f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text parser = BeautifulSoup(website, 'lxml') nine = [] div = parser.find_all('p') for divs in div: - if divs != None: + if divs is not None: if "on those grounds" in divs.text.lower(): b = divs.find_all_next('b') for bolds in b: @@ -247,34 +236,34 @@ def structure_nine(self) -> list: def structure_eleven(self) -> list: """ - This function retreives operative part from documents of the respected celex id's .The operative part is under the paragraph(p) + This function retreives operative part from documents of the respected celex id's. + The operative part is under the paragraph(p) tag after the b tag where the keywords "operative part" exist. 
""" - website = requests.get( - f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text parser = BeautifulSoup(website, 'lxml') bold = parser.find_all('b') eleven = [] for b in bold: - if b != None: + if b is not None: if "operative part" in b.text.lower(): - table = b.find_all_next('p') - for tables in table: - if tables != None: - eleven.append(tables.text) - + tables = b.find_all_next('p') + for table in tables: + if table is not None: + eleven.append(table.text) return eleven def structure_ten(self): """ - This function retreives operative part from documents of the respected celex id's Since the ocntent is preloaded using js/client s - server side functions , the text from the current page is retrieved and the operative part is scraped after the occurence of the phrase + This function retreives operative part from documents of the respected celex id's. + Since the ocntent is preloaded using js/clients + server side functions , the text from the current page is retrieved and the + operative part is scraped after the occurence of the phrase "On those grounds". 
""" - website = requests.get( - f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text + website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text parser = BeautifulSoup(website, 'lxml') appender = [] for string in parser.stripped_strings: @@ -282,7 +271,7 @@ def structure_ten(self): appender.append(string) found = False - afterGrounds = [] + after_grounds = [] for x in appender: @@ -291,33 +280,32 @@ def structure_ten(self): if found: if len(x.split(" ")) > 3: - afterGrounds.append(x) - return afterGrounds + after_grounds.append(x) + return after_grounds def __call__(self) -> list: """ - This inbuilt __call__ function loops through all the methods in the class `Analyzer` and returns the list , with values of the operative part . + This inbuilt __call__ function loops through all the methods in the class + `Analyzer` and returns the list , with values of the operative part . """ - container = [self.html_page_structure_one(), self.html_page_structure_two(), self.structure_three(), self.structure_four(), self.structure_five(), - self.structure_six(), self.structure_seven(), self.structure_eight(), self.structure_nine(), self.structure_ten(), self.structure_eleven()] + container = [self.html_page_structure_one(), self.html_page_structure_two(), + self.structure_three(), self.structure_four(), self.structure_five(), + self.structure_six(), self.structure_seven(), self.structure_eight(), + self.structure_nine(), self.structure_ten(), self.structure_eleven()] one: list - for funcs in range(len(container)): - + for funcs in enumerate(container): one = container[funcs] - if one: if (len(one) != 0 or one[0] != "\n"): print("here") return one - - - - + return None class Writing(): """ - This class has different methods , for the purpose of writing the operative part into different file formats.(Csv,txt,json) + This class has different methods, for the purpose of writing the operative part + into different file 
formats.(Csv,txt,json) """ instance: str @@ -330,38 +318,36 @@ def __init__(self, celex: str): self.x = self.instance() def to_csv(self): - file = open("csv/output.csv", "a+") - writer = csv.writer(file) - - if self.x != None: + _file = open("csv/output.csv", "a+", encoding="utf-8") + writer = csv.writer(_file) + if self.x is not None: writer.writerow([self.celex, self.x]) def to_json(self): - if self.x != None: + if self.x is not None: data = {'Celex': self.celex, "Operative part": self.x} - file = open('json/data.json', 'a+') - json.dump(data, file) - file.close() + _file = open('json/data.json', 'a+', encoding='utf-8') + json.dump(data, _file) + _file.close() def to_txt(self): - if self.x != None: - file = open(f"txt/{self.celex}.txt", "a") + if self.x is not None: + _file = open(f"txt/{self.celex}.txt", "a", encoding="utf-8") for w in self.x: - - file.write(w+"\n") - file.close() + _file.write(w+"\n") + _file.close() # Sample code for reading celex id's froma tsv file -file = open("gijs_202310_node_list.tsv", "r") +file = open("gijs_202310_node_list.tsv", "r", encoding="utf-8") reader = csv.reader(file) testing = [] for row in reader: for rows in row: if "Id" not in rows: testing.append(rows.split("\t")[0]) -for all in testing: - instance = Writing(all) +for _all in testing: + instance = Writing(_all) instance.to_csv() - print(all) + print(_all) From 2886aef8687c0dc20c8bf8d966342de80f532bc0 Mon Sep 17 00:00:00 2001 From: shashankmc Date: Thu, 28 Mar 2024 08:45:22 +0100 Subject: [PATCH 20/78] code linting for sparql file --- cellar/cellar_extractor/sparql.py | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/cellar/cellar_extractor/sparql.py b/cellar/cellar_extractor/sparql.py index de989ce..c41a49c 100644 --- a/cellar/cellar_extractor/sparql.py +++ b/cellar/cellar_extractor/sparql.py @@ -1,5 +1,6 @@ from SPARQLWrapper import SPARQLWrapper, JSON, CSV, POST import requests + def 
run_eurlex_webservice_query(query_input,username,password): target = "https://eur-lex.europa.eu/EURLexWebService?wsdl" query = ''' @@ -22,15 +23,12 @@ def run_eurlex_webservice_query(query_input,username,password): ''' % (username, password,query_input) return requests.request("POST", target, data=query, allow_redirects=True) - -""" -Method acquired from a different law and tech project for getting the citations of a source_celex. -Unlike get_citations_csv, only works for one source celex at once. Returns a set containing all the works cited by -the source celex. -""" - def get_citations(source_celex, cites_depth=1, cited_depth=1): """ + Method acquired from a different law and tech project for getting the citations of a + source_celex. + Unlike get_citations_csv, only works for one source celex at once. Returns a set + containing all the works cited by the source celex. Gets all the citations one to X steps away. Hops can be specified as either the source document citing another (defined by `cites_depth`) or another document citing it (`cited_depth`). Any numbers higher than 1 denote that new source document @@ -69,18 +67,16 @@ def get_citations(source_celex, cites_depth=1, cited_depth=1): for bind in ret['results']['bindings']: target = bind['name2']['value'] targets.add(target) - targets = set([el for el in list(targets)]) # Filters the list. Filter type: '3'=legislation, '6'=case law. - + # Filters the list. Filter type: '3'=legislation, '6'=case law. + targets = set([el for el in list(targets)]) return targets - -""" -Method sending a query to the endpoint, which asks for cited works for each celex. -The celex variable in the method is a list of all the celex identifiers of the cases we need the citations of. -The query returns a csv, containing all of the data needed.""" - - def get_citations_csv(celex): + """ + Method sending a query to the endpoint, which asks for cited works for each celex. 
+ The celex variable in the method is a list of all the celex identifiers of the + cases we need the citations of. + The query returns a csv, containing all of the data needed.""" endpoint = 'https://publications.europa.eu/webapi/rdf/sparql' input_celex = '", "'.join(celex) query = ''' From 6ad2cab2cce17b6b3c1281260c635757e078a8f1 Mon Sep 17 00:00:00 2001 From: shashankmc Date: Thu, 28 Mar 2024 08:45:50 +0100 Subject: [PATCH 21/78] Code linting for Testing file --- cellar/cellar_extractor/Testing_file.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/cellar/cellar_extractor/Testing_file.py b/cellar/cellar_extractor/Testing_file.py index 9418533..bf932a8 100644 --- a/cellar/cellar_extractor/Testing_file.py +++ b/cellar/cellar_extractor/Testing_file.py @@ -1,10 +1,6 @@ """ - This file is purely a testing file for trying out separate parts of code, testing if everything works and such. Can be also used to develop future code. - - - """ from nodes_and_edges import get_nodes_and_edges @@ -27,11 +23,14 @@ for rows in row: if "Id" not in rows: testing.append(rows.split("\t")[0]) + class Test(unittest.TestCase): """ - class for unittesing operative part , it checks whether the list returns null value or has some value. + class for unittesing operative part , it checks whether the list returns null value + or has some value. 
""" ids:list + def __init__(self,ids): self.ids=ids @@ -48,24 +47,19 @@ def test_for_celex_id(self): # self.assertFalse(len(test_instance)<=1) try: - self.assertTrue(test_instance[0],f"{id} is not empty and has operative part") + self.assertTrue(test_instance[0],f"{id} is not empty and has operative part") count_pass+=1 print(f"{id} ---> PASSED.") except: - print(f"{id} ---> FAILED.") - print(f"Passed {count_pass}/{len(self.ids)} times") - # print(len(self.ids)-count,"were passed successfully") + print(f"{id} ---> FAILED.") + print(f"Passed {count_pass}/{len(self.ids)} times") + # print(len(self.ids)-count,"were passed successfully") new_list=[] for w in range(no_of_test_cases): randomized=random.randint(0,len(testing)-1) new_list.append(testing[randomized]) - - - - - if __name__ == '__main__': celex = "62004CJ0292" From db8688902e5d1928d81cc3f6ac5e6274bd603a78 Mon Sep 17 00:00:00 2001 From: shashankmc Date: Thu, 28 Mar 2024 08:46:55 +0100 Subject: [PATCH 22/78] Update gitignore file to not consider DS_Store files and venv directories --- .gitignore | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index aa30a5f..06a87e2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -venv +.venv* .idea data rechtspraak/rechtspraak_extractor/tests/data @@ -20,4 +20,6 @@ rechtspraak.zip build.bat echr_extractor-whl.zip echr_extractor-whl -echr_extractor.egg-info \ No newline at end of file +echr_extractor.egg-info + +.*DS_Store \ No newline at end of file From b1f24ee8e02a56fe37726caac6e4426fadf07b3d Mon Sep 17 00:00:00 2001 From: Vishal Venkat Raghavan Date: Wed, 10 Apr 2024 10:54:46 +0200 Subject: [PATCH 23/78] Unittests for operative part --- tests.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests.py b/tests.py index 4732cdb..8c36e70 100644 --- a/tests.py +++ b/tests.py @@ -1,4 +1,5 @@ from cellar_extractor import * +from 
extraction_libraries.cellar.cellar_extractor.operative_extractions import Writing def cellar_csv_n(): get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) @@ -70,3 +71,39 @@ def test_cellar_json_n(): assert True except Exception: assert False, "Downloading cellar as json failed." + + +# from operative_extractions import Analyzer,Writing + +import random +import csv +import json + +celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] + +celex:str +choice=random.randint(0,len(celex_store)) +celex=celex_store[choice] +def operative_part_csv(celex)->csv: + + csv_store=Writing(celex) + csv_store.to_csv() + if csv_store.to_csv(): + assert True + else: + assert False +def operative_part_json(celex)->json: + json_store=Writing(celex) + json_store.to_json() + if json_store.to_json(): + assert True + else: + assert False + +def operative_part_txt(celex): + txt_store=Writing(celex) + txt_store.to_txt() + if txt_store.to_txt(): + assert True + else: + assert False \ No newline at end of file From e88255c0a14ea8e19c6e7fdf161ca6dd3b0fb490 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 11 Apr 2024 12:20:12 +0200 Subject: [PATCH 24/78] Update operative_extractions.py --- .../cellar_extractor/operative_extractions.py | 38 +++++++++++-------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/cellar/cellar_extractor/operative_extractions.py b/cellar/cellar_extractor/operative_extractions.py index 1bb5f43..a0e3610 100644 --- a/cellar/cellar_extractor/operative_extractions.py +++ b/cellar/cellar_extractor/operative_extractions.py @@ -311,6 +311,25 @@ class Writing(): instance: str x: str parameter: str + txt = "txt" + json="json" + csv="csv" + txt_dir = os.path.join(current, txt) + csv_dir = os.path.join(current, csv) + json_dir = os.path.join(current, json) + + if not os.path.exists(txt_dir): + os.makedirs(txt_dir) + if not 
os.path.exists(csv_dir): + os.makedirs(csv_dir) + + if not os.path.exists(json_dir): + os.makedirs(json_dir) + + def __init__(self, celex: str): + self.celex = celex + self.instance = Analyzer(self.celex) + self.x = self.instance() def __init__(self, celex: str): self.celex = celex @@ -318,7 +337,7 @@ def __init__(self, celex: str): self.x = self.instance() def to_csv(self): - _file = open("csv/output.csv", "a+", encoding="utf-8") + _file = open("output.csv", "a+", encoding="utf-8") writer = csv.writer(_file) if self.x is not None: writer.writerow([self.celex, self.x]) @@ -326,7 +345,7 @@ def to_csv(self): def to_json(self): if self.x is not None: data = {'Celex': self.celex, "Operative part": self.x} - _file = open('json/data.json', 'a+', encoding='utf-8') + _file = open('data.json', 'a+', encoding='utf-8') json.dump(data, _file) _file.close() @@ -337,17 +356,4 @@ def to_txt(self): for w in self.x: _file.write(w+"\n") _file.close() -# Sample code for reading celex id's froma tsv file - - -file = open("gijs_202310_node_list.tsv", "r", encoding="utf-8") -reader = csv.reader(file) -testing = [] -for row in reader: - for rows in row: - if "Id" not in rows: - testing.append(rows.split("\t")[0]) -for _all in testing: - instance = Writing(_all) - instance.to_csv() - print(_all) + From 09865ff1173314b534c5717b176f1fd9d02e9d7a Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 11 Apr 2024 12:20:42 +0200 Subject: [PATCH 25/78] Update __init__.py --- cellar/cellar_extractor/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cellar/cellar_extractor/__init__.py b/cellar/cellar_extractor/__init__.py index 39184aa..ab2ae68 100644 --- a/cellar/cellar_extractor/__init__.py +++ b/cellar/cellar_extractor/__init__.py @@ -2,5 +2,6 @@ from cellar_extractor.cellar import get_cellar_extra from cellar_extractor.cellar import get_nodes_and_edges_lists from cellar_extractor.cellar import filter_subject_matter +from 
cellar_extractor.operative_extractions import Analyzer,Writing import logging -logging.basicConfig(level=logging.INFO) \ No newline at end of file +logging.basicConfig(level=logging.INFO) From 0d7c77cc670f70dea57632eef342cd49d50d7b3b Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 11 Apr 2024 12:26:46 +0200 Subject: [PATCH 26/78] Update tests.py --- tests.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests.py b/tests.py index 8c36e70..be3cef4 100644 --- a/tests.py +++ b/tests.py @@ -1,5 +1,4 @@ from cellar_extractor import * -from extraction_libraries.cellar.cellar_extractor.operative_extractions import Writing def cellar_csv_n(): get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) @@ -73,7 +72,6 @@ def test_cellar_json_n(): assert False, "Downloading cellar as json failed." -# from operative_extractions import Analyzer,Writing import random import csv @@ -106,4 +104,4 @@ def operative_part_txt(celex): if txt_store.to_txt(): assert True else: - assert False \ No newline at end of file + assert False From 01f7d84f78a6de415bba49c9bde884d929047a09 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 11 Apr 2024 12:29:06 +0200 Subject: [PATCH 27/78] Update tests.py --- tests.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests.py b/tests.py index be3cef4..c71e4ea 100644 --- a/tests.py +++ b/tests.py @@ -105,3 +105,20 @@ def operative_part_txt(celex): assert True else: assert False + + def test_for_celex_id(celex): + + count_fail:int + count_pass=0 + for id in self.ids: + test_output=Analyzer(celex) + test_instance=test_output() + + + try: + assert True + + + except: + assert False + From 642dbe31150486dc4bf144cb459d5324da2ce910 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 11 Apr 2024 12:29:33 +0200 Subject: [PATCH 28/78] Delete 
cellar/cellar_extractor/Testing_file.py --- cellar/cellar_extractor/Testing_file.py | 75 ------------------------- 1 file changed, 75 deletions(-) delete mode 100644 cellar/cellar_extractor/Testing_file.py diff --git a/cellar/cellar_extractor/Testing_file.py b/cellar/cellar_extractor/Testing_file.py deleted file mode 100644 index 4d0372a..0000000 --- a/cellar/cellar_extractor/Testing_file.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -This file is purely a testing file for trying out separate parts of code, testing if everything works and such. -Can be also used to develop future code. -""" - -from nodes_and_edges import get_nodes_and_edges -from os.path import join -from json_to_csv import read_csv -import time -from eurlex_scraping import * -from cellar import * -from sparql import * -import unittest -from operative_extraction import Analyzer -# from test import testing -import random -import csv -file=open("gijs_202310_node_list.tsv","r") -reader=csv.reader(file) -no_of_test_cases=30 -testing=[] -for row in reader: - for rows in row: - if "Id" not in rows: - testing.append(rows.split("\t")[0]) - -class Test(unittest.TestCase): - """ - class for unittesing operative part , it checks whether the list returns null value - or has some value. - """ - ids:list - - def __init__(self,ids): - self.ids=ids - - def test_for_celex_id(self): - """ - Main function which runs the unittest Testcase . 
- """ - count_fail:int - count_pass=0 - for id in self.ids: - test_output=Analyzer(id) - test_instance=test_output() - - # self.assertFalse(len(test_instance)<=1) - - try: - self.assertTrue(test_instance[0],f"{id} is not empty and has operative part") - count_pass+=1 - print(f"{id} ---> PASSED.") - except: - print(f"{id} ---> FAILED.") - print(f"Passed {count_pass}/{len(self.ids)} times") - # print(len(self.ids)-count,"were passed successfully") - -new_list=[] -for w in range(no_of_test_cases): - randomized=random.randint(0,len(testing)-1) - new_list.append(testing[randomized]) - -if __name__ == '__main__': - celex = "62004CJ0292" - - instance=Test([celex]) - instance.test_for_celex_id() - site = get_entire_page(celex) - text = get_full_text_from_html(site) - cits = get_citations_with_extra_info(text) - print(cits) - data = get_cellar(sd='2023-01-01',max_ecli=100,save_file='n') - d3 = filter_subject_matter(data, "prices") - b=2 - pass \ No newline at end of file From 8db810d59cf5d235d7a488138b46f9012b46ea9f Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 11 Apr 2024 12:33:09 +0200 Subject: [PATCH 29/78] Update operative_extractions.py --- cellar/cellar_extractor/operative_extractions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cellar/cellar_extractor/operative_extractions.py b/cellar/cellar_extractor/operative_extractions.py index a0e3610..4c46124 100644 --- a/cellar/cellar_extractor/operative_extractions.py +++ b/cellar/cellar_extractor/operative_extractions.py @@ -1,6 +1,7 @@ import csv import json import requests +import os from bs4 import BeautifulSoup class Analyzer(): From 3c32aaadadcd4cdcdbadf2a19a488e92d30ef7c8 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 11 Apr 2024 12:36:44 +0200 Subject: [PATCH 30/78] Update tests.py --- tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests.py b/tests.py index c71e4ea..861c701 100644 --- 
a/tests.py +++ b/tests.py @@ -106,7 +106,7 @@ def operative_part_txt(celex): else: assert False - def test_for_celex_id(celex): +def test_for_operative_part(celex): count_fail:int count_pass=0 From 28858a6ea9a3699dd92db70847d6aedd848c63f5 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 11 Apr 2024 12:45:46 +0200 Subject: [PATCH 31/78] Update tests.py --- tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests.py b/tests.py index 861c701..08def79 100644 --- a/tests.py +++ b/tests.py @@ -110,7 +110,7 @@ def test_for_operative_part(celex): count_fail:int count_pass=0 - for id in self.ids: + for id in celex_store: test_output=Analyzer(celex) test_instance=test_output() From 98ea00cf27540f6f8350ec5dfa630d2745e6ddf9 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 11 Apr 2024 12:52:25 +0200 Subject: [PATCH 32/78] Update tests.py --- tests.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/tests.py b/tests.py index 08def79..f56f41a 100644 --- a/tests.py +++ b/tests.py @@ -86,24 +86,27 @@ def operative_part_csv(celex)->csv: csv_store=Writing(celex) csv_store.to_csv() - if csv_store.to_csv(): - assert True - else: - assert False + try: + if csv_store.to_csv(): + assert True + except Exception: + assert False def operative_part_json(celex)->json: json_store=Writing(celex) json_store.to_json() - if json_store.to_json(): - assert True - else: + try: + if json_store.to_json(): + assert True + except Exception: assert False def operative_part_txt(celex): txt_store=Writing(celex) txt_store.to_txt() - if txt_store.to_txt(): - assert True - else: + try: + if txt_store.to_txt(): + assert True + except Exception: assert False def test_for_operative_part(celex): @@ -111,14 +114,16 @@ def test_for_operative_part(celex): count_fail:int count_pass=0 for id in celex_store: - test_output=Analyzer(celex) - 
test_instance=test_output() + try: + test_output=Analyzer(celex) + test_instance=test_output() assert True - except: + except Exception: assert False - + + From 8cae2fa368e901b4c5aa82d65e9b849b66161542 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 11 Apr 2024 15:32:53 +0200 Subject: [PATCH 33/78] Update tests.py --- tests.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests.py b/tests.py index f56f41a..92ddfcd 100644 --- a/tests.py +++ b/tests.py @@ -125,5 +125,9 @@ def test_for_operative_part(celex): except Exception: assert False - + +operative_part_txt(celex) +operative_part_csv(celex) +operative_part_json(celex) +test_for_operative_part(celex) From 8503d8d23a836875e09e3c5b821ce13f6dd15df6 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 11 Apr 2024 15:34:18 +0200 Subject: [PATCH 34/78] Update tests.py From 87271b8181dd968feba98ebd017c26495311f919 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 11 Apr 2024 15:39:53 +0200 Subject: [PATCH 35/78] Update tests.py --- tests.py | 46 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/tests.py b/tests.py index 92ddfcd..0f0ab69 100644 --- a/tests.py +++ b/tests.py @@ -109,7 +109,7 @@ def operative_part_txt(celex): except Exception: assert False -def test_for_operative_part(celex): +def for_operative_part(celex): count_fail:int count_pass=0 @@ -126,8 +126,44 @@ def test_for_operative_part(celex): except Exception: assert False -operative_part_txt(celex) -operative_part_csv(celex) -operative_part_json(celex) -test_for_operative_part(celex) + +def test_operative_part_txt(): + celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] + celex:str + choice=random.randint(0,len(celex_store)) + celex=celex_store[choice] + if operative_part_txt(celex): + 
assert True + except Exception: + assert False +def test_operative_part_json(): + celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] + celex:str + choice=random.randint(0,len(celex_store)) + celex=celex_store[choice] + if operative_part_json(celex): + assert True + except Exception: + assert False +def test_operative_part_csv(): + celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] + celex:str + choice=random.randint(0,len(celex_store)) + celex=celex_store[choice] + if operative_part_csv(celex): + assert True + except Exception: + assert False + +def test_for_operative_part(): + celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] + celex:str + choice=random.randint(0,len(celex_store)) + celex=celex_store[choice] + if test_for_operative_part(celex): + assert True + except Exception: + assert False + + From c3daf22eaa7099094ed090e8fe33d85d8d8afdb1 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 11 Apr 2024 15:42:22 +0200 Subject: [PATCH 36/78] Update tests.py --- tests.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests.py b/tests.py index 0f0ab69..844f9b9 100644 --- a/tests.py +++ b/tests.py @@ -1,5 +1,7 @@ from cellar_extractor import * - +import random +import csv +import json def cellar_csv_n(): get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) @@ -73,15 +75,9 @@ def test_cellar_json_n(): -import random -import csv -import json -celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] -celex:str -choice=random.randint(0,len(celex_store)) -celex=celex_store[choice] + def operative_part_csv(celex)->csv: csv_store=Writing(celex) From 428f466bab6210c3d424c3db3de09a770f5b32ab Mon Sep 17 00:00:00 2001 From: venvis 
<127123047+venvis@users.noreply.github.com> Date: Thu, 11 Apr 2024 16:27:50 +0200 Subject: [PATCH 37/78] Update tests.py --- tests.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests.py b/tests.py index 844f9b9..7296857 100644 --- a/tests.py +++ b/tests.py @@ -128,8 +128,9 @@ def test_operative_part_txt(): celex:str choice=random.randint(0,len(celex_store)) celex=celex_store[choice] - if operative_part_txt(celex): - assert True + try: + if operative_part_txt(celex): + assert True except Exception: assert False def test_operative_part_json(): @@ -137,8 +138,9 @@ def test_operative_part_json(): celex:str choice=random.randint(0,len(celex_store)) celex=celex_store[choice] - if operative_part_json(celex): - assert True + try: + if operative_part_json(celex): + assert True except Exception: assert False def test_operative_part_csv(): @@ -146,8 +148,9 @@ def test_operative_part_csv(): celex:str choice=random.randint(0,len(celex_store)) celex=celex_store[choice] - if operative_part_csv(celex): - assert True + try: + if operative_part_csv(celex): + assert True except Exception: assert False @@ -156,8 +159,9 @@ def test_for_operative_part(): celex:str choice=random.randint(0,len(celex_store)) celex=celex_store[choice] - if test_for_operative_part(celex): - assert True + try: + if test_for_operative_part(celex): + assert True except Exception: assert False From b75fe86a7d68ec1caf33c05d5d551f29f541ddab Mon Sep 17 00:00:00 2001 From: shashankmc Date: Wed, 17 Apr 2024 14:04:00 +0200 Subject: [PATCH 38/78] Updated variable name from current to current_dir --- cellar/cellar_extractor/operative_extractions.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/cellar/cellar_extractor/operative_extractions.py b/cellar/cellar_extractor/operative_extractions.py index 4c46124..44c3942 100644 --- a/cellar/cellar_extractor/operative_extractions.py +++ b/cellar/cellar_extractor/operative_extractions.py @@ -231,7 
+231,6 @@ def structure_nine(self) -> list: if "on those grounds" in divs.text.lower(): b = divs.find_all_next('b') for bolds in b: - # print(bolds.text) nine.append(bolds.text) return nine @@ -308,16 +307,15 @@ class Writing(): This class has different methods, for the purpose of writing the operative part into different file formats.(Csv,txt,json) """ - instance: str x: str parameter: str - txt = "txt" - json="json" - csv="csv" - txt_dir = os.path.join(current, txt) - csv_dir = os.path.join(current, csv) - json_dir = os.path.join(current, json) + + current_dir = os.getcwd() + + txt_dir = os.path.join(current_dir, "txt") + csv_dir = os.path.join(current_dir, "csv") + json_dir = os.path.join(current_dir, "json") if not os.path.exists(txt_dir): os.makedirs(txt_dir) @@ -351,7 +349,6 @@ def to_json(self): _file.close() def to_txt(self): - if self.x is not None: _file = open(f"txt/{self.celex}.txt", "a", encoding="utf-8") for w in self.x: From 2537752bbc1b9f6ef384c95cc26f5b160a200cdb Mon Sep 17 00:00:00 2001 From: shashankmc Date: Wed, 17 Apr 2024 14:04:40 +0200 Subject: [PATCH 39/78] Removed conditional statements and updated assert False statements --- tests.py | 77 +++++++++++++++++++++----------------------------------- 1 file changed, 29 insertions(+), 48 deletions(-) diff --git a/tests.py b/tests.py index 7296857..f19d69b 100644 --- a/tests.py +++ b/tests.py @@ -73,55 +73,37 @@ def test_cellar_json_n(): except Exception: assert False, "Downloading cellar as json failed." 
- - - - - def operative_part_csv(celex)->csv: - csv_store=Writing(celex) - csv_store.to_csv() try: - if csv_store.to_csv(): - assert True + csv_store.to_csv() + assert True except Exception: - assert False + assert False, "Downloading and storing as csv failed for operative part" + def operative_part_json(celex)->json: json_store=Writing(celex) - json_store.to_json() try: - if json_store.to_json(): - assert True + json_store.to_json() + assert True except Exception: - assert False + assert False, "Downloading and storing as json failed for operative part" def operative_part_txt(celex): txt_store=Writing(celex) - txt_store.to_txt() try: - if txt_store.to_txt(): - assert True + txt_store.to_txt() + assert True except Exception: - assert False + assert False, "Downloading and storing as txt failed for operative part" def for_operative_part(celex): - - count_fail:int - count_pass=0 - for id in celex_store: - - - - try: - test_output=Analyzer(celex) - test_instance=test_output() - assert True - - - except Exception: - assert False - + try: + test_output=Analyzer(id) + test_output() + assert True + except Exception: + assert False, "Cannot extract for celex" def test_operative_part_txt(): celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] @@ -129,30 +111,31 @@ def test_operative_part_txt(): choice=random.randint(0,len(celex_store)) celex=celex_store[choice] try: - if operative_part_txt(celex): - assert True + operative_part_txt(celex) + assert True except Exception: - assert False + assert False, "Cannot extract operative text" + def test_operative_part_json(): celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] celex:str choice=random.randint(0,len(celex_store)) celex=celex_store[choice] try: - if operative_part_json(celex): - assert True + operative_part_json(celex) + assert True except Exception: - assert False + assert False, "Cannot extract 
operative text" def test_operative_part_csv(): celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] celex:str choice=random.randint(0,len(celex_store)) celex=celex_store[choice] try: - if operative_part_csv(celex): - assert True + operative_part_csv(celex): + assert True except Exception: - assert False + assert False, "Cannot extract operative text" def test_for_operative_part(): celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] @@ -160,10 +143,8 @@ def test_for_operative_part(): choice=random.randint(0,len(celex_store)) celex=celex_store[choice] try: - if test_for_operative_part(celex): - assert True + test_for_operative_part(celex) + assert True except Exception: - assert False - - + assert False, "Cannot extract operative part" From f82a23f143da0db27ab1330e26cb8b291a2acdb3 Mon Sep 17 00:00:00 2001 From: shashankmc Date: Wed, 17 Apr 2024 14:06:25 +0200 Subject: [PATCH 40/78] Removed unnecessary colon --- tests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests.py b/tests.py index f19d69b..01548e0 100644 --- a/tests.py +++ b/tests.py @@ -132,7 +132,7 @@ def test_operative_part_csv(): choice=random.randint(0,len(celex_store)) celex=celex_store[choice] try: - operative_part_csv(celex): + operative_part_csv(celex) assert True except Exception: assert False, "Cannot extract operative text" @@ -146,5 +146,5 @@ def test_for_operative_part(): test_for_operative_part(celex) assert True except Exception: - assert False, "Cannot extract operative part" - + assert False, "Cannot extract operative part" + \ No newline at end of file From 106675f940f2ab7a9d852c5ff6479da9a7608177 Mon Sep 17 00:00:00 2001 From: shashankmc Date: Wed, 17 Apr 2024 14:56:49 +0200 Subject: [PATCH 41/78] Removed duplicate __init__ method, reordered import libraries --- cellar/cellar_extractor/operative_extractions.py | 6 +----- 1 file changed, 1 
insertion(+), 5 deletions(-) diff --git a/cellar/cellar_extractor/operative_extractions.py b/cellar/cellar_extractor/operative_extractions.py index 44c3942..b018d9b 100644 --- a/cellar/cellar_extractor/operative_extractions.py +++ b/cellar/cellar_extractor/operative_extractions.py @@ -1,7 +1,7 @@ import csv import json -import requests import os +import requests from bs4 import BeautifulSoup class Analyzer(): @@ -330,10 +330,6 @@ def __init__(self, celex: str): self.instance = Analyzer(self.celex) self.x = self.instance() - def __init__(self, celex: str): - self.celex = celex - self.instance = Analyzer(self.celex) - self.x = self.instance() def to_csv(self): _file = open("output.csv", "a+", encoding="utf-8") From 0dfd3860c1fed94cf0bd84711a3ffd0c3b86ae06 Mon Sep 17 00:00:00 2001 From: shashankmc Date: Wed, 17 Apr 2024 14:57:52 +0200 Subject: [PATCH 42/78] Corrected methods being called, extra import from operative_extractions --- tests.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests.py b/tests.py index 01548e0..3557110 100644 --- a/tests.py +++ b/tests.py @@ -1,7 +1,9 @@ -from cellar_extractor import * -import random import csv -import json +import json +import random +from cellar_extractor import * +from cellar_extractor.operative_extractions import * + def cellar_csv_n(): get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) @@ -73,7 +75,7 @@ def test_cellar_json_n(): except Exception: assert False, "Downloading cellar as json failed." 
-def operative_part_csv(celex)->csv: +def operative_part_csv(celex): csv_store=Writing(celex) try: csv_store.to_csv() @@ -81,7 +83,7 @@ def operative_part_csv(celex)->csv: except Exception: assert False, "Downloading and storing as csv failed for operative part" -def operative_part_json(celex)->json: +def operative_part_json(celex): json_store=Writing(celex) try: json_store.to_json() @@ -99,7 +101,7 @@ def operative_part_txt(celex): def for_operative_part(celex): try: - test_output=Analyzer(id) + test_output=Analyzer(celex) test_output() assert True except Exception: @@ -126,6 +128,7 @@ def test_operative_part_json(): assert True except Exception: assert False, "Cannot extract operative text" + def test_operative_part_csv(): celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] celex:str @@ -143,7 +146,7 @@ def test_for_operative_part(): choice=random.randint(0,len(celex_store)) celex=celex_store[choice] try: - test_for_operative_part(celex) + for_operative_part(celex) assert True except Exception: assert False, "Cannot extract operative part" From 55b117841065454c09c3c35dfb8e0575987cde5b Mon Sep 17 00:00:00 2001 From: shashankmc Date: Wed, 17 Apr 2024 16:03:56 +0200 Subject: [PATCH 43/78] Changed from enumerate to range and len --- cellar/cellar_extractor/operative_extractions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cellar/cellar_extractor/operative_extractions.py b/cellar/cellar_extractor/operative_extractions.py index b018d9b..bd894cb 100644 --- a/cellar/cellar_extractor/operative_extractions.py +++ b/cellar/cellar_extractor/operative_extractions.py @@ -16,6 +16,7 @@ def __init__(self, celex): self.celex = celex self.url = "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A\ {self.celex}&from=EN" + def html_page_structure_one(self) -> list: """ This function retreives operative part from documents of the respected celex id's. 
@@ -295,13 +296,14 @@ def __call__(self) -> list: self.structure_nine(), self.structure_ten(), self.structure_eleven()] one: list - for funcs in enumerate(container): + for funcs in range(len(container)): one = container[funcs] if one: if (len(one) != 0 or one[0] != "\n"): print("here") return one return None + class Writing(): """ This class has different methods, for the purpose of writing the operative part From abcd5efd3ff0ac1c97867e91cc0a7140f8b54423 Mon Sep 17 00:00:00 2001 From: shashankmc Date: Wed, 17 Apr 2024 16:08:25 +0200 Subject: [PATCH 44/78] Updated setup.py file for finding operative_extractions --- cellar/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cellar/setup.py b/cellar/setup.py index eec4dce..adfab07 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -9,7 +9,7 @@ setup( name='cellar_extractor', - packages=find_packages(include=['cellar_extractor']), + packages=find_packages(include=['cellar_extractor', 'cellar_extractor.operative_extractions']), version='1.0.61', description='Library for extracting cellar data', author='LawTech Lab', From 25cdcf1ec0b4ea7af44fbad3f9104f068f7aba05 Mon Sep 17 00:00:00 2001 From: shashankmc Date: Wed, 17 Apr 2024 16:11:12 +0200 Subject: [PATCH 45/78] Correcting path to include everything under cellar_extractor in setup.py file --- cellar/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cellar/setup.py b/cellar/setup.py index adfab07..e20f64f 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -9,7 +9,7 @@ setup( name='cellar_extractor', - packages=find_packages(include=['cellar_extractor', 'cellar_extractor.operative_extractions']), + packages=find_packages(include=['cellar_extractor.*']), version='1.0.61', description='Library for extracting cellar data', author='LawTech Lab', From cd6e55e3949510a987e8bd2c896dbeead9cdf05b Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 17 Apr 2024 17:01:49 +0200 Subject: 
[PATCH 46/78] F string changes and url changes for eurex website --- .../cellar_extractor/operative_extractions.py | 36 +++++++++---------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/cellar/cellar_extractor/operative_extractions.py b/cellar/cellar_extractor/operative_extractions.py index bd894cb..c2edad4 100644 --- a/cellar/cellar_extractor/operative_extractions.py +++ b/cellar/cellar_extractor/operative_extractions.py @@ -9,13 +9,13 @@ class Analyzer(): This class returns a list of the operative part for a given celex id. Celex id is initialized through a constructor. """ - celex: str # declare celex as a string - url: str # declare url as a string + celex: str="" # declare celex as a string + # declare url as a string + url:str="" def __init__(self, celex): # Initialize Celex id as a constructor, passed when calling the class self.celex = celex - self.url = "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A\ - {self.celex}&from=EN" + self.url = f"https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX%3A{self.celex}&from=EN" def html_page_structure_one(self) -> list: """ @@ -23,7 +23,7 @@ def html_page_structure_one(self) -> list: This function scrapes/parse the operative part from a nested table structure . The relevant text lies inside the coj-bold class of the span tag. """ - website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text + website = requests.get(self.url, timeout=60).text parser = BeautifulSoup(website, 'lxml') div = parser.find_all('table') # Find all tables tag from the website one = [] @@ -49,7 +49,7 @@ def html_page_structure_two(self) -> list: (p) structure . The relevant text lies inside the normal class of the p tag which comes after the keyword operative of the previous span tag. 
""" - website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text + website = requests.get(self.url, timeout=60).text parser = BeautifulSoup(website, 'lxml') p = parser.find_all('p') two = [] @@ -68,7 +68,7 @@ def structure_three(self) -> list: This function scrapes/parse the operative part from a nested table structure. The relevant text lies inside the coj-bold class of the span tag. """ - website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text + website = requests.get(self.url, timeout=60).text parser = BeautifulSoup(website, 'lxml') table = parser.find_all('table') three = [] @@ -91,7 +91,7 @@ def structure_four(self) -> list: (p) structure . The relevant text lies inside the p tag which comes after the keyword operative of the previous span tag. """ - website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text + website = requests.get(self.url, timeout=60).text parser = BeautifulSoup(website, 'lxml') p = parser.find_all('p') four = [] @@ -115,7 +115,7 @@ def structure_five(self) -> list: (p) structure. The relevant text lies inside the normal class of the p tag which comes after the keyword operative of the previous span tag. """ - website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text + website = requests.get(self.url, timeout=60).text parser = BeautifulSoup(website, 'lxml') p = parser.find_all('p') five = [] @@ -141,7 +141,7 @@ def structure_six(self) -> list: The relevant text lies inside thee p tag which comes after the keyword operative part of the respective h2 tag. """ - website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text + website = requests.get(self.url, timeout=60).text parser = BeautifulSoup(website, 'lxml') div = parser.find_all('h2') six = [] @@ -161,7 +161,7 @@ def structure_seven(self) -> list: (table) structure. 
The relevant text lies inside the span tag which comes after the p tag , with the class name=normal. """ - website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text + website = requests.get(self.url, timeout=60).text parser = BeautifulSoup(website, 'lxml') div = parser.find_all('table') seven = [] @@ -196,7 +196,7 @@ def structure_eight(self) -> list: The text is extracted from the span tag nested inside the tbody tag.Returns a list as output. """ - website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text + website = requests.get(self.url, timeout=60).text parser = BeautifulSoup(website, 'lxml') tbody = parser.find_all('tbody') @@ -223,7 +223,7 @@ def structure_nine(self) -> list: The operative part is under the bold(b) tag after the p tag where the keywords "on those grounds" exist. """ - website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text + website = requests.get(self.url, timeout=60).text parser = BeautifulSoup(website, 'lxml') nine = [] div = parser.find_all('p') @@ -241,7 +241,7 @@ def structure_eleven(self) -> list: The operative part is under the paragraph(p) tag after the b tag where the keywords "operative part" exist. """ - website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text + website = requests.get(self.url, timeout=60).text parser = BeautifulSoup(website, 'lxml') bold = parser.find_all('b') @@ -264,7 +264,7 @@ def structure_ten(self): operative part is scraped after the occurence of the phrase "On those grounds". 
""" - website = requests.get(self.url.replace("{self.celex}", self.celex), timeout=60).text + website = requests.get(self.url, timeout=60).text parser = BeautifulSoup(website, 'lxml') appender = [] for string in parser.stripped_strings: @@ -300,10 +300,9 @@ def __call__(self) -> list: one = container[funcs] if one: if (len(one) != 0 or one[0] != "\n"): - print("here") + return one - return None - + class Writing(): """ This class has different methods, for the purpose of writing the operative part @@ -352,4 +351,3 @@ def to_txt(self): for w in self.x: _file.write(w+"\n") _file.close() - From d22849ee5bb05a50dd00b5a2e74628866363591b Mon Sep 17 00:00:00 2001 From: Shashank Date: Wed, 17 Apr 2024 17:13:20 +0200 Subject: [PATCH 47/78] Removed import cellar_extractor.operative_extractions --- tests.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests.py b/tests.py index 3557110..ce25a4d 100644 --- a/tests.py +++ b/tests.py @@ -2,7 +2,6 @@ import json import random from cellar_extractor import * -from cellar_extractor.operative_extractions import * def cellar_csv_n(): get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) @@ -150,4 +149,4 @@ def test_for_operative_part(): assert True except Exception: assert False, "Cannot extract operative part" - \ No newline at end of file + From d89aaec241af5b4ad60c4b7f0f0f3c43270776d3 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 24 Apr 2024 10:00:48 +0200 Subject: [PATCH 48/78] os module for tests.py --- tests.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests.py b/tests.py index ce25a4d..54d80a9 100644 --- a/tests.py +++ b/tests.py @@ -1,7 +1,9 @@ -import csv -import json +import sys +import os +sys.path.append(os.path.dirname(os.path.abspath(__file__))) import random -from cellar_extractor import * +from cellar.cellar_extractor.cellar import * +from cellar.cellar_extractor.operative_extractions import * def 
cellar_csv_n(): get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) From 9aaf2e36ef005ab8a11883084b1e2ec8a114b67f Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 24 Apr 2024 10:02:00 +0200 Subject: [PATCH 49/78] Include json & csv directory for outputs --- cellar/cellar_extractor/operative_extractions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cellar/cellar_extractor/operative_extractions.py b/cellar/cellar_extractor/operative_extractions.py index c2edad4..2414a1a 100644 --- a/cellar/cellar_extractor/operative_extractions.py +++ b/cellar/cellar_extractor/operative_extractions.py @@ -333,7 +333,7 @@ def __init__(self, celex: str): def to_csv(self): - _file = open("output.csv", "a+", encoding="utf-8") + _file = open("csv/output.csv", "a+", encoding="utf-8") writer = csv.writer(_file) if self.x is not None: writer.writerow([self.celex, self.x]) @@ -341,7 +341,7 @@ def to_csv(self): def to_json(self): if self.x is not None: data = {'Celex': self.celex, "Operative part": self.x} - _file = open('data.json', 'a+', encoding='utf-8') + _file = open('json/data.json', 'a+', encoding='utf-8') json.dump(data, _file) _file.close() From e33cc9cbed405889d1a019ca7827832584f6b8c7 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 24 Apr 2024 10:15:15 +0200 Subject: [PATCH 50/78] os configuration to include cellar/ for tests.py --- tests.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests.py b/tests.py index 54d80a9..e2fd2b7 100644 --- a/tests.py +++ b/tests.py @@ -1,9 +1,10 @@ import sys import os -sys.path.append(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(os.path.join(os.path.dirname(__file__), 'cellar')) + import random -from cellar.cellar_extractor.cellar import * -from cellar.cellar_extractor.operative_extractions import * +from cellar_extractor.cellar import * +from 
cellar_extractor.operative_extractions import * def cellar_csv_n(): get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) From 483c4199c5bfc1bfb2c3d0253b58d24c063ed5a0 Mon Sep 17 00:00:00 2001 From: Shashank Date: Wed, 24 Apr 2024 10:31:55 +0200 Subject: [PATCH 51/78] Update tests.py Removed os and sys import. --- tests.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests.py b/tests.py index e2fd2b7..e9eb21f 100644 --- a/tests.py +++ b/tests.py @@ -1,10 +1,5 @@ -import sys -import os -sys.path.append(os.path.join(os.path.dirname(__file__), 'cellar')) - import random -from cellar_extractor.cellar import * -from cellar_extractor.operative_extractions import * +from cellar_extractor import * def cellar_csv_n(): get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) From f8887d7c0d1704cb606b262f59a9f6e32805ec47 Mon Sep 17 00:00:00 2001 From: Shashank Date: Wed, 24 Apr 2024 10:33:12 +0200 Subject: [PATCH 52/78] Update setup.py Extended include to include operative_extractions. --- cellar/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cellar/setup.py b/cellar/setup.py index e20f64f..50bab1b 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -9,7 +9,7 @@ setup( name='cellar_extractor', - packages=find_packages(include=['cellar_extractor.*']), + packages=find_packages(include=['cellar_extractor', 'cellar_extractor.operative_extractions']), version='1.0.61', description='Library for extracting cellar data', author='LawTech Lab', @@ -23,4 +23,4 @@ "Bug Tracker": "https://github.com/maastrichtlawtech/extraction_libraries", "Build Source": "https://github.com/maastrichtlawtech/extraction_libraries", }, -) \ No newline at end of file +) From 56f7845b5239561d50c687201286901a174e8a16 Mon Sep 17 00:00:00 2001 From: Shashank Date: Wed, 24 Apr 2024 10:37:15 +0200 Subject: [PATCH 53/78] Change to len(celex_store)-1 to avoid index out of range error. 
--- tests.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests.py b/tests.py index e9eb21f..35000e9 100644 --- a/tests.py +++ b/tests.py @@ -107,7 +107,7 @@ def for_operative_part(celex): def test_operative_part_txt(): celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] celex:str - choice=random.randint(0,len(celex_store)) + choice=random.randint(0,len(celex_store)-1) celex=celex_store[choice] try: operative_part_txt(celex) @@ -118,7 +118,7 @@ def test_operative_part_txt(): def test_operative_part_json(): celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] celex:str - choice=random.randint(0,len(celex_store)) + choice=random.randint(0,len(celex_store)-1) celex=celex_store[choice] try: operative_part_json(celex) @@ -129,7 +129,7 @@ def test_operative_part_json(): def test_operative_part_csv(): celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] celex:str - choice=random.randint(0,len(celex_store)) + choice=random.randint(0,len(celex_store)-1) celex=celex_store[choice] try: operative_part_csv(celex) @@ -140,7 +140,7 @@ def test_operative_part_csv(): def test_for_operative_part(): celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] celex:str - choice=random.randint(0,len(celex_store)) + choice=random.randint(0,len(celex_store)-1) celex=celex_store[choice] try: for_operative_part(celex) From 19fccdc7261a87473e9cf595913d78c3e63a88b7 Mon Sep 17 00:00:00 2001 From: Shashank Date: Wed, 24 Apr 2024 10:42:15 +0200 Subject: [PATCH 54/78] Added additional import to test.py file --- tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests.py b/tests.py index 35000e9..c5ddfe3 100644 --- a/tests.py +++ b/tests.py @@ -1,5 +1,6 @@ import random from cellar_extractor import * +from 
cellar_extractor.operative_extractions import Analyzer, Writing def cellar_csv_n(): get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) From d57f01543929e0238ba6c25ef0f19180a8180b5c Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 09:42:15 +0200 Subject: [PATCH 55/78] Create __init__.py --- cellar/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 cellar/__init__.py diff --git a/cellar/__init__.py b/cellar/__init__.py new file mode 100644 index 0000000..0901c57 --- /dev/null +++ b/cellar/__init__.py @@ -0,0 +1 @@ +from cellar_extractor import * From 8045e9c0f856944d9ec2f52f83edcc1b4e717946 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 09:43:02 +0200 Subject: [PATCH 56/78] Shorten imports after adding __init__.py to cellar folder --- tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests.py b/tests.py index c5ddfe3..d89cec3 100644 --- a/tests.py +++ b/tests.py @@ -1,6 +1,6 @@ import random -from cellar_extractor import * -from cellar_extractor.operative_extractions import Analyzer, Writing +from cellar import * + def cellar_csv_n(): get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) From 26d305d328fed1acaba784ff31c0d0935693d9ba Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 09:50:51 +0200 Subject: [PATCH 57/78] Add extra import command --- tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests.py b/tests.py index d89cec3..13118e9 100644 --- a/tests.py +++ b/tests.py @@ -1,6 +1,6 @@ import random from cellar import * - +from cellar.operative_extractions import * def cellar_csv_n(): get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) From 53343819cd92610abe285ec9470acdd1a46f11e9 Mon Sep 17 00:00:00 2001 From: venvis 
<127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 09:52:03 +0200 Subject: [PATCH 58/78] Update tests.py --- tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests.py b/tests.py index 13118e9..36c1f38 100644 --- a/tests.py +++ b/tests.py @@ -1,6 +1,6 @@ import random from cellar import * -from cellar.operative_extractions import * +from cellar import Analyzer , Writing def cellar_csv_n(): get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) From e56d32876b0cf438d030db836df6aa1c226a2f27 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 09:54:07 +0200 Subject: [PATCH 59/78] Update __init__.py --- cellar/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cellar/__init__.py b/cellar/__init__.py index 0901c57..cb1893b 100644 --- a/cellar/__init__.py +++ b/cellar/__init__.py @@ -1 +1,2 @@ from cellar_extractor import * +from cellar_extractor.operative_extractions import Analyzer , Writing From dc9aba3cc81d1b0eba85dd8ead8ae8433cc295b6 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 09:55:24 +0200 Subject: [PATCH 60/78] Update __init__.py --- cellar/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cellar/__init__.py b/cellar/__init__.py index cb1893b..8e13fda 100644 --- a/cellar/__init__.py +++ b/cellar/__init__.py @@ -1,2 +1,2 @@ from cellar_extractor import * -from cellar_extractor.operative_extractions import Analyzer , Writing +from cellar_extractor import Analyzer , Writing From 1ca331be747978fa0742f15ebd7099520cbb3fbf Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 09:56:51 +0200 Subject: [PATCH 61/78] Update __init__.py --- cellar/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cellar/__init__.py b/cellar/__init__.py index 8e13fda..09fe356 100644 --- a/cellar/__init__.py 
+++ b/cellar/__init__.py @@ -1,2 +1,2 @@ from cellar_extractor import * -from cellar_extractor import Analyzer , Writing + From 75b2978ff84b6cd446841f6701cbfc1636e2dfd3 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 09:57:07 +0200 Subject: [PATCH 62/78] Update tests.py --- tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests.py b/tests.py index 36c1f38..d89cec3 100644 --- a/tests.py +++ b/tests.py @@ -1,6 +1,6 @@ import random from cellar import * -from cellar import Analyzer , Writing + def cellar_csv_n(): get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) From e4c72364d5834a2b81d4b5ee526de68767783a54 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 10:01:45 +0200 Subject: [PATCH 63/78] Update __init__.py --- cellar/cellar_extractor/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cellar/cellar_extractor/__init__.py b/cellar/cellar_extractor/__init__.py index ab2ae68..d07093e 100644 --- a/cellar/cellar_extractor/__init__.py +++ b/cellar/cellar_extractor/__init__.py @@ -2,6 +2,7 @@ from cellar_extractor.cellar import get_cellar_extra from cellar_extractor.cellar import get_nodes_and_edges_lists from cellar_extractor.cellar import filter_subject_matter -from cellar_extractor.operative_extractions import Analyzer,Writing +from cellar_extractor.operative_extractions import Analyzer +from cellar_extractor.operative_extractions import Writing import logging logging.basicConfig(level=logging.INFO) From a7455eefee7b65eb6fd9de897e6e3630787e6938 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 10:09:13 +0200 Subject: [PATCH 64/78] Update tests.py --- tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests.py b/tests.py index d89cec3..cea29f8 100644 --- a/tests.py +++ b/tests.py @@ -1,5 +1,5 @@ import 
random -from cellar import * +from cellar_extractor import * def cellar_csv_n(): From 78b71406e61752a67e07a562914a94a7065b99a3 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 10:16:39 +0200 Subject: [PATCH 65/78] add pip install -e --- .github/workflows/github-actions.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index ea25b00..497ebc4 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -17,7 +17,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install cellar-extractor + pip install -e cellar/cellar_extractor # pip install echr-extractor - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner." - run: echo "🖥️ The workflow is now ready to test your code on the runner." From 8da60dcd359fc5bdf17de0d3b231bb39c4a6dfec Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 10:18:26 +0200 Subject: [PATCH 66/78] Update github-actions.yml --- .github/workflows/github-actions.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index 497ebc4..d92fc29 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -17,7 +17,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -e cellar/cellar_extractor + pip install -e cellar/* # pip install echr-extractor - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner." - run: echo "🖥️ The workflow is now ready to test your code on the runner." 
From 25caaf8c1d3baca810aae491b159800172de38ee Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 10:20:01 +0200 Subject: [PATCH 67/78] Add pip install -e cellar/ --- .github/workflows/github-actions.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index d92fc29..29bcc92 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -17,7 +17,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -e cellar/* + pip install -e cellar/ # pip install echr-extractor - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner." - run: echo "🖥️ The workflow is now ready to test your code on the runner." From c7bac514234f0090911851513db4ab5d2329d3ef Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 10:36:00 +0200 Subject: [PATCH 68/78] Update README.md --- cellar/README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cellar/README.md b/cellar/README.md index acb6b80..2219773 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -37,6 +37,13 @@ Python 3.9 gijsvd + + + venvis +
+ venvis +
+ From 754805448c3ca2077b7c53a19da808b5dfccd0cc Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 11:24:39 +0200 Subject: [PATCH 69/78] Update README.md --- cellar/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cellar/README.md b/cellar/README.md index 2219773..fc88880 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -66,6 +66,10 @@ Python 3.9 Allows the creation of a network graph of the citations. Can only be returned in-memory.
  • filter_subject_matter
  • Returns a dataframe of cases only containing a certain phrase in the column containing the subject of cases. +
  • Analyzer
  • + A class whose instance(declaration) when called returns a list of the all the text contained within the operative part for each European Case law case(En-English only). +
  • Writing
  • + A class which writes the text for the operative part for each European Case law case(En-English only) into csv,json and txt files(Generated upon initialization).
    From f15870cd7d036f099deee0a673a759ce7b9c796c Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 11:27:40 +0200 Subject: [PATCH 70/78] Update README.md --- cellar/README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cellar/README.md b/cellar/README.md index fc88880..d4dddd5 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -69,7 +69,13 @@ Python 3.9
  • Analyzer
  • A class whose instance(declaration) when called returns a list of the all the text contained within the operative part for each European Case law case(En-English only).
  • Writing
  • - A class which writes the text for the operative part for each European Case law case(En-English only) into csv,json and txt files(Generated upon initialization). + A class which writes the text for the operative part for each European Case law case(En-English only) into csv,json and txt files(Generated upon initialization).
    + the Writing has three functions :
    +
      +
    • to_csv() - Writes the operative part along with celex id into a csv file
    • +
    • to_json() - Writes the operative part along with celex id into a json file
    • +
    • to_txt() - Writes the operative part along with celex id into a txt file
    • +

    From 15b7b04fe38a60895cb0feebf6613a428777a867 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 11:28:04 +0200 Subject: [PATCH 71/78] Update README.md --- cellar/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cellar/README.md b/cellar/README.md index d4dddd5..5a0177f 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -70,7 +70,7 @@ Python 3.9 A class whose instance(declaration) when called returns a list of the all the text contained within the operative part for each European Case law case(En-English only).
  • Writing
  • A class which writes the text for the operative part for each European Case law case(En-English only) into csv,json and txt files(Generated upon initialization).
    - the Writing has three functions :
    + the Writing class has three functions :

    • to_csv() - Writes the operative part along with celex id into a csv file
    • to_json() - Writes the operative part along with celex id into a json file
    • From 44bf4b84223a5884344922cabb47d25ef1a8d61f Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 11:31:57 +0200 Subject: [PATCH 72/78] Update README.md --- cellar/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cellar/README.md b/cellar/README.md index 5a0177f..c32c0c0 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -132,6 +132,15 @@ Python 3.9
    • phrase: string, required, default None
    • The phrase which has to be present in the subject matter of cases. Case insensitive.
    +
  • Analyzer
  • +
      +
    • celex: str, required
    • +
    +
  • Writing
  • +
      +
    • celex: str, required
    • +
    + From aa09b6f9e89902165b1980d9fd64c575cb09eaf1 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 11:35:02 +0200 Subject: [PATCH 73/78] Update README.md --- cellar/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cellar/README.md b/cellar/README.md index c32c0c0..e4baac2 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -134,11 +134,11 @@ Python 3.9
  • Analyzer
    • -
    • celex: str, required
    • +
    • celex id: str, required
  • Writing
    • -
    • celex: str, required
    • +
    • celex id: str, required
    From 908d5e6b9714c1e30efd18e0552257a0e935f28a Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 11:48:44 +0200 Subject: [PATCH 74/78] Update README.md --- cellar/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cellar/README.md b/cellar/README.md index e4baac2..b4f9d3b 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -135,10 +135,12 @@ Python 3.9
  • Analyzer
    • celex id: str, required
    • +
    • Pass as a constructor upon initializing the class
  • Writing
    • celex id: str, required
    • +
    • Pass as a constructor upon initializing the class
    From 3f00adbff2c651050d307c7915d0b235fd09a307 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 11:51:46 +0200 Subject: [PATCH 75/78] Add for Analyzer and Writing Classes --- cellar/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/cellar/README.md b/cellar/README.md index b4f9d3b..e54ab47 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -160,7 +160,22 @@ Below are examples for in-memory saving: df = cell.get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=1000) df,json = cell.get_cellar_extra(save_file='n', max_ecli=100, sd='2022-01-01', threads=10) ``` +```python +instance=Analyzer(celex_id:str) +output_list=instance() +print(output_list) +``` +

    Create a callback of the instance of the class initiated and pass a list as it's value.

    + +

    The Writing Class also takes a celex id , upon initializing the class , through the means of the constructor and writes the content of its operative part into different files , depending on the function called

    +```python +instance=Writing(celex_id:str) +output=instance.to_csv()#for csv +output=instance.to_txt()#for txt +output=instance.to_json()#for json + +``` ## License [![License: Apache 2.0](https://img.shields.io/github/license/maastrichtlawtech/extraction_libraries)](https://opensource.org/licenses/Apache-2.0) From c92fbef1abd7d119a7ec5aa49db64dccd679726d Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 11:53:19 +0200 Subject: [PATCH 76/78] Add code for Analyzer class --- cellar/README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cellar/README.md b/cellar/README.md index e54ab47..455ef5e 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -160,17 +160,21 @@ Below are examples for in-memory saving: df = cell.get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=1000) df,json = cell.get_cellar_extra(save_file='n', max_ecli=100, sd='2022-01-01', threads=10) ``` +

    Create a callback of the instance of the class initiated and pass a list as it's value.

    + ```python -instance=Analyzer(celex_id:str) +import cellar_extractor as cell +instance=cell.Analyzer(celex_id:str) output_list=instance() -print(output_list) +print(output_list) # prints operative part of the Case as a list ``` -

    Create a callback of the instance of the class initiated and pass a list as it's value.

    +

    The Writing class also takes a CELEX id through its constructor upon initialization, and writes the content of the case's operative part into different file formats, depending on the function called.

    ```python -instance=Writing(celex_id:str) +import cellar_extractor as cell +instance=cell.Writing(celex_id:str) output=instance.to_csv()#for csv output=instance.to_txt()#for txt output=instance.to_json()#for json From 868b14c922a3bfe952a1b6d985588ba2199e4394 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 13:01:27 +0200 Subject: [PATCH 77/78] Update README.md --- cellar/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cellar/README.md b/cellar/README.md index 455ef5e..8b69508 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -147,7 +147,7 @@ Python 3.9 ## Examples -``` +```python import cellar_extractor as cell Below are examples for in-file saving: From 636865188e96e0cb309bb9b9b79a2ad7641b3d81 Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Wed, 1 May 2024 13:51:34 +0200 Subject: [PATCH 78/78] Update README.md --- cellar/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cellar/README.md b/cellar/README.md index 8b69508..f5d9d64 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -67,7 +67,7 @@ Python 3.9
  • filter_subject_matter
  • Returns a dataframe containing only those cases whose subject-matter column contains a given phrase.
  • Analyzer
  • - A class whose instance(declaration) when called returns a list of the all the text contained within the operative part for each European Case law case(En-English only). + A class whose instance (declaration), when called, returns a list of all the text contained within the operative part of each Court of Justice of the European Union (CJEU, formerly known as the European Court of Justice (ECJ)) judgement (English only).
  • Writing
  • A class which writes the text of the operative part of each European case law case (English only) into CSV, JSON, and TXT files (generated upon initialization).
    the Writing class has three functions :