diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml
index ea25b00..29bcc92 100644
--- a/.github/workflows/github-actions.yml
+++ b/.github/workflows/github-actions.yml
@@ -17,7 +17,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install cellar-extractor
+ pip install -e cellar/
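+          # install the local cellar package in editable mode so CI tests the checked-out code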
# pip install echr-extractor
- run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner."
- run: echo "🖥️ The workflow is now ready to test your code on the runner."
diff --git a/.gitignore b/.gitignore
index aa30a5f..06a87e2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
-venv
+.venv*
.idea
data
rechtspraak/rechtspraak_extractor/tests/data
@@ -20,4 +20,6 @@ rechtspraak.zip
build.bat
echr_extractor-whl.zip
echr_extractor-whl
-echr_extractor.egg-info
\ No newline at end of file
+echr_extractor.egg-info
+
+.*DS_Store
\ No newline at end of file
diff --git a/cellar/README.md b/cellar/README.md
index acb6b80..f5d9d64 100644
--- a/cellar/README.md
+++ b/cellar/README.md
@@ -37,6 +37,13 @@ Python 3.9
gijsvd
+
+
+
+
+ venvis
+
+ |
@@ -59,6 +66,16 @@ Python 3.9
Allows the creation of a network graph of the citations. Can only be returned in-memory.
filter_subject_matter
Returns a dataframe of cases only containing a certain phrase in the column containing the subject of cases.
+ Analyzer
+    A class whose instance, when called, returns a list of all the text contained within the operative part of each Court of Justice of the European Union (CJEU, formerly known as the European Court of Justice (ECJ)) judgement (English only).
+ Writing
+    A class which writes the text of the operative part of each European case law case (English only) into csv, json and txt files (generated upon initialization).
+    The Writing
class has three functions :
+
+    to_csv()
+    - Writes the operative part along with celex id into a csv file
+    to_json()
+    - Writes the operative part along with celex id into a json file
+    to_txt()
+    - Writes the operative part along with celex id into a txt file
+
@@ -115,11 +132,22 @@ Python 3.9
phrase: string, required, default None
The phrase which has to be present in the subject matter of cases. Case insensitive.
+ Analyzer
+
+ - celex id: str, required
+ - Passed to the constructor when initializing the class
+
+ Writing
+
+ - celex id: str, required
+ - Passed to the constructor when initializing the class
+
+
## Examples
-```
+```python
import cellar_extractor as cell
Below are examples for in-file saving:
@@ -132,7 +160,26 @@ Below are examples for in-memory saving:
df = cell.get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=1000)
df,json = cell.get_cellar_extra(save_file='n', max_ecli=100, sd='2022-01-01', threads=10)
```
+Create an instance of the class and call it; the call returns the operative part as a list.
+
+```python
+import cellar_extractor as cell
+instance = cell.Analyzer(celex_id)  # celex_id: str
+output_list = instance()
+print(output_list)  # prints the operative part of the case as a list
+```
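+
+Internally, the Analyzer tries a series of known eur-lex HTML page layouts and returns
+the output of the first layout that yields any text, so a single call may issue several
+HTTP requests.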
+
+The Writing class also takes a celex id through its constructor upon initialization, and writes the content of the operative part into different files, depending on the function called:
+
+```python
+import cellar_extractor as cell
+instance = cell.Writing(celex_id)  # celex_id: str
+instance.to_csv()   # writes csv/output.csv
+instance.to_txt()   # writes txt/<celex id>.txt
+instance.to_json()  # writes json/data.json
+
+```
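+
+The output files land in the csv/, json/ and txt/ directories, which the Writing class
+creates automatically under the current working directory.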
## License
[![License: Apache 2.0](https://img.shields.io/github/license/maastrichtlawtech/extraction_libraries)](https://opensource.org/licenses/Apache-2.0)
diff --git a/cellar/__init__.py b/cellar/__init__.py
new file mode 100644
index 0000000..09fe356
--- /dev/null
+++ b/cellar/__init__.py
@@ -0,0 +1,2 @@
+from cellar_extractor import *
+
diff --git a/cellar/cellar_extractor/Testing_file.py b/cellar/cellar_extractor/Testing_file.py
deleted file mode 100644
index 412e4e8..0000000
--- a/cellar/cellar_extractor/Testing_file.py
+++ /dev/null
@@ -1,30 +0,0 @@
-"""
-
-This file is purely a testing file for trying out separate parts of code, testing if everything works and such.
-Can be also used to develop future code.
-
-
-
-"""
-
-from nodes_and_edges import get_nodes_and_edges
-from os.path import join
-from json_to_csv import read_csv
-import time
-from eurlex_scraping import *
-from cellar import *
-from sparql import *
-
-
-
-
-if __name__ == '__main__':
- celex = "62004CJ0292"
- site = get_entire_page(celex)
- text = get_full_text_from_html(site)
- cits = get_citations_with_extra_info(text)
- print(cits)
- data = get_cellar(sd='2023-01-01',max_ecli=100,save_file='n')
- d3 = filter_subject_matter(data, "prices")
- b=2
- pass
\ No newline at end of file
diff --git a/cellar/cellar_extractor/__init__.py b/cellar/cellar_extractor/__init__.py
index 39184aa..d07093e 100644
--- a/cellar/cellar_extractor/__init__.py
+++ b/cellar/cellar_extractor/__init__.py
@@ -2,5 +2,7 @@
from cellar_extractor.cellar import get_cellar_extra
from cellar_extractor.cellar import get_nodes_and_edges_lists
from cellar_extractor.cellar import filter_subject_matter
+from cellar_extractor.operative_extractions import Analyzer
+from cellar_extractor.operative_extractions import Writing
import logging
-logging.basicConfig(level=logging.INFO)
\ No newline at end of file
+logging.basicConfig(level=logging.INFO)
diff --git a/cellar/cellar_extractor/cellar.py b/cellar/cellar_extractor/cellar.py
index b108844..66b3ff5 100644
--- a/cellar/cellar_extractor/cellar.py
+++ b/cellar/cellar_extractor/cellar.py
@@ -11,7 +11,6 @@
from cellar_extractor.json_to_csv import json_to_csv_main, json_to_csv_returning
from cellar_extractor.nodes_and_edges import get_nodes_and_edges
-
def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_format='csv'):
if not ed:
ed = datetime.now().isoformat(timespec='seconds')
@@ -40,7 +39,7 @@ def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_forma
json_to_csv_main(all_eclis, file_path)
else:
file_path = os.path.join('data', file_name + '.json')
- with open(file_path, "w") as f:
+ with open(file_path, "w", encoding="utf-8") as f:
json.dump(all_eclis, f)
else:
if file_format == 'csv':
@@ -51,7 +50,8 @@ def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_forma
logging.info("\n--- DONE ---")
-def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", threads=10, username="", password=""):
+def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01",
+ threads=10, username="", password=""):
if not ed:
ed = datetime.now().isoformat(timespec='seconds')
data = get_cellar(ed=ed, save_file='n', max_ecli=max_ecli, sd=sd, file_format='csv')
@@ -64,15 +64,16 @@ def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", thre
file_path = os.path.join('data', file_name + '.csv')
if save_file == 'y':
Path('data').mkdir(parents=True, exist_ok=True)
- extra_cellar(data=data, filepath=file_path, threads=threads, username=username, password=password)
+ extra_cellar(data=data, filepath=file_path, threads=threads,
+ username=username, password=password)
logging.info("\n--- DONE ---")
else:
- data, json = extra_cellar(data=data, threads=threads, username=username, password=password)
+ data, json_data = extra_cellar(data=data, threads=threads,
+ username=username, password=password)
logging.info("\n--- DONE ---")
- return data, json
-
+        return data, json_data
+
def get_nodes_and_edges_lists(df=None, only_local=False):
if df is None:
diff --git a/cellar/cellar_extractor/cellar_extra_extract.py b/cellar/cellar_extractor/cellar_extra_extract.py
index 4b6c0eb..81c2764 100644
--- a/cellar/cellar_extractor/cellar_extra_extract.py
+++ b/cellar/cellar_extractor/cellar_extra_extract.py
@@ -4,17 +4,44 @@
def extra_cellar(data=None, filepath=None, threads=10, username="", password=""):
+ """
+ Extracts information from a cellar dataset.
+
+ Args:
+ data (pandas.DataFrame, optional): The input dataset. If not provided,
+ it will be read from the specified filepath.
+ filepath (str, optional): The path to the input dataset file. If provided,
+ the data will be read from this file.
+ threads (int, optional): The number of threads to use for parallel
+ processing. Default is 10.
+ username (str, optional): The username for accessing a separate
+ webservice. Default is an empty string.
+ password (str, optional): The password for accessing a separate
+ webservice. Default is an empty string.
+
+ Returns:
+ tuple: A tuple containing the modified dataset and a JSON object.
+
+ If `data` is not provided, the dataset will be read from the specified
+ `filepath`.
+
+ If `username` and `password` are provided, the function will add
+ citations using a separate webservice.
+
+ The function will add sections to the dataset using the specified
+ number of `threads`. If `filepath` is provided,
+ the modified dataset will be saved to the same file. Otherwise, the
+ modified dataset and a JSON object will be returned.
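+
+    Example (a minimal sketch, assuming a dataframe produced by get_cellar):
+
+        df = get_cellar(save_file='n', max_ecli=100, sd='2022-01-01')
+        df, json_data = extra_cellar(data=df, threads=10)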
+ """
if data is None:
data = read_csv(filepath)
if filepath:
if username !="" and password !="":
add_citations_separate_webservice(data, username, password)
- #print("Citations successfully added. The rest of additional extraction will now happen.")
add_sections(data, threads, filepath.replace(".csv", "_fulltext.json"))
data.to_csv(filepath, index=False)
else:
if username != "" and password != "":
add_citations_separate_webservice(data, username, password)
- #print("Citations successfully added. The rest of additional extraction will now happen.")
-        json = add_sections(data, threads)
-        return data, json
+        json_data = add_sections(data, threads)
+        return data, json_data
diff --git a/cellar/cellar_extractor/cellar_queries.py b/cellar/cellar_extractor/cellar_queries.py
index 6c74d16..b0f9e95 100644
--- a/cellar/cellar_extractor/cellar_queries.py
+++ b/cellar/cellar_extractor/cellar_queries.py
@@ -48,18 +48,23 @@ def get_all_eclis(starting_date=None, ending_date=None):
return eclis
-def get_raw_cellar_metadata(eclis, get_labels=True, force_readable_cols=True, force_readable_vals=False):
+def get_raw_cellar_metadata(eclis, get_labels=True, force_readable_cols=True,
+ force_readable_vals=False):
"""Gets cellar metadata
:param eclis: The ECLIs for which to retrieve metadata
:type eclis: list[str]
- :param get_labels: Flag to get human-readable labels for the properties, defaults to True
+ :param get_labels: Flag to get human-readable labels for the properties,
+ defaults to True
:type get_labels: bool, optional
- :param force_readable_cols: Flag to remove any non-labelled properties from the resulting dict, defaults to True
+ :param force_readable_cols: Flag to remove any non-labelled properties
+ from the resulting dict, defaults to True
:type force_readable_cols: bool, optional
- :param force_readable_vals: Flag to remove any non-labelled values from the resulting dict, defaults to False
+ :param force_readable_vals: Flag to remove any non-labelled values from
+ the resulting dict, defaults to False
:type force_readable_vals: bool, optional
- :return: Dictionary containing metadata. Top-level keys are ECLIs, second level are property names
+ :return: Dictionary containing metadata. Top-level keys are ECLIs, second
+ level are property names
:rtype: Dict[str, Dict[str, list[str]]]
"""
@@ -100,8 +105,8 @@ def get_raw_cellar_metadata(eclis, get_labels=True, force_readable_cols=True, fo
for ecli in eclis:
metadata[ecli] = {}
- # Take each triple, check which source doc it belongs to, key/value pair into its dict derived from the p and o in
- # the query
+ # Take each triple, check which source doc it belongs to, key/value pair
+ # into its dict derived from the p and o in the query
for res in ret['results']['bindings']:
ecli = res['ecli']['value']
# We only want cdm predicates
@@ -125,8 +130,9 @@ def get_raw_cellar_metadata(eclis, get_labels=True, force_readable_cols=True, fo
else:
val = res['o']['value']
- # We store the values for each property in a list. For some properties this is not necessary,
- # but if a property can be assigned multiple times, this is important. Notable, for example is citations.b
+ # We store the values for each property in a list. For some properties
+ # this is not necessary, but if a property can be assigned multiple
+        # times, this is important. A notable example is citations.
if key in metadata[ecli]:
metadata[ecli][key].append(val)
else:
diff --git a/cellar/cellar_extractor/citations_adder.py b/cellar/cellar_extractor/citations_adder.py
index ba47721..861d5cf 100644
--- a/cellar/cellar_extractor/citations_adder.py
+++ b/cellar/cellar_extractor/citations_adder.py
@@ -5,39 +5,39 @@
from io import StringIO
from os.path import dirname, abspath
import pandas as pd
-from cellar_extractor.sparql import get_citations_csv, get_cited, get_citing, run_eurlex_webservice_query
+from cellar_extractor.sparql import (get_citations_csv, get_cited,
+ get_citing, run_eurlex_webservice_query)
from cellar_extractor.eurlex_scraping import extract_dictionary_from_webservice_query
from tqdm import tqdm
sys.path.append(dirname(dirname(dirname(dirname(abspath(__file__))))))
-"""
-Method used by separate threads for the multi-threading method of adding citations to the dataframe
-Sends a query which returns a csv file containing the the celex identifiers of cited works for each case.
-Works with multi-case queries, at_once is the variable deciding for how many cases are used with each query.
-"""
-
def execute_citations(csv_list, citations):
+ """
+ Method used by separate threads for the multi-threading method of adding
+ citations to the dataframe. Sends a query which returns a csv file
+    containing the celex identifiers of cited works for each case. Works
+    with multi-case queries; at_once decides how many cases are used in
+    each query.
+ """
at_once = 1000
for i in range(0, len(citations), at_once):
new_csv = get_citations_csv(citations[i:(i + at_once)])
csv_list.append(StringIO(new_csv))
-"""
-This method replaces replaces the column with citations.
-
-Old column -> links to cited works
-New column -> celex identifiers of cited works
-
-It uses multithreading, which is very much recommended.
-Uses a query to get the citations in a csv format from the endpoint. *
+def add_citations(data, threads):
+ """
+    This method replaces the column with citations.
-* More details in the query method.
-"""
+ Old column -> links to cited works
+ New column -> celex identifiers of cited works
+ It uses multithreading, which is very much recommended.
+ Uses a query to get the citations in a csv format from the endpoint. *
-def add_citations(data, threads):
+ * More details in the query method.
+ """
name = "WORK CITES WORK. CI / CJ"
celex = data.loc[:, "CELEX IDENTIFIER"]
@@ -68,15 +68,14 @@ def add_citations(data, threads):
citations.sort_index(inplace=True)
data.insert(1, name, citations)
-
-"""
-Method used by separate threads for the multi-threading method of adding citations to the dataframe
-Sends a query which returns a csv file containing the the celex identifiers of cited works for each case.
-Works with multi-case queries, at_once is the variable deciding for how many cases are used with each query.
-"""
-
-
def execute_citations_separate(cited_list, citing_list, citations):
+ """
+ Method used by separate threads for the multi-threading method of
+ adding citations to the dataframe. Sends a query which returns a csv
+    file containing the celex identifiers of cited works for each case.
+    Works with multi-case queries; at_once decides how many cases are
+    used in each query.
+ """
at_once = 1000
for i in range(0, len(citations), at_once):
new_cited = get_cited(citations[i:(i + at_once)], 1)
@@ -84,17 +83,12 @@ def execute_citations_separate(cited_list, citing_list, citations):
cited_list.append(StringIO(new_cited))
citing_list.append(StringIO(new_citing))
-
-"""
-
-Method used by separate threads for the multi-threading method of adding citations to the dataframe
-Uses the eurlex webservices.
-Also used for the single-thread approach.
-
-"""
-
-
def execute_citations_webservice(dictionary_list, celexes, username, password):
+ """
+ Method used by separate threads for the multi-threading method of
+ adding citations to the dataframe. Uses the eurlex webservices.
+ Also used for the single-thread approach.
+ """
at_once = 100
success=0
retry=0
@@ -103,7 +97,8 @@ def execute_citations_webservice(dictionary_list, celexes, username, password):
normal_celex, contains_celex = clean_celex(celexes)
def process_queries(link, celex):
nonlocal success,retry
- for i in tqdm(range(0, len(celex), at_once), colour="GREEN", position=0, leave=True, maxinterval=10000):
+ for i in tqdm(range(0, len(celex), at_once), colour="GREEN",
+ position=0, leave=True, maxinterval=10000):
curr_celex = celex[i:(i + at_once)]
input=" OR ".join(curr_celex)
query = link % (str(input))
@@ -112,7 +107,8 @@ def process_queries(link, celex):
response = run_eurlex_webservice_query(query, username, password)
if response.status_code == 500 and "WS_WS_CALLS_IDLE_INTERVAL" not in response.text:
perc=i*100/len(celexes)
- logging.info(f"Limit of web service usage reached! Citations collection will stop here at {perc} % of citations downloaded." +
+                    logging.info(f"Limit of web service usage reached! Citations collection "
+                                 f"will stop here at {perc} % of citations downloaded." +
f"\nThere were {success} successful queries and {retry} retries")
return
elif "0" in response.text:
@@ -134,16 +130,13 @@ def process_queries(link, celex):
process_queries(base_contains_query,contains_celex)
-"""
-
-Method used to separate celex id's when there are multiple pointing to the same document.
-On top of that, separates celex id's with '(' and ')', these brackets are keywords for the webservice query.
-After separated, a different query is ran for the normal celexes, and those with brackets.
-
-"""
-
-
def clean_celex(celex):
+ """
+ Method used to separate celex id's when there are multiple pointing to the same document.
+ On top of that, separates celex id's with '(' and ')', these brackets are keywords for the
+    webservice query. Once separated, a different query is run for the normal celexes and for
+ those with brackets.
+ """
normal_list = list()
contains_list = list()
for c1 in celex:
@@ -163,16 +156,14 @@ def clean_celex(celex):
normal_list.append(c1)
return normal_list, contains_list
-
-"""
-
-Method used for creation of a dictionary of documents citing the document.
-Uses the dictionary of documents cited by the document.
-Output will more than likely be bigger than the input dictionary, as it will also include treaties and other documents,
-which are not being extracted by the cellar extractor.
-
-"""
def allowed_id(id):
+ """
+    Method checking whether a celex id belongs to a document type handled by
+    the cellar extractor (identifiers starting with 6 or 8).
+ """
if id != "":
-        return id[0] == 8 or id[0] == 6
+        return id[0] == "8" or id[0] == "6"
else:
@@ -190,17 +181,13 @@ def reverse_citing_dict(citing):
cited[c] = k
return cited
-
-"""
-
-Method used to add the dictionaries to the dataframe.
-Used by the citations adding from the eurlex webservices.
-Implements checks, for whether the document whose data we want to add exists in the original dataframe.
-
-"""
-
-
def add_dictionary_to_df(df, dictionary, column_title):
+ """
+ Method used to add the dictionaries to the dataframe.
+ Used by the citations adding from the eurlex webservices.
+ Implements checks, for whether the document whose data we want to add
+ exists in the original dataframe.
+ """
column = pd.Series([], dtype='string')
celex = df.loc[:, "CELEX IDENTIFIER"]
for k in dictionary:
@@ -211,29 +198,28 @@ def add_dictionary_to_df(df, dictionary, column_title):
df.insert(1, column_title, column)
-"""
-Main method for citations adding via eurlex webservices.
-
-Old column -> links to cited works
-New columns -> celex identifiers of cited works and works citing current work
-
-"""
-
-
def add_citations_separate_webservice(data, username, password):
+ """
+ Main method for citations adding via eurlex webservices.
+ Old column -> links to cited works
+ New columns -> celex identifiers of cited works and works citing current work
+ """
celex = data.loc[:, "CELEX IDENTIFIER"]
query = " SELECT CI, DN WHERE DN = 62019CJ0668"
response = run_eurlex_webservice_query(query, username, password)
if response.status_code == 500 :
if "WS_MAXIMUM_NB_OF_WS_CALLS" in response.text:
- logging.warning("Maximum number of calls to the eurlex webservices reached! The code will skip the citations download.")
+            logging.warning("Maximum number of calls to the eurlex webservices reached! "
+                            "The code will skip the citations download.")
return
else:
- logging.warning("Incorrect username and password for eurlex webservices! (The account login credentials and webservice) " +
+            logging.warning("Incorrect username and password for eurlex webservices! "
+                            "(The account login credentials and webservice " +
"login credentials are different)")
sys.exit(2)
elif response.status_code == 403:
- logging.info("Webservice connection was blocked, eurlex might be going through maintenance right now.")
+        logging.info("Webservice connection was blocked, eurlex might be going "
+                     "through maintenance right now.")
sys.exit(2)
else:
logging.info("Webservice connection was successful!")
@@ -244,27 +230,26 @@ def add_citations_separate_webservice(data, username, password):
for d in dictionary_list:
citing_dict.update(d)
logging.info("Webservice extraction finished, the rest of extraction will now happen.")
- time.sleep(1) # It seemed to print out the length of dictionary wrong, even when it was equal to 1000.
+ time.sleep(1) # It seemed to print out the length of dictionary wrong,
+ # even when it was equal to 1000.
cited_dict = reverse_citing_dict(citing_dict)
add_dictionary_to_df(data, citing_dict, "citing")
add_dictionary_to_df(data, cited_dict, "cited_by")
+def add_citations_separate(data, threads):
+ """
+    This method replaces the column with citations.
-"""
-This method replaces replaces the column with citations.
-
-Old column -> links to cited works
-New column -> celex identifiers of cited works
-
-It uses multithreading, which is very much recommended.
-Uses a query to get the citations in a csv format from the endpoint. *
+ Old column -> links to cited works
+ New column -> celex identifiers of cited works
-* More details in the query method.
-"""
+ It uses multithreading, which is very much recommended.
+ Uses a query to get the citations in a csv format from the endpoint. *
+ * More details in the query method.
+ """
-def add_citations_separate(data, threads):
celex = data.loc[:, "CELEX IDENTIFIER"]
length = celex.size
if length > 100: # to avoid getting problems with small files
@@ -277,7 +262,8 @@ def add_citations_separate(data, threads):
for i in range(0, length, at_once_threads):
curr_celex = celex[i:(i + at_once_threads)]
- t = threading.Thread(target=execute_citations_separate, args=(cited_csv, citing_csv, curr_celex))
+ t = threading.Thread(target=execute_citations_separate,
+ args=(cited_csv, citing_csv, curr_celex))
threads.append(t)
for t in threads:
diff --git a/cellar/cellar_extractor/csv_extractor.py b/cellar/cellar_extractor/csv_extractor.py
index d6a4527..6d0d5c0 100644
--- a/cellar/cellar_extractor/csv_extractor.py
+++ b/cellar/cellar_extractor/csv_extractor.py
@@ -3,12 +3,11 @@
import logging
from cellar_extractor.json_to_csv import read_csv
-"""
-Method takes in a dataframe and returns a dataframe with only *number* of data rows.
-"""
-
-
def extract_rows(data, number):
+ """
+ Method takes in a dataframe and returns a dataframe with only *number* of data rows.
+ """
+
try:
output = data[1:number]
except Exception:
diff --git a/cellar/cellar_extractor/eurlex_scraping.py b/cellar/cellar_extractor/eurlex_scraping.py
index 944097e..d825ccd 100644
--- a/cellar/cellar_extractor/eurlex_scraping.py
+++ b/cellar/cellar_extractor/eurlex_scraping.py
@@ -1,8 +1,9 @@
-from bs4 import BeautifulSoup
-import requests
import time
-import xmltodict
import re
+import requests
+import xmltodict
+
+from bs4 import BeautifulSoup
LINK_SUMMARY_INF = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:cIdHere&from=EN'
LINK_SUMJURE = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:cIdHere_SUM&from=EN'
@@ -17,31 +18,25 @@
def is_code(word):
return word.replace(".", "0").replace("-", "0")[1:].isdigit()
-
-"""
-Wrapped method for requests.get().
-After 10 retries, it gives up and returns a "404" string.
-"""
-
-
def response_wrapper(link, num=1):
+ """
+ Wrapped method for requests.get().
+ After 10 retries, it gives up and returns a "404" string.
+ """
if num == 10:
return "404"
try:
- response = requests.get(link)
+ response = requests.get(link, timeout=60)
return response
except Exception:
time.sleep(0.5 * num)
return response_wrapper(link, num + 1)
-
-"""
-This method returns the html of a summary page.
-Cellar specific, works for celex id's starting a 6 and 8.
-"""
-
-
def get_summary_html(celex):
+ """
+ This method returns the html of a summary page.
+    Cellar specific, works for celex id's starting with a 6 or an 8.
+ """
if celex == celex: # nan check
if ";" in celex:
idss = celex.split(";")
@@ -82,18 +77,16 @@ def get_summary_html(celex):
else:
return "No summary available"
-
-"""
-Method used to extract the summary from a html page.
-Cellar specific, uses get_words_from_keywords.
-Currently only walking for celex id's starting with a 6 ( EU cases).
-"""
-
-
def get_summary_from_html(html, starting):
+ """
+ Method used to extract the summary from a html page.
+ Cellar specific, uses get_words_from_keywords.
+    Currently only working for celex id's starting with a 6 (EU cases).
+
# This method turns the html code from the summary page into text
# It has different cases depending on the first character of the CELEX ID
# Should only be used for summaries extraction
+ """
text = get_full_text_from_html(html)
if starting == "8":
return "No summary available"
@@ -107,17 +100,14 @@ def get_summary_from_html(html, starting):
return text
return text
-
-"""
-Method used to extract the keywords from a html page.
-Cellar specific, uses get_words_from_keywords.
-"""
-
-
def get_keywords_from_html(html, starting):
+ """
+ Method used to extract the keywords from a html page.
+ Cellar specific, uses get_words_from_keywords.
# This method turns the html code from the summary page into text
# It has different cases depending on the first character of the CELEX ID
# Should only be used for summaries extraction
+ """
text = get_full_text_from_html(html)
if starting == "8":
text = "No keywords available"
@@ -125,22 +115,18 @@ def get_keywords_from_html(html, starting):
elif starting == "6":
return get_words_from_keywords(text)
-
-"""
-
-Method used for citations extraction from eurlex webservices.
-It reads the SOAP response from the webservices, and adds values to the dictionary based on the results.
-Dictionary is using the celex id of a work as key and a list of celex id's of works cited as value.
-
-"""
-
-
def extract_dictionary_from_webservice_query(response):
+ """
+ Method used for citations extraction from eurlex webservices.
+ It reads the SOAP response from the webservices, and adds values to the
+ dictionary based on the results. Dictionary is using the celex id of a
+ work as key and a list of celex id's of works cited as value.
+ """
text = response.text
read = xmltodict.parse(text)
results = read['S:Envelope']['S:Body']['searchResults']['result']
dictionary = dict()
- if type(results) == list:
+ if isinstance(results, list):
for result in results:
celex, citing = extract_citations_from_soap(result)
dictionary[celex] = citing
@@ -149,16 +135,11 @@ def extract_dictionary_from_webservice_query(response):
dictionary[celex] = citing
return dictionary
-
-"""
-
-Method used for citations extraction from eurlex webservices.
-Reads the individual celex id and documents cited from a single result.
-
-"""
-
-
def extract_citations_from_soap(results):
+ """
+ Method used for citations extraction from eurlex webservices.
+ Reads the individual celex id and documents cited from a single result.
+ """
main_content = results['content']['NOTICE']['WORK']
celex = main_content['ID_CELEX'].get('VALUE')
try:
@@ -166,7 +147,7 @@ def extract_citations_from_soap(results):
except KeyError:
return celex, ""
citing_list = list()
- if type(citing) == list:
+ if isinstance(citing, list):
for cited in citing:
celex_of_citation = get_citation_celex(cited)
if celex_of_citation != "":
@@ -175,20 +156,16 @@ def extract_citations_from_soap(results):
else:
return celex, get_citation_celex(citing)
-
-"""
-
-Method used for citations extraction from eurlex webservices.
-Goes thru all of the different id's of the document cited, and returns the one that is a celex id.
-
-"""
-
-
def get_citation_celex(cited):
+ """
+ Method used for citations extraction from eurlex webservices.
+    Goes through all the different id's of the document cited,
+ and returns the one that is a celex id.
+ """
identifiers = cited['SAMEAS']
- if type(identifiers) == list:
- for id in identifiers:
- ident = id['URI']['IDENTIFIER']
+ if isinstance(identifiers, list):
+ for _id in identifiers:
+ ident = _id['URI']['IDENTIFIER']
if is_celex_id(ident):
return ident
else:
@@ -197,31 +174,22 @@ def get_citation_celex(cited):
return ident
return ""
-
-"""
-
-Method checking if the id passed is a celex id, using regex.
-
-"""
-
-
-def is_celex_id(id):
- if id is None:
+def is_celex_id(_id):
+ """
+ Method checking if the id passed is a celex id, using regex.
+ """
+ if _id is None:
return False
- if prog.match(id):
+ if prog.match(_id):
return True
else:
return False
-
-"""
-This method tries to extract only they keywords from a part of html page containing it.
-They keywords on the page are always separated by " - " or other types of dashes.
-
-"""
-
-
def get_words_from_keywords_em(text):
+ """
+    This method tries to extract only the keywords from a part of the html page containing them.
+    The keywords on the page are always separated by " - " or other types of dashes.
+ """
lines = text.split(sep="\n")
returner = set()
for line in lines:
@@ -242,15 +210,10 @@ def get_words_from_keywords_em(text):
returner.update(line.split(sep=" - "))
return ";".join(returner)
-
-"""
-
-One of the methods used to extract keywords from summary text.
-
-"""
-
-
def get_words_from_keywords(text):
+ """
+ One of the methods used to extract keywords from summary text.
+ """
if "Keywords" in text:
try:
index = text.find("Keywords")
@@ -269,15 +232,12 @@ def get_words_from_keywords(text):
text = text[:index]
return get_words_from_keywords_em(text)
-
-"""
- This method turns the html code from the summary page into text.
- It has different cases depending on the first character of the CELEX ID.
- Universal method, also replaces all "," with "_".
-"""
-
-
def get_full_text_from_html(html_text):
+ """
+ This method turns the html code from the summary page into text.
+ It has different cases depending on the first character of the CELEX ID.
+ Universal method, also replaces all "," with "_".
+ """
# This method turns the html code from the summary page into text
# It has different cases depending on the first character of the CELEX ID
# Should only be used for summaries extraction
@@ -294,15 +254,12 @@ def get_full_text_from_html(html_text):
text = text.replace(",", "_")
return text
-
-"""
-This method is a wrapped for the get_html_by_celex_id method imported from eurlex.
-Sometimes thew websites do not load because of too many connections at once,
-this method waits a bit and tries again for up to 5 tries.
-"""
-
-
def get_html_text_by_celex_id(id):
+ """
+    This method is a wrapper for the get_html_by_celex_id method imported from eurlex.
+    Sometimes the websites do not load because of too many connections at once;
+    this method waits a bit and tries again, for up to 5 tries.
+ """
link = "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:cIdHere&from=EN"
final = id
if id == id: # nan check
@@ -321,14 +278,12 @@ def get_html_text_by_celex_id(id):
else:
return html.text
-
-"""
-This method gets the page containing all document details for extracting the subject matter and
-the case law directory codes. Uses the celex identifier of a case.
-"""
-
-
def get_entire_page(celex):
+ """
+ This method gets the page containing all document details for extracting
+    the subject matter and the case law directory codes.
+    Uses the celex identifier of a case.
+ """
link = 'https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:cIdHere'
if celex == celex: # nan check
if ";" in celex:
@@ -353,14 +308,11 @@ def get_entire_page(celex):
except Exception:
return "No data available"
-
-"""
-This Method gets the subject matter from a fragment of code containing them.
-Used for extracting subject matter for cellar cases only.
-"""
-
-
def get_subject(text):
+ """
+    This method gets the subject matter from a fragment of text containing it.
+ Used for extracting subject matter for cellar cases only.
+ """
try:
index_matter = text.index("Subject matter:")
try:
@@ -375,14 +327,11 @@ def get_subject(text):
subject = ""
return subject
-
-"""
-This Method extracts all eurovocs, from a fragment containing them.
-Used for extracting eurovoc for cellar cases.
-"""
-
-
def get_eurovoc(text):
+ """
+    This method extracts all eurovocs from a fragment containing them.
+ Used for extracting eurovoc for cellar cases.
+ """
try:
start = text.find("EUROVOC")
try:
@@ -408,14 +357,11 @@ def get_eurovoc(text):
except Exception:
return ""
-
-"""
-Method for getting all of the case directory codes for each cellar case.
-Extracts them from a string containing the eurlex website containing all document information.
-"""
-
-
def get_codes(text):
+ """
+ Method for getting all of the case directory codes for each cellar case.
+    Extracts them from the text of the eurlex page containing all document information.
+ """
try:
index_codes = text.index("Case law directory code:")
index_end = text.index("Miscellaneous information")
@@ -431,11 +377,8 @@ def get_codes(text):
index_start = indexes[x]
getting_ending = extracting[index_start:]
words_here = getting_ending.split()
-
for words in words_here:
-
if words is not words_here[0]:
-
if is_code(words):
ending = getting_ending[2:].find(words)
done = True
@@ -444,14 +387,12 @@ def get_codes(text):
code_text = getting_ending[:ending]
else:
code_text = getting_ending
-
codes_result.append(code_text.replace("\n", ""))
code = ";".join(codes_result)
except Exception:
code = ""
return code
-
def get_advocate_or_judge(text, phrase):
"""
:param text: full text of the info page of a case from eur-lex website
@@ -475,7 +416,8 @@ def get_advocate_or_judge(text, phrase):
def get_case_affecting(text):
"""
:param text: full text of the info page of a case from eur-lex website
- :return: The celex id's of case affecting listed + entire string data with more information about the case affecting
+ :return: The celex id's of case affecting listed + entire string data with
+ more information about the case affecting
"""
phrase = 'Case affecting:'
try:
@@ -523,6 +465,5 @@ def get_citations_with_extra_info(text):
data_list.append(fixed_line)
else:
return ";".join(data_list)
-
except:
- return ''
+ return ''
\ No newline at end of file
diff --git a/cellar/cellar_extractor/fulltext_saving.py b/cellar/cellar_extractor/fulltext_saving.py
index 8bb03c3..57f76b4 100644
--- a/cellar/cellar_extractor/fulltext_saving.py
+++ b/cellar/cellar_extractor/fulltext_saving.py
@@ -1,22 +1,22 @@
-import pandas as pd
+import json
import threading
+import time
+import pandas as pd
from cellar_extractor.eurlex_scraping import *
-import json
from tqdm import tqdm
-import time
-
-"""
-This is the method executed by individual threads by the add_sections method.
-
-The big dataset is divided in parts, each thread gets its portion of work to do.
-They add their portions of columns to corresponding lists,
-after all the threads are done the individual parts are put together.
-"""
-def execute_sections_threads(celex, eclis, start, list_sum, list_key, list_full, list_codes, list_eurovoc, list_adv,
- list_judge, list_affecting_id, list_affecting_str,list_citations_extra, progress_bar):
- sum = pd.Series([], dtype='string')
+def execute_sections_threads(celex, eclis, start, list_sum, list_key, list_full,
+ list_codes, list_eurovoc, list_adv, list_judge,
+ list_affecting_id, list_affecting_str,list_citations_extra,
+ progress_bar):
+ """
+ This is the method executed by individual threads by the add_sections method.
+ The big dataset is divided in parts, each thread gets its portion of work to do.
+ They add their portions of columns to corresponding lists,
+ after all the threads are done the individual parts are put together.
+ """
+ _sum = pd.Series([], dtype='string')
key = pd.Series([], dtype='string')
full = list()
case_codes = pd.Series([], dtype='string')
@@ -28,34 +28,34 @@ def execute_sections_threads(celex, eclis, start, list_sum, list_key, list_full,
citations_extra = pd.Series([], dtype='string')
for i in range(len(celex)):
j = start + i
- id = celex[j]
+ _id = celex[j]
ecli = eclis[j]
- html = get_html_text_by_celex_id(id)
+ html = get_html_text_by_celex_id(_id)
if html != "404":
text = get_full_text_from_html(html)
json_text = {
- 'celex': str(id),
+ 'celex': str(_id),
'ecli': ecli,
'text': text
}
full.append(json_text)
else:
json_text = {
- 'celex': str(id),
+ 'celex': str(_id),
'ecli': ecli,
'text': ""
}
full.append(json_text)
- summary = get_summary_html(id)
+ summary = get_summary_html(_id)
if summary != "No summary available":
-                text = get_keywords_from_html(summary, id[0])
-                text2 = get_summary_from_html(summary, id[0])
+                text = get_keywords_from_html(summary, _id[0])
+                text2 = get_summary_from_html(summary, _id[0])
key[j] = text
- sum[j] = text2
+ _sum[j] = text2
else:
key[j] = ""
- sum[j] = ""
- entire_page = get_entire_page(id)
+ _sum[j] = ""
+ entire_page = get_entire_page(_id)
text = get_full_text_from_html(entire_page)
if entire_page != "No data available":
code = get_codes(text)
@@ -82,7 +82,7 @@ def execute_sections_threads(celex, eclis, start, list_sum, list_key, list_full,
citations_extra[j] = citation_extra
progress_bar.update(1)
- list_sum.append(sum)
+ list_sum.append(_sum)
list_key.append(key)
list_full.append(full)
list_codes.append(case_codes)
@@ -93,51 +93,53 @@ def execute_sections_threads(celex, eclis, start, list_sum, list_key, list_full,
list_affecting_str.append(affecting_str)
list_citations_extra.append(citations_extra)
-"""
-This method adds the following sections to a pandas dataframe, as separate columns:
-
-Full Text
-Case law directory codes
-Keywords
-Summary
-Advocate General
-Judge Rapporteur
-Case affecting (CELEX ID)
-Case affecting string (entire str with more info)
-
-Method is cellar-specific, scraping html from https://eur-lex.europa.eu/homepage.html.
-It operates with multiple threads, using that feature is recommended as it speeds up the entire process.
-"""
+def add_sections(data, threads, json_filepath=None):
+ """
+ This method adds the following sections to a pandas dataframe, as separate columns:
+ Full Text
+ Case law directory codes
+ Keywords
+ Summary
+ Advocate General
+ Judge Rapporteur
+ Case affecting (CELEX ID)
+ Case affecting string (entire str with more info)
-def add_sections(data, threads, json_filepath=None):
+ Method is cellar-specific, scraping html from https://eur-lex.europa.eu/homepage.html.
+    It operates with multiple threads; using that feature is recommended, as it speeds up
+ the entire process.
+ """
celex = data.loc[:, 'CELEX IDENTIFIER']
eclis = data.loc[:, 'ECLI']
length = celex.size
time.sleep(1)
- bar = tqdm(total=length, colour="GREEN", miniters=int(length/100), position=0, leave=True, maxinterval=10000)
+ _bar = tqdm(total=length, colour="GREEN", miniters=int(length/100),
+ position=0, leave=True, maxinterval=10000)
if length > threads: # to avoid getting problems with small files
at_once_threads = int(length / threads)
else:
at_once_threads = length
threads = []
- list_sum = list()
- list_key = list()
- list_full = list()
- list_codes = list()
- list_eurovoc = list()
- list_adv = list()
- list_judge = list()
- list_affecting_id = list()
- list_affecting_str = list()
- list_citations_extra = list()
+ list_sum = []
+ list_key = []
+ list_full = []
+ list_codes = []
+ list_eurovoc = []
+ list_adv = []
+ list_judge = []
+ list_affecting_id = []
+ list_affecting_str = []
+ list_citations_extra = []
for i in range(0, length, at_once_threads):
curr_celex = celex[i:(i + at_once_threads)]
curr_ecli = eclis[i:(i + at_once_threads)]
t = threading.Thread(target=execute_sections_threads,
args=(
- curr_celex, curr_ecli, i, list_sum, list_key, list_full, list_codes, list_eurovoc,
- list_adv, list_judge, list_affecting_id, list_affecting_str,list_citations_extra, bar))
+ curr_celex, curr_ecli, i, list_sum, list_key, list_full,
+ list_codes, list_eurovoc, list_adv, list_judge,
+ list_affecting_id, list_affecting_str,
+ list_citations_extra, _bar))
threads.append(t)
for t in threads:
t.start()
@@ -164,13 +166,10 @@ def add_sections(data, threads, json_filepath=None):
json_file.extend(l)
return json_file
-
-"""
-Used for adding columns easier to a dataframe for add_sections().
-"""
-
-
def add_column_frow_list(data, name, list):
+ """
+    Helper used by add_sections() to add columns to a dataframe more easily.
+ """
column = pd.Series([], dtype='string')
for l in list:
column = pd.concat([column,l])
diff --git a/cellar/cellar_extractor/json_to_csv.py b/cellar/cellar_extractor/json_to_csv.py
index 2781ae2..7d757ee 100644
--- a/cellar/cellar_extractor/json_to_csv.py
+++ b/cellar/cellar_extractor/json_to_csv.py
@@ -1,41 +1,45 @@
import csv
import re
+import sys
import warnings
import logging
+from io import StringIO
from bs4 import BeautifulSoup
-import sys
import pandas as pd
-from io import StringIO
+
warnings.filterwarnings("ignore")
-X = ['WORK IS CREATED BY AGENT (AU)', 'CASE LAW COMMENTED BY AGENT', 'CASE LAW HAS A TYPE OF PROCEDURE',
- 'LEGAL RESOURCE USES ORIGINALLY LANGUAGE', 'CASE LAW USES LANGUAGE OF PROCEDURE',
- 'CASE LAW HAS A JUDICIAL PROCEDURE TYPE', 'WORK HAS RESOURCE TYPE', 'LEGAL RESOURCE BASED ON TREATY CONCEPT',
+X = ['WORK IS CREATED BY AGENT (AU)', 'CASE LAW COMMENTED BY AGENT',
+ 'CASE LAW HAS A TYPE OF PROCEDURE', 'LEGAL RESOURCE USES ORIGINALLY LANGUAGE',
+ 'CASE LAW USES LANGUAGE OF PROCEDURE', 'CASE LAW HAS A JUDICIAL PROCEDURE TYPE',
+ 'WORK HAS RESOURCE TYPE', 'LEGAL RESOURCE BASED ON TREATY CONCEPT',
'CASE LAW ORIGINATES IN COUNTRY OR USES A ROLE QUALIFIER', 'CASE LAW ORIGINATES IN COUNTRY',
- 'CASE LAW DELIVERED BY COURT FORMATION', 'LEGAL RESOURCE IS ABOUT SUBJECT MATTER', 'RELATED JOURNAL ARTICLE',
- 'CASE LAW DELIVERED BY ADVOCATE GENERAL', 'CASE LAW DELIVERED BY JUDGE', 'ECLI',
- 'CASE LAW INTERPRETS LEGAL RESOURCE', 'NATIONAL JUDGEMENT', 'DATE_CREATION_LEGACY', 'DATETIME NEGOTIATION',
- 'SEQUENCE OF VALUES', 'DATE OF REQUEST FOR AN OPINION', 'CELEX IDENTIFIER', 'SECTOR IDENTIFIER',
- 'NATURAL NUMBER (CELEX)', 'TYPE OF LEGAL RESOURCE', 'YEAR OF THE LEGAL RESOURCE', 'WORK CITES WORK. CI / CJ',
- 'LEGACY DATE OF CREATION OF WORK', 'DATE OF DOCUMENT', 'IDENTIFIER OF DOCUMENT', 'WORK VERSION',
+ 'CASE LAW DELIVERED BY COURT FORMATION', 'LEGAL RESOURCE IS ABOUT SUBJECT MATTER',
+ 'RELATED JOURNAL ARTICLE', 'CASE LAW DELIVERED BY ADVOCATE GENERAL',
+ 'CASE LAW DELIVERED BY JUDGE', 'ECLI', 'CASE LAW INTERPRETS LEGAL RESOURCE',
+ 'NATIONAL JUDGEMENT', 'DATE_CREATION_LEGACY', 'DATETIME NEGOTIATION',
+ 'SEQUENCE OF VALUES', 'DATE OF REQUEST FOR AN OPINION', 'CELEX IDENTIFIER',
+ 'SECTOR IDENTIFIER', 'NATURAL NUMBER (CELEX)', 'TYPE OF LEGAL RESOURCE',
+ 'YEAR OF THE LEGAL RESOURCE', 'WORK CITES WORK. CI / CJ', 'LEGACY DATE OF CREATION OF WORK',
+ 'DATE OF DOCUMENT', 'IDENTIFIER OF DOCUMENT', 'WORK VERSION',
'LAST CMR MODIFICATION DATE', 'CASE LAW HAS CONCLUSIONS']
Y = ['LEGAL RESOURCE HAS TYPE OF ACT', 'WORK HAS RESOURCE TYPE', 'CASE LAW ORIGINATES IN COUNTRY',
'LEGAL RESOURCE IS ABOUT SUBJECT MATTER', 'ECLI', 'REFERENCE TO PROVISIONS OF NATIONAL LAW',
- 'PUBLICATION REFERENCE OF COURT DECISION', 'CELEX IDENTIFIER', 'LOCAL IDENTIFIER', 'SECTOR IDENTIFIER',
- 'TYPE OF LEGAL RESOURCE', 'YEAR OF THE LEGAL RESOURCE', 'WORK IS CREATED BY AGENT (AU)',
- 'LEGACY DATE OF CREATION OF WORK', 'DATE OF DOCUMENT', 'IDENTIFIER OF DOCUMENT', 'WORK TITLE', 'CMR CREATION DATE',
- 'LAST CMR MODIFICATION DATE', 'CASE LAW DELIVERED BY NATIONAL COURT', 'REFERENCE TO A EUROPEAN ACT IN FREE TEXT',
+ 'PUBLICATION REFERENCE OF COURT DECISION', 'CELEX IDENTIFIER', 'LOCAL IDENTIFIER',
+ 'SECTOR IDENTIFIER', 'TYPE OF LEGAL RESOURCE', 'YEAR OF THE LEGAL RESOURCE',
+ 'WORK IS CREATED BY AGENT (AU)', 'LEGACY DATE OF CREATION OF WORK', 'DATE OF DOCUMENT',
+ 'IDENTIFIER OF DOCUMENT', 'WORK TITLE', 'CMR CREATION DATE',
+ 'LAST CMR MODIFICATION DATE', 'CASE LAW DELIVERED BY NATIONAL COURT',
+ 'REFERENCE TO A EUROPEAN ACT IN FREE TEXT',
'CASE LAW BASED ON A LEGAL INSTRUMENT', 'PARTIES OF THE CASE LAW']
COLS = set(X + Y)
COLS = sorted(COLS)
-"""
-Method used after the json to csv conversion, to save the file in the processed directory.
-"""
-
-
def create_csv(filepath, encoding="UTF8", data=None):
+ """
+ Method used after the json to csv conversion, to save the file in the processed directory.
+ """
if data != "":
csv_file = open(filepath, 'w', encoding=encoding)
csv_writer = csv.writer(csv_file)
@@ -43,14 +47,11 @@ def create_csv(filepath, encoding="UTF8", data=None):
csv_writer.writerows(data)
csv_file.close()
-
-"""
-Method used to transform the json file received from cellar_extraction to a csv file.
-Cellar specific, sets specific columns with names defined at the beginning of file as COLS.
-"""
-
-
def json_to_csv(json_data):
+ """
+ Method used to transform the json file received from cellar_extraction to a csv file.
+ Cellar specific, sets specific columns with names defined at the beginning of file as COLS.
+ """
final_data = []
for data in json_data:
ecli_data = json_data[data]
diff --git a/cellar/cellar_extractor/nodes_and_edges.py b/cellar/cellar_extractor/nodes_and_edges.py
index 087c798..9cc4b0f 100644
--- a/cellar/cellar_extractor/nodes_and_edges.py
+++ b/cellar/cellar_extractor/nodes_and_edges.py
@@ -1,7 +1,9 @@
import pandas as pd
+
def extract_containing_subject_matter(df,phrase):
returner = df[df["LEGAL RESOURCE IS ABOUT SUBJECT MATTER"].str.contains(phrase, na=False)]
return returner
+
def get_df_with_celexes(df,celexes):
returner = df[df['CELEX IDENTIFIER'].isin(celexes)]
return returner
@@ -12,7 +14,7 @@ def get_edges_list(df,only_local):
vals = extraction['citing'].tolist()
nodes = set()
edges = list()
- for i in range(len(keys)):
+    for i, _ in enumerate(keys):
k = keys[i]
val = vals[i]
if val != val:
diff --git a/cellar/cellar_extractor/operative_extractions.py b/cellar/cellar_extractor/operative_extractions.py
new file mode 100644
index 0000000..2414a1a
--- /dev/null
+++ b/cellar/cellar_extractor/operative_extractions.py
@@ -0,0 +1,353 @@
+import csv
+import json
+import os
+import requests
+from bs4 import BeautifulSoup
+
+class Analyzer():
+ """
+ This class returns a list of the operative part for a given celex id.
+    The celex id is initialized through the constructor.
+ """
+ celex: str="" # declare celex as a string
+ # declare url as a string
+ url:str=""
+ def __init__(self, celex):
+        # The celex id is passed to the constructor when the class is instantiated
+ self.celex = celex
+ self.url = f"https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX%3A{self.celex}&from=EN"
+
+ def html_page_structure_one(self) -> list:
+ """
+        This function retrieves the operative part from documents of the respective celex id.
+        It scrapes the operative part from a nested table structure.
+        The relevant text lies inside the coj-bold class of the span tag.
+ """
+ website = requests.get(self.url, timeout=60).text
+ parser = BeautifulSoup(website, 'lxml')
+ div = parser.find_all('table') # Find all tables tag from the website
+ one = []
+ for divs in div:
+ # Find each nested table within the table
+ table = divs.find('table')
+ if table is not None:
+ # Find all p under the nested table with the coj-normal class
+ p = table.find_all('p', class_="coj-normal")
+ for x in p:
+ # Span class of coj-bold under the p tag
+ span = x.find_all('span', class_="coj-bold")
+ for y in span:
+ if x is not None and y is not None:
+ # append text from span onto a list
+ one.append(y.text)
+ return one
+
+ def html_page_structure_two(self) -> list:
+ """
+        This function retrieves the operative part from documents of the respective celex id.
+        It scrapes the operative part from a paragraph (p) structure.
+        The relevant text lies inside the normal class of the p tag which
+        comes after the keyword "operative" in the preceding span tag.
+ """
+ website = requests.get(self.url, timeout=60).text
+ parser = BeautifulSoup(website, 'lxml')
+ p = parser.find_all('p')
+ two = []
+ for para in p:
+ span = para.find('span')
+ if span is not None:
+ if "operative" in span.text.lower():
+ normal = span.find_all_next('p', class_="normal")
+ for op in normal:
+ two.append(op.text)
+ return two
+
+ def structure_three(self) -> list:
+ """
+        This function retrieves the operative part from documents of the respective celex id.
+        It scrapes the operative part from a nested table structure.
+        The relevant text lies inside the coj-bold class of the span tag.
+ """
+ website = requests.get(self.url, timeout=60).text
+ parser = BeautifulSoup(website, 'lxml')
+ table = parser.find_all('table')
+ three = []
+ for tables in table:
+ interior = tables.find_all('table')
+ for interiors in interior:
+ if interiors is not None:
+ p = interiors.find_all('p', class_="coj-normal")
+ for x in p:
+ span = x.find_all('span', class_="coj-bold")
+ for y in span:
+ if x is not None and y is not None:
+ three.append(y.text)
+ return three
+
+ def structure_four(self) -> list:
+ """
+        This function retrieves the operative part from documents of the respective celex id.
+        It scrapes the operative part from the table that follows the span tag
+        containing the keyword "operative"; the relevant text lies inside the
+        oj-normal p tags of that table.
+ """
+ website = requests.get(self.url, timeout=60).text
+ parser = BeautifulSoup(website, 'lxml')
+ p = parser.find_all('p')
+ four = []
+ for para in p:
+ span = para.find('span')
+ if span is not None:
+ if "operative" in span.text.lower():
+ normal = span.find_all_next('table')
+ for op in normal:
+                    tbody = op.find('tbody')
+                    if tbody is None:
+                        continue  # guard: skip tables without a tbody
+                    new_p = tbody.find_all('p', class_="oj-normal")
+ for subsequent in new_p:
+ if subsequent is not None:
+ four.append(subsequent.text)
+ return four
+
+ def structure_five(self) -> list:
+ """
+        This function retrieves the operative part from documents of the respective celex id.
+        It scrapes the operative part from the table that follows the span tag
+        containing the keyword "operative"; the relevant text lies inside the
+        normal-class p tags of that table.
+ """
+ website = requests.get(self.url, timeout=60).text
+ parser = BeautifulSoup(website, 'lxml')
+ p = parser.find_all('p')
+ five = []
+ for para in p:
+
+ span = para.find('span')
+ if span is not None:
+ if "operative" in span.text.lower():
+ normal = span.find_all_next('table')
+ for op in normal:
+                        tbody = op.find('tbody')
+                        if tbody is None:
+                            continue  # guard: skip tables without a tbody
+                        new_p = tbody.find_all('p', class_="normal")
+ for subsequent in new_p:
+ if subsequent is not None:
+ five.append(subsequent.text)
+
+ return five
+
+ def structure_six(self) -> list:
+ """
+        This function retrieves the operative part from documents of the respective celex id.
+        It scrapes the operative part from an h2 (header) structure.
+        The relevant text lies inside the p tags which come after the h2 tag
+        titled "Operative part".
+ """
+ website = requests.get(self.url, timeout=60).text
+ parser = BeautifulSoup(website, 'lxml')
+ div = parser.find_all('h2')
+ six = []
+ for h2 in div:
+ if h2.text == "Operative part":
+ operatives = h2.find_all_next('p')
+ for operative in operatives:
+
+ six.append(operative.text)
+ return six
+
+ def structure_seven(self) -> list:
+ """
+        This function retrieves the operative part from documents of the respective celex id.
+        It scrapes the operative part from a table structure.
+        The relevant text lies inside the bold-class span tags nested in
+        normal-class p tags within the table cells.
+ """
+ website = requests.get(self.url, timeout=60).text
+ parser = BeautifulSoup(website, 'lxml')
+ div = parser.find_all('table')
+ seven = []
+ for divs in div:
+ # find tbody within the table
+ table = divs.find_all('tbody')
+ for tables in table:
+ if tables is not None:
+ # find tr within the tbody
+ p = tables.find_all('tr')
+ for x in p:
+ if x is not None:
+ # find td within the tr
+ td = x.find_all('td')
+ for y in td:
+ if y is not None:
+ p = y.find_all('p', class_="normal")
+ for _all in p:
+ if _all is not None:
+ # find operative part within the span
+ span = _all.find_all(
+ 'span', class_="bold")
+ for spans in span:
+ # Append it into a list and return the
+ # list when the function is called
+ seven.append(spans.text)
+ return seven
+
+ def structure_eight(self) -> list:
+ """
+        This function retrieves the operative part from documents of the respective celex id.
+        The text is extracted from the span tag nested inside
+        the tbody tag. Returns a list as output.
+ """
+ website = requests.get(self.url, timeout=60).text
+ parser = BeautifulSoup(website, 'lxml')
+
+ tbody = parser.find_all('tbody')
+ eight = []
+ for _all in tbody:
+ if _all is not None:
+ tr = _all.find_all('tr')
+ for trs in tr:
+ if trs is not None:
+ p = parser.find_all('p', class_="normal")
+ for paras in p:
+ if paras is not None:
+ if "on those grounds" in paras.text.lower():
+ span = paras.find_all_next(
+ 'span', class_="bold")
+ for spans in span:
+ if spans is not None:
+ eight.append(spans.text)
+ return eight
+
+ def structure_nine(self) -> list:
+ """
+        This function retrieves the operative part from documents of the respective celex id.
+        The operative part is under the bold (b)
+ tag after the p tag where the keywords "on those grounds" exist.
+ """
+ website = requests.get(self.url, timeout=60).text
+ parser = BeautifulSoup(website, 'lxml')
+ nine = []
+ div = parser.find_all('p')
+ for divs in div:
+ if divs is not None:
+ if "on those grounds" in divs.text.lower():
+ b = divs.find_all_next('b')
+ for bolds in b:
+ nine.append(bolds.text)
+ return nine
+
+ def structure_eleven(self) -> list:
+ """
+        This function retrieves the operative part from documents of the respective celex id.
+        The operative part is under the paragraph (p)
+ tag after the b tag where the keywords "operative part" exist.
+ """
+ website = requests.get(self.url, timeout=60).text
+ parser = BeautifulSoup(website, 'lxml')
+ bold = parser.find_all('b')
+
+ eleven = []
+
+ for b in bold:
+ if b is not None:
+ if "operative part" in b.text.lower():
+ tables = b.find_all_next('p')
+ for table in tables:
+ if table is not None:
+ eleven.append(table.text)
+ return eleven
+
+ def structure_ten(self):
+ """
+        This function retrieves the operative part from documents of the respective celex id.
+        Since the content is preloaded using js/client-side
+        server functions, the text from the current page is retrieved and the
+        operative part is scraped after the occurrence of the phrase
+ "On those grounds".
+ """
+ website = requests.get(self.url, timeout=60).text
+ parser = BeautifulSoup(website, 'lxml')
+ appender = []
+ for string in parser.stripped_strings:
+
+ appender.append(string)
+
+ found = False
+ after_grounds = []
+
+ for x in appender:
+
+ if "on those grounds" in x.lower():
+ found = True
+
+ if found:
+ if len(x.split(" ")) > 3:
+ after_grounds.append(x)
+ return after_grounds
+
+ def __call__(self) -> list:
+ """
+        This inbuilt __call__ function runs every extraction method of the
+        `Analyzer` class and returns the first non-empty operative part list.
+ """
+
+ container = [self.html_page_structure_one(), self.html_page_structure_two(),
+ self.structure_three(), self.structure_four(), self.structure_five(),
+ self.structure_six(), self.structure_seven(), self.structure_eight(),
+ self.structure_nine(), self.structure_ten(), self.structure_eleven()]
+
+        # Return the output of the first structure that yielded any text.
+        for one in container:
+            if one:
+                return one
+        return None
+
+class Writing():
+ """
+ This class has different methods, for the purpose of writing the operative part
+ into different file formats.(Csv,txt,json)
+ """
+    instance: Analyzer
+    x: list
+
+ current_dir = os.getcwd()
+
+ txt_dir = os.path.join(current_dir, "txt")
+ csv_dir = os.path.join(current_dir, "csv")
+ json_dir = os.path.join(current_dir, "json")
+
+ if not os.path.exists(txt_dir):
+ os.makedirs(txt_dir)
+ if not os.path.exists(csv_dir):
+ os.makedirs(csv_dir)
+
+ if not os.path.exists(json_dir):
+ os.makedirs(json_dir)
+
+ def __init__(self, celex: str):
+ self.celex = celex
+ self.instance = Analyzer(self.celex)
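+        # Calling the Analyzer instance returns the operative part as a list (or None).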
+ self.x = self.instance()
+
+
+    def to_csv(self):
+        if self.x is not None:
+            # Use a context manager so the file handle is always closed.
+            with open("csv/output.csv", "a+", encoding="utf-8", newline="") as _file:
+                writer = csv.writer(_file)
+                writer.writerow([self.celex, self.x])
+
+    def to_json(self):
+        if self.x is not None:
+            data = {'Celex': self.celex, "Operative part": self.x}
+            with open('json/data.json', 'a+', encoding='utf-8') as _file:
+                json.dump(data, _file)
+
+    def to_txt(self):
+        if self.x is not None:
+            with open(f"txt/{self.celex}.txt", "a", encoding="utf-8") as _file:
+                for w in self.x:
+                    _file.write(w + "\n")
diff --git a/cellar/cellar_extractor/sparql.py b/cellar/cellar_extractor/sparql.py
index de989ce..c41a49c 100644
--- a/cellar/cellar_extractor/sparql.py
+++ b/cellar/cellar_extractor/sparql.py
@@ -1,5 +1,6 @@
from SPARQLWrapper import SPARQLWrapper, JSON, CSV, POST
import requests
+
def run_eurlex_webservice_query(query_input,username,password):
target = "https://eur-lex.europa.eu/EURLexWebService?wsdl"
query = '''
@@ -22,15 +23,12 @@ def run_eurlex_webservice_query(query_input,username,password):
''' % (username, password,query_input)
return requests.request("POST", target, data=query, allow_redirects=True)
-
-"""
-Method acquired from a different law and tech project for getting the citations of a source_celex.
-Unlike get_citations_csv, only works for one source celex at once. Returns a set containing all the works cited by
-the source celex.
-"""
-
def get_citations(source_celex, cites_depth=1, cited_depth=1):
"""
+ Method acquired from a different law and tech project for getting the citations of a
+ source_celex.
+ Unlike get_citations_csv, only works for one source celex at once. Returns a set
+ containing all the works cited by the source celex.
Gets all the citations one to X steps away. Hops can be specified as either
the source document citing another (defined by `cites_depth`) or another document
citing it (`cited_depth`). Any numbers higher than 1 denote that new source document
@@ -69,18 +67,16 @@ def get_citations(source_celex, cites_depth=1, cited_depth=1):
for bind in ret['results']['bindings']:
target = bind['name2']['value']
targets.add(target)
- targets = set([el for el in list(targets)]) # Filters the list. Filter type: '3'=legislation, '6'=case law.
-
+ # Filters the list. Filter type: '3'=legislation, '6'=case law.
+ targets = set([el for el in list(targets)])
return targets
-
-"""
-Method sending a query to the endpoint, which asks for cited works for each celex.
-The celex variable in the method is a list of all the celex identifiers of the cases we need the citations of.
-The query returns a csv, containing all of the data needed."""
-
-
def get_citations_csv(celex):
+ """
+ Method sending a query to the endpoint, which asks for cited works for each celex.
+ The celex variable in the method is a list of all the celex identifiers of the
+ cases we need the citations of.
+    The query returns a csv containing all of the data needed.
+    """
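+    # Illustrative sketch: get_citations_csv(["62019CJ0668"]) returns CSV text
+    # pairing each celex id with the celex ids of the works it cites.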
endpoint = 'https://publications.europa.eu/webapi/rdf/sparql'
input_celex = '", "'.join(celex)
query = '''
diff --git a/cellar/setup.py b/cellar/setup.py
index eec4dce..50bab1b 100644
--- a/cellar/setup.py
+++ b/cellar/setup.py
@@ -9,7 +9,7 @@
setup(
name='cellar_extractor',
- packages=find_packages(include=['cellar_extractor']),
+ packages=find_packages(include=['cellar_extractor', 'cellar_extractor.operative_extractions']),
version='1.0.61',
description='Library for extracting cellar data',
author='LawTech Lab',
@@ -23,4 +23,4 @@
"Bug Tracker": "https://github.com/maastrichtlawtech/extraction_libraries",
"Build Source": "https://github.com/maastrichtlawtech/extraction_libraries",
},
-)
\ No newline at end of file
+)
diff --git a/tests.py b/tests.py
index 4732cdb..cea29f8 100644
--- a/tests.py
+++ b/tests.py
@@ -1,5 +1,7 @@
+import random
from cellar_extractor import *
+
def cellar_csv_n():
get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100)
@@ -70,3 +72,80 @@ def test_cellar_json_n():
assert True
except Exception:
assert False, "Downloading cellar as json failed."
+
+def operative_part_csv(celex):
+ csv_store=Writing(celex)
+ try:
+ csv_store.to_csv()
+ assert True
+ except Exception:
+ assert False, "Downloading and storing as csv failed for operative part"
+
+def operative_part_json(celex):
+ json_store=Writing(celex)
+ try:
+ json_store.to_json()
+ assert True
+ except Exception:
+ assert False, "Downloading and storing as json failed for operative part"
+
+def operative_part_txt(celex):
+ txt_store=Writing(celex)
+ try:
+ txt_store.to_txt()
+ assert True
+ except Exception:
+ assert False, "Downloading and storing as txt failed for operative part"
+
+def for_operative_part(celex):
+ try:
+ test_output=Analyzer(celex)
+ test_output()
+ assert True
+ except Exception:
+ assert False, "Cannot extract for celex"
+
+def test_operative_part_txt():
+    celex_store = ["61983CJ0207", "61988CJ0360", "62005CJ0168", "62008CJ0484",
+                   "62010CJ0014", "62005CJ0343", "62000CJ0154"]
+    celex = random.choice(celex_store)
+ try:
+ operative_part_txt(celex)
+ assert True
+ except Exception:
+ assert False, "Cannot extract operative text"
+
+def test_operative_part_json():
+    celex_store = ["61983CJ0207", "61988CJ0360", "62005CJ0168", "62008CJ0484",
+                   "62010CJ0014", "62005CJ0343", "62000CJ0154"]
+    celex = random.choice(celex_store)
+ try:
+ operative_part_json(celex)
+ assert True
+ except Exception:
+ assert False, "Cannot extract operative text"
+
+def test_operative_part_csv():
+    celex_store = ["61983CJ0207", "61988CJ0360", "62005CJ0168", "62008CJ0484",
+                   "62010CJ0014", "62005CJ0343", "62000CJ0154"]
+    celex = random.choice(celex_store)
+ try:
+ operative_part_csv(celex)
+ assert True
+ except Exception:
+ assert False, "Cannot extract operative text"
+
+def test_for_operative_part():
+    celex_store = ["61983CJ0207", "61988CJ0360", "62005CJ0168", "62008CJ0484",
+                   "62010CJ0014", "62005CJ0343", "62000CJ0154"]
+    celex = random.choice(celex_store)
+ try:
+ for_operative_part(celex)
+ assert True
+ except Exception:
+ assert False, "Cannot extract operative part"
+