diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index ea25b00..29bcc92 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -17,7 +17,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install cellar-extractor + pip install -e cellar/ # pip install echr-extractor - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner." - run: echo "🖥️ The workflow is now ready to test your code on the runner." diff --git a/.gitignore b/.gitignore index aa30a5f..06a87e2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -venv +.venv* .idea data rechtspraak/rechtspraak_extractor/tests/data @@ -20,4 +20,6 @@ rechtspraak.zip build.bat echr_extractor-whl.zip echr_extractor-whl -echr_extractor.egg-info \ No newline at end of file +echr_extractor.egg-info + +.*DS_Store \ No newline at end of file diff --git a/cellar/README.md b/cellar/README.md index acb6b80..f5d9d64 100644 --- a/cellar/README.md +++ b/cellar/README.md @@ -37,6 +37,13 @@ Python 3.9 gijsvd + + + venvis +
+ venvis +
+ @@ -59,6 +66,16 @@ Python 3.9 Allows the creation of a network graph of the citations. Can only be returned in-memory.
  • filter_subject_matter
  • Returns a dataframe containing only the cases whose subject matter column contains a certain phrase. +
  • Analyzer
  • + A class whose instance, when called, returns a list of all the text contained within the operative part of a Court of Justice of the European Union (CJEU, formerly the European Court of Justice (ECJ)) judgment (English only). +
  • Writing
  • + A class which writes the text of the operative part of each European case law case (English only) into csv, json and txt files (the output directories are created upon initialization).
    + the Writing class has three functions: to_csv(), to_json() and to_txt(); usage is shown in the Examples section below.

    +
    @@ -115,11 +132,22 @@ Python 3.9
  • phrase: string, required, default None
  • The phrase which has to be present in the subject matter of cases. Case insensitive. +
  • Analyzer
  • + celex id: string, required + The CELEX identifier of the case whose operative part is extracted.
  • Writing
  • + celex id: string, required + The CELEX identifier of the case whose operative part is written to file. ## Examples -``` +```python import cellar_extractor as cell Below are examples for in-file saving: @@ -132,7 +160,26 @@ Below are examples for in-memory saving: df = cell.get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=1000) df,json = cell.get_cellar_extra(save_file='n', max_ecli=100, sd='2022-01-01', threads=10) ``` +
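    The filter_subject_matter helper pairs naturally with the in-memory output above; a minimal sketch, assuming the call above succeeded (the "prices" phrase mirrors the call in the deleted Testing_file.py further down in this diff):
    + +```python +import cellar_extractor as cell +df = cell.get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) +# keep only the cases whose subject matter column contains the given phrase +prices = cell.filter_subject_matter(df, "prices") +``` +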

    Instantiate the class and call the instance; the call returns the operative part as a list.

    + +```python +import cellar_extractor as cell +instance = cell.Analyzer(celex_id)  # celex_id is a string, e.g. "62005CJ0168" +output_list = instance() +print(output_list)  # prints the operative part of the case as a list +``` + +
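    Analyzer tries a series of known page structures in turn and returns None when none of them matches, so guarding the result is advisable; a minimal sketch, reusing CELEX ids from the tests.py list in this diff:
    + +```python +import cellar_extractor as cell +for celex_id in ["61983CJ0207", "62005CJ0168"]: +    operative = cell.Analyzer(celex_id)() +    if operative:  # None (or empty) when no known structure matched +        print(celex_id, "->", len(operative), "text fragments") +``` +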

    The Writing class also takes a celex id through its constructor and writes the content of the case's operative part into a different file format depending on the function called:

+ +```python +import cellar_extractor as cell +instance = cell.Writing(celex_id)  # celex_id is a string +output = instance.to_csv()   # for csv +output = instance.to_txt()   # for txt +output = instance.to_json()  # for json + +``` ## License [![License: Apache 2.0](https://img.shields.io/github/license/maastrichtlawtech/extraction_libraries)](https://opensource.org/licenses/Apache-2.0) diff --git a/cellar/__init__.py b/cellar/__init__.py new file mode 100644 index 0000000..09fe356 --- /dev/null +++ b/cellar/__init__.py @@ -0,0 +1,2 @@ +from cellar_extractor import * + diff --git a/cellar/cellar_extractor/Testing_file.py b/cellar/cellar_extractor/Testing_file.py deleted file mode 100644 index 412e4e8..0000000 --- a/cellar/cellar_extractor/Testing_file.py +++ /dev/null @@ -1,30 +0,0 @@ -""" - -This file is purely a testing file for trying out separate parts of code, testing if everything works and such. -Can be also used to develop future code. - - - -""" - -from nodes_and_edges import get_nodes_and_edges -from os.path import join -from json_to_csv import read_csv -import time -from eurlex_scraping import * -from cellar import * -from sparql import * - - - - -if __name__ == '__main__': - celex = "62004CJ0292" - site = get_entire_page(celex) - text = get_full_text_from_html(site) - cits = get_citations_with_extra_info(text) - print(cits) - data = get_cellar(sd='2023-01-01',max_ecli=100,save_file='n') - d3 = filter_subject_matter(data, "prices") - b=2 - pass \ No newline at end of file diff --git a/cellar/cellar_extractor/__init__.py b/cellar/cellar_extractor/__init__.py index 39184aa..d07093e 100644 --- a/cellar/cellar_extractor/__init__.py +++ b/cellar/cellar_extractor/__init__.py @@ -2,5 +2,7 @@ from cellar_extractor.cellar import get_cellar_extra from cellar_extractor.cellar import get_nodes_and_edges_lists from cellar_extractor.cellar import filter_subject_matter +from cellar_extractor.operative_extractions import Analyzer +from cellar_extractor.operative_extractions import Writing import logging -logging.basicConfig(level=logging.INFO) \ No newline at end of file +logging.basicConfig(level=logging.INFO) diff --git a/cellar/cellar_extractor/cellar.py b/cellar/cellar_extractor/cellar.py index b108844..66b3ff5 100644 --- a/cellar/cellar_extractor/cellar.py +++ b/cellar/cellar_extractor/cellar.py @@ -11,7 +11,6 @@ from cellar_extractor.json_to_csv import json_to_csv_main, json_to_csv_returning from cellar_extractor.nodes_and_edges import get_nodes_and_edges - def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_format='csv'): if not ed: ed = datetime.now().isoformat(timespec='seconds') @@ -40,7 +39,7 @@ def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_forma json_to_csv_main(all_eclis, file_path) else: file_path = os.path.join('data', file_name + '.json') - with open(file_path, "w") as f: + with open(file_path, "w", encoding="utf-8") as f: json.dump(all_eclis, f) else: if file_format == 'csv': @@ -51,7 +50,8 @@ def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_forma logging.info("\n--- DONE ---") -def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", threads=10, username="", password=""): +def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", + threads=10, username="", password=""): if not ed: ed = datetime.now().isoformat(timespec='seconds') data = get_cellar(ed=ed, save_file='n', max_ecli=max_ecli, sd=sd, file_format='csv') @@ -64,15 +64,16 @@ def get_cellar_extra(ed=None, save_file='y', max_ecli=100, 
sd="2022-05-01", thre file_path = os.path.join('data', file_name + '.csv') if save_file == 'y': Path('data').mkdir(parents=True, exist_ok=True) - extra_cellar(data=data, filepath=file_path, threads=threads, username=username, password=password) + extra_cellar(data=data, filepath=file_path, threads=threads, + username=username, password=password) logging.info("\n--- DONE ---") else: - data, json = extra_cellar(data=data, threads=threads, username=username, password=password) + data, json_data = extra_cellar(data=data, threads=threads, + username=username, password=password) logging.info("\n--- DONE ---") - return data, json - + return data,json_data def get_nodes_and_edges_lists(df=None, only_local=False): if df is None: diff --git a/cellar/cellar_extractor/cellar_extra_extract.py b/cellar/cellar_extractor/cellar_extra_extract.py index 4b6c0eb..81c2764 100644 --- a/cellar/cellar_extractor/cellar_extra_extract.py +++ b/cellar/cellar_extractor/cellar_extra_extract.py @@ -4,17 +4,44 @@ def extra_cellar(data=None, filepath=None, threads=10, username="", password=""): + """ + Extracts information from a cellar dataset. + + Args: + data (pandas.DataFrame, optional): The input dataset. If not provided, + it will be read from the specified filepath. + filepath (str, optional): The path to the input dataset file. If provided, + the data will be read from this file. + threads (int, optional): The number of threads to use for parallel + processing. Default is 10. + username (str, optional): The username for accessing a separate + webservice. Default is an empty string. + password (str, optional): The password for accessing a separate + webservice. Default is an empty string. + + Returns: + tuple: A tuple containing the modified dataset and a JSON object. + + If `data` is not provided, the dataset will be read from the specified + `filepath`. + + If `username` and `password` are provided, the function will add + citations using a separate webservice. + + The function will add sections to the dataset using the specified + number of `threads`. If `filepath` is provided, + the modified dataset will be saved to the same file. Otherwise, the + modified dataset and a JSON object will be returned. + """ if data is None: data = read_csv(filepath) if filepath: if username !="" and password !="": add_citations_separate_webservice(data, username, password) - #print("Citations successfully added. The rest of additional extraction will now happen.") add_sections(data, threads, filepath.replace(".csv", "_fulltext.json")) data.to_csv(filepath, index=False) else: if username != "" and password != "": add_citations_separate_webservice(data, username, password) - #print("Citations successfully added. 
The rest of additional extraction will now happen.") json = add_sections(data, threads) return data, json diff --git a/cellar/cellar_extractor/cellar_queries.py b/cellar/cellar_extractor/cellar_queries.py index 6c74d16..b0f9e95 100644 --- a/cellar/cellar_extractor/cellar_queries.py +++ b/cellar/cellar_extractor/cellar_queries.py @@ -48,18 +48,23 @@ def get_all_eclis(starting_date=None, ending_date=None): return eclis -def get_raw_cellar_metadata(eclis, get_labels=True, force_readable_cols=True, force_readable_vals=False): +def get_raw_cellar_metadata(eclis, get_labels=True, force_readable_cols=True, + force_readable_vals=False): """Gets cellar metadata :param eclis: The ECLIs for which to retrieve metadata :type eclis: list[str] - :param get_labels: Flag to get human-readable labels for the properties, defaults to True + :param get_labels: Flag to get human-readable labels for the properties, + defaults to True :type get_labels: bool, optional - :param force_readable_cols: Flag to remove any non-labelled properties from the resulting dict, defaults to True + :param force_readable_cols: Flag to remove any non-labelled properties + from the resulting dict, defaults to True :type force_readable_cols: bool, optional - :param force_readable_vals: Flag to remove any non-labelled values from the resulting dict, defaults to False + :param force_readable_vals: Flag to remove any non-labelled values from + the resulting dict, defaults to False :type force_readable_vals: bool, optional - :return: Dictionary containing metadata. Top-level keys are ECLIs, second level are property names + :return: Dictionary containing metadata. Top-level keys are ECLIs, second + level are property names :rtype: Dict[str, Dict[str, list[str]]] """ @@ -100,8 +105,8 @@ def get_raw_cellar_metadata(eclis, get_labels=True, force_readable_cols=True, fo for ecli in eclis: metadata[ecli] = {} - # Take each triple, check which source doc it belongs to, key/value pair into its dict derived from the p and o in - # the query + # Take each triple, check which source doc it belongs to, key/value pair + # into its dict derived from the p and o in the query for res in ret['results']['bindings']: ecli = res['ecli']['value'] # We only want cdm predicates @@ -125,8 +130,9 @@ def get_raw_cellar_metadata(eclis, get_labels=True, force_readable_cols=True, fo else: val = res['o']['value'] - # We store the values for each property in a list. For some properties this is not necessary, - # but if a property can be assigned multiple times, this is important. Notable, for example is citations.b + # We store the values for each property in a list. For some properties + # this is not necessary, but if a property can be assigned multiple + # times, this is important. Notable, for example is citations. 
if key in metadata[ecli]: metadata[ecli][key].append(val) else: diff --git a/cellar/cellar_extractor/citations_adder.py b/cellar/cellar_extractor/citations_adder.py index ba47721..861d5cf 100644 --- a/cellar/cellar_extractor/citations_adder.py +++ b/cellar/cellar_extractor/citations_adder.py @@ -5,39 +5,39 @@ from io import StringIO from os.path import dirname, abspath import pandas as pd -from cellar_extractor.sparql import get_citations_csv, get_cited, get_citing, run_eurlex_webservice_query +from cellar_extractor.sparql import (get_citations_csv, get_cited, + get_citing, run_eurlex_webservice_query) from cellar_extractor.eurlex_scraping import extract_dictionary_from_webservice_query from tqdm import tqdm sys.path.append(dirname(dirname(dirname(dirname(abspath(__file__)))))) -""" -Method used by separate threads for the multi-threading method of adding citations to the dataframe -Sends a query which returns a csv file containing the the celex identifiers of cited works for each case. -Works with multi-case queries, at_once is the variable deciding for how many cases are used with each query. -""" - def execute_citations(csv_list, citations): + """ + Method used by separate threads for the multi-threading method of adding + citations to the dataframe. Sends a query which returns a csv file + containing the the celex identifiers of cited works for each case. Works + with multi-case queries, at_once is the variable deciding for how many + cases are used with each query. + """ at_once = 1000 for i in range(0, len(citations), at_once): new_csv = get_citations_csv(citations[i:(i + at_once)]) csv_list.append(StringIO(new_csv)) -""" -This method replaces replaces the column with citations. - -Old column -> links to cited works -New column -> celex identifiers of cited works - -It uses multithreading, which is very much recommended. -Uses a query to get the citations in a csv format from the endpoint. * +def add_citations(data, threads): + """ + This method replaces replaces the column with citations. -* More details in the query method. -""" + Old column -> links to cited works + New column -> celex identifiers of cited works + It uses multithreading, which is very much recommended. + Uses a query to get the citations in a csv format from the endpoint. * -def add_citations(data, threads): + * More details in the query method. + """ name = "WORK CITES WORK. CI / CJ" celex = data.loc[:, "CELEX IDENTIFIER"] @@ -68,15 +68,14 @@ def add_citations(data, threads): citations.sort_index(inplace=True) data.insert(1, name, citations) - -""" -Method used by separate threads for the multi-threading method of adding citations to the dataframe -Sends a query which returns a csv file containing the the celex identifiers of cited works for each case. -Works with multi-case queries, at_once is the variable deciding for how many cases are used with each query. -""" - - def execute_citations_separate(cited_list, citing_list, citations): + """ + Method used by separate threads for the multi-threading method of + adding citations to the dataframe. Sends a query which returns a csv + file containing the the celex identifiers of cited works for each case. + Works with multi-case queries, at_once is the variable deciding for + how many cases are used with each query. 
+ """ at_once = 1000 for i in range(0, len(citations), at_once): new_cited = get_cited(citations[i:(i + at_once)], 1) @@ -84,17 +83,12 @@ def execute_citations_separate(cited_list, citing_list, citations): cited_list.append(StringIO(new_cited)) citing_list.append(StringIO(new_citing)) - -""" - -Method used by separate threads for the multi-threading method of adding citations to the dataframe -Uses the eurlex webservices. -Also used for the single-thread approach. - -""" - - def execute_citations_webservice(dictionary_list, celexes, username, password): + """ + Method used by separate threads for the multi-threading method of + adding citations to the dataframe. Uses the eurlex webservices. + Also used for the single-thread approach. + """ at_once = 100 success=0 retry=0 @@ -103,7 +97,8 @@ def execute_citations_webservice(dictionary_list, celexes, username, password): normal_celex, contains_celex = clean_celex(celexes) def process_queries(link, celex): nonlocal success,retry - for i in tqdm(range(0, len(celex), at_once), colour="GREEN", position=0, leave=True, maxinterval=10000): + for i in tqdm(range(0, len(celex), at_once), colour="GREEN", + position=0, leave=True, maxinterval=10000): curr_celex = celex[i:(i + at_once)] input=" OR ".join(curr_celex) query = link % (str(input)) @@ -112,7 +107,8 @@ def process_queries(link, celex): response = run_eurlex_webservice_query(query, username, password) if response.status_code == 500 and "WS_WS_CALLS_IDLE_INTERVAL" not in response.text: perc=i*100/len(celexes) - logging.info(f"Limit of web service usage reached! Citations collection will stop here at {perc} % of citations downloaded." + + logging.info(f"Limit of web service usage reached! Citations collection\ + will stop here at {perc} % of citations downloaded." + f"\nThere were {success} successful queries and {retry} retries") return elif "0" in response.text: @@ -134,16 +130,13 @@ def process_queries(link, celex): process_queries(base_contains_query,contains_celex) -""" - -Method used to separate celex id's when there are multiple pointing to the same document. -On top of that, separates celex id's with '(' and ')', these brackets are keywords for the webservice query. -After separated, a different query is ran for the normal celexes, and those with brackets. - -""" - - def clean_celex(celex): + """ + Method used to separate celex id's when there are multiple pointing to the same document. + On top of that, separates celex id's with '(' and ')', these brackets are keywords for the + webservice query. After separated, a different query is ran for the normal celexes, and + those with brackets. + """ normal_list = list() contains_list = list() for c1 in celex: @@ -163,16 +156,14 @@ def clean_celex(celex): normal_list.append(c1) return normal_list, contains_list - -""" - -Method used for creation of a dictionary of documents citing the document. -Uses the dictionary of documents cited by the document. -Output will more than likely be bigger than the input dictionary, as it will also include treaties and other documents, -which are not being extracted by the cellar extractor. - -""" def allowed_id(id): + """ + Method used for creation of a dictionary of documents citing the document. + Uses the dictionary of documents cited by the document. + Output will more than likely be bigger than the input dictionary, + as it will also include treaties and other documents, + which are not being extracted by the cellar extractor. 
+ """ if id != "": return id[0] == 8 or id[0] == 6 else: @@ -190,17 +181,13 @@ def reverse_citing_dict(citing): cited[c] = k return cited - -""" - -Method used to add the dictionaries to the dataframe. -Used by the citations adding from the eurlex webservices. -Implements checks, for whether the document whose data we want to add exists in the original dataframe. - -""" - - def add_dictionary_to_df(df, dictionary, column_title): + """ + Method used to add the dictionaries to the dataframe. + Used by the citations adding from the eurlex webservices. + Implements checks, for whether the document whose data we want to add + exists in the original dataframe. + """ column = pd.Series([], dtype='string') celex = df.loc[:, "CELEX IDENTIFIER"] for k in dictionary: @@ -211,29 +198,28 @@ def add_dictionary_to_df(df, dictionary, column_title): df.insert(1, column_title, column) -""" -Main method for citations adding via eurlex webservices. - -Old column -> links to cited works -New columns -> celex identifiers of cited works and works citing current work - -""" - - def add_citations_separate_webservice(data, username, password): + """ + Main method for citations adding via eurlex webservices. + Old column -> links to cited works + New columns -> celex identifiers of cited works and works citing current work + """ celex = data.loc[:, "CELEX IDENTIFIER"] query = " SELECT CI, DN WHERE DN = 62019CJ0668" response = run_eurlex_webservice_query(query, username, password) if response.status_code == 500 : if "WS_MAXIMUM_NB_OF_WS_CALLS" in response.text: - logging.warning("Maximum number of calls to the eurlex webservices reached! The code will skip the citations download.") + logging.warning("Maximum number of calls to the eurlex webservices reached!\ + The code will skip the citations download.") return else: - logging.warning("Incorrect username and password for eurlex webservices! (The account login credentials and webservice) " + + logging.warning("Incorrect username and password for eurlex webservices!\ + (The account login credentials and webservice) " + "login credentials are different)") sys.exit(2) elif response.status_code == 403: - logging.info("Webservice connection was blocked, eurlex might be going through maintenance right now.") + logging.info("Webservice connection was blocked, eurlex might be going\ + through maintenance right now.") sys.exit(2) else: logging.info("Webservice connection was successful!") @@ -244,27 +230,26 @@ def add_citations_separate_webservice(data, username, password): for d in dictionary_list: citing_dict.update(d) logging.info("Webservice extraction finished, the rest of extraction will now happen.") - time.sleep(1) # It seemed to print out the length of dictionary wrong, even when it was equal to 1000. + time.sleep(1) # It seemed to print out the length of dictionary wrong, + # even when it was equal to 1000. cited_dict = reverse_citing_dict(citing_dict) add_dictionary_to_df(data, citing_dict, "citing") add_dictionary_to_df(data, cited_dict, "cited_by") +def add_citations_separate(data, threads): + """ + This method replaces replaces the column with citations. -""" -This method replaces replaces the column with citations. - -Old column -> links to cited works -New column -> celex identifiers of cited works - -It uses multithreading, which is very much recommended. -Uses a query to get the citations in a csv format from the endpoint. * + Old column -> links to cited works + New column -> celex identifiers of cited works -* More details in the query method. 
-""" + It uses multithreading, which is very much recommended. + Uses a query to get the citations in a csv format from the endpoint. * + * More details in the query method. + """ -def add_citations_separate(data, threads): celex = data.loc[:, "CELEX IDENTIFIER"] length = celex.size if length > 100: # to avoid getting problems with small files @@ -277,7 +262,8 @@ def add_citations_separate(data, threads): for i in range(0, length, at_once_threads): curr_celex = celex[i:(i + at_once_threads)] - t = threading.Thread(target=execute_citations_separate, args=(cited_csv, citing_csv, curr_celex)) + t = threading.Thread(target=execute_citations_separate, + args=(cited_csv, citing_csv, curr_celex)) threads.append(t) for t in threads: diff --git a/cellar/cellar_extractor/csv_extractor.py b/cellar/cellar_extractor/csv_extractor.py index d6a4527..6d0d5c0 100644 --- a/cellar/cellar_extractor/csv_extractor.py +++ b/cellar/cellar_extractor/csv_extractor.py @@ -3,12 +3,11 @@ import logging from cellar_extractor.json_to_csv import read_csv -""" -Method takes in a dataframe and returns a dataframe with only *number* of data rows. -""" - - def extract_rows(data, number): + """ + Method takes in a dataframe and returns a dataframe with only *number* of data rows. + """ + try: output = data[1:number] except Exception: diff --git a/cellar/cellar_extractor/eurlex_scraping.py b/cellar/cellar_extractor/eurlex_scraping.py index 944097e..d825ccd 100644 --- a/cellar/cellar_extractor/eurlex_scraping.py +++ b/cellar/cellar_extractor/eurlex_scraping.py @@ -1,8 +1,9 @@ -from bs4 import BeautifulSoup -import requests import time -import xmltodict import re +import requests +import xmltodict + +from bs4 import BeautifulSoup LINK_SUMMARY_INF = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:cIdHere&from=EN' LINK_SUMJURE = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:cIdHere_SUM&from=EN' @@ -17,31 +18,25 @@ def is_code(word): return word.replace(".", "0").replace("-", "0")[1:].isdigit() - -""" -Wrapped method for requests.get(). -After 10 retries, it gives up and returns a "404" string. -""" - - def response_wrapper(link, num=1): + """ + Wrapped method for requests.get(). + After 10 retries, it gives up and returns a "404" string. + """ if num == 10: return "404" try: - response = requests.get(link) + response = requests.get(link, timeout=60) return response except Exception: time.sleep(0.5 * num) return response_wrapper(link, num + 1) - -""" -This method returns the html of a summary page. -Cellar specific, works for celex id's starting a 6 and 8. -""" - - def get_summary_html(celex): + """ + This method returns the html of a summary page. + Cellar specific, works for celex id's starting a 6 and 8. + """ if celex == celex: # nan check if ";" in celex: idss = celex.split(";") @@ -82,18 +77,16 @@ def get_summary_html(celex): else: return "No summary available" - -""" -Method used to extract the summary from a html page. -Cellar specific, uses get_words_from_keywords. -Currently only walking for celex id's starting with a 6 ( EU cases). -""" - - def get_summary_from_html(html, starting): + """ + Method used to extract the summary from a html page. + Cellar specific, uses get_words_from_keywords. + Currently only walking for celex id's starting with a 6 ( EU cases). 
+ # This method turns the html code from the summary page into text # It has different cases depending on the first character of the CELEX ID # Should only be used for summaries extraction + """ text = get_full_text_from_html(html) if starting == "8": return "No summary available" @@ -107,17 +100,14 @@ def get_summary_from_html(html, starting): return text return text - -""" -Method used to extract the keywords from a html page. -Cellar specific, uses get_words_from_keywords. -""" - - def get_keywords_from_html(html, starting): + """ + Method used to extract the keywords from a html page. + Cellar specific, uses get_words_from_keywords. # This method turns the html code from the summary page into text # It has different cases depending on the first character of the CELEX ID # Should only be used for summaries extraction + """ text = get_full_text_from_html(html) if starting == "8": text = "No keywords available" @@ -125,22 +115,18 @@ def get_keywords_from_html(html, starting): elif starting == "6": return get_words_from_keywords(text) - -""" - -Method used for citations extraction from eurlex webservices. -It reads the SOAP response from the webservices, and adds values to the dictionary based on the results. -Dictionary is using the celex id of a work as key and a list of celex id's of works cited as value. - -""" - - def extract_dictionary_from_webservice_query(response): + """ + Method used for citations extraction from eurlex webservices. + It reads the SOAP response from the webservices, and adds values to the + dictionary based on the results. Dictionary is using the celex id of a + work as key and a list of celex id's of works cited as value. + """ text = response.text read = xmltodict.parse(text) results = read['S:Envelope']['S:Body']['searchResults']['result'] dictionary = dict() - if type(results) == list: + if isinstance(results, list): for result in results: celex, citing = extract_citations_from_soap(result) dictionary[celex] = citing @@ -149,16 +135,11 @@ def extract_dictionary_from_webservice_query(response): dictionary[celex] = citing return dictionary - -""" - -Method used for citations extraction from eurlex webservices. -Reads the individual celex id and documents cited from a single result. - -""" - - def extract_citations_from_soap(results): + """ + Method used for citations extraction from eurlex webservices. + Reads the individual celex id and documents cited from a single result. + """ main_content = results['content']['NOTICE']['WORK'] celex = main_content['ID_CELEX'].get('VALUE') try: @@ -166,7 +147,7 @@ def extract_citations_from_soap(results): except KeyError: return celex, "" citing_list = list() - if type(citing) == list: + if isinstance(citing, list): for cited in citing: celex_of_citation = get_citation_celex(cited) if celex_of_citation != "": @@ -175,20 +156,16 @@ def extract_citations_from_soap(results): else: return celex, get_citation_celex(citing) - -""" - -Method used for citations extraction from eurlex webservices. -Goes thru all of the different id's of the document cited, and returns the one that is a celex id. - -""" - - def get_citation_celex(cited): + """ + Method used for citations extraction from eurlex webservices. + Goes thru all of the different id's of the document cited, + and returns the one that is a celex id. 
+ """ identifiers = cited['SAMEAS'] - if type(identifiers) == list: - for id in identifiers: - ident = id['URI']['IDENTIFIER'] + if isinstance(identifiers, list): + for _id in identifiers: + ident = _id['URI']['IDENTIFIER'] if is_celex_id(ident): return ident else: @@ -197,31 +174,22 @@ def get_citation_celex(cited): return ident return "" - -""" - -Method checking if the id passed is a celex id, using regex. - -""" - - -def is_celex_id(id): - if id is None: +def is_celex_id(_id): + """ + Method checking if the id passed is a celex id, using regex. + """ + if _id is None: return False - if prog.match(id): + if prog.match(_id): return True else: return False - -""" -This method tries to extract only they keywords from a part of html page containing it. -They keywords on the page are always separated by " - " or other types of dashes. - -""" - - def get_words_from_keywords_em(text): + """ + This method tries to extract only they keywords from a part of html page containing it. + They keywords on the page are always separated by " - " or other types of dashes. + """ lines = text.split(sep="\n") returner = set() for line in lines: @@ -242,15 +210,10 @@ def get_words_from_keywords_em(text): returner.update(line.split(sep=" - ")) return ";".join(returner) - -""" - -One of the methods used to extract keywords from summary text. - -""" - - def get_words_from_keywords(text): + """ + One of the methods used to extract keywords from summary text. + """ if "Keywords" in text: try: index = text.find("Keywords") @@ -269,15 +232,12 @@ def get_words_from_keywords(text): text = text[:index] return get_words_from_keywords_em(text) - -""" - This method turns the html code from the summary page into text. - It has different cases depending on the first character of the CELEX ID. - Universal method, also replaces all "," with "_". -""" - - def get_full_text_from_html(html_text): + """ + This method turns the html code from the summary page into text. + It has different cases depending on the first character of the CELEX ID. + Universal method, also replaces all "," with "_". + """ # This method turns the html code from the summary page into text # It has different cases depending on the first character of the CELEX ID # Should only be used for summaries extraction @@ -294,15 +254,12 @@ def get_full_text_from_html(html_text): text = text.replace(",", "_") return text - -""" -This method is a wrapped for the get_html_by_celex_id method imported from eurlex. -Sometimes thew websites do not load because of too many connections at once, -this method waits a bit and tries again for up to 5 tries. -""" - - def get_html_text_by_celex_id(id): + """ + This method is a wrapped for the get_html_by_celex_id method imported from eurlex. + Sometimes thew websites do not load because of too many connections at once, + this method waits a bit and tries again for up to 5 tries. + """ link = "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:cIdHere&from=EN" final = id if id == id: # nan check @@ -321,14 +278,12 @@ def get_html_text_by_celex_id(id): else: return html.text - -""" -This method gets the page containing all document details for extracting the subject matter and -the case law directory codes. Uses the celex identifier of a case. -""" - - def get_entire_page(celex): + """ + This method gets the page containing all document details for extracting + the subject matter and + the case law directory codes. Uses the celex identifier of a case. 
+ """ link = 'https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:cIdHere' if celex == celex: # nan check if ";" in celex: @@ -353,14 +308,11 @@ def get_entire_page(celex): except Exception: return "No data available" - -""" -This Method gets the subject matter from a fragment of code containing them. -Used for extracting subject matter for cellar cases only. -""" - - def get_subject(text): + """ + This Method gets the subject matter from a fragment of code containing them. + Used for extracting subject matter for cellar cases only. + """ try: index_matter = text.index("Subject matter:") try: @@ -375,14 +327,11 @@ def get_subject(text): subject = "" return subject - -""" -This Method extracts all eurovocs, from a fragment containing them. -Used for extracting eurovoc for cellar cases. -""" - - def get_eurovoc(text): + """ + This Method extracts all eurovocs, from a fragment containing them. + Used for extracting eurovoc for cellar cases. + """ try: start = text.find("EUROVOC") try: @@ -408,14 +357,11 @@ def get_eurovoc(text): except Exception: return "" - -""" -Method for getting all of the case directory codes for each cellar case. -Extracts them from a string containing the eurlex website containing all document information. -""" - - def get_codes(text): + """ + Method for getting all of the case directory codes for each cellar case. + Extracts them from a string containing the eurlex website containing all document information. + """ try: index_codes = text.index("Case law directory code:") index_end = text.index("Miscellaneous information") @@ -431,11 +377,8 @@ def get_codes(text): index_start = indexes[x] getting_ending = extracting[index_start:] words_here = getting_ending.split() - for words in words_here: - if words is not words_here[0]: - if is_code(words): ending = getting_ending[2:].find(words) done = True @@ -444,14 +387,12 @@ def get_codes(text): code_text = getting_ending[:ending] else: code_text = getting_ending - codes_result.append(code_text.replace("\n", "")) code = ";".join(codes_result) except Exception: code = "" return code - def get_advocate_or_judge(text, phrase): """ :param text: full text of the info page of a case from eur-lex website @@ -475,7 +416,8 @@ def get_advocate_or_judge(text, phrase): def get_case_affecting(text): """ :param text: full text of the info page of a case from eur-lex website - :return: The celex id's of case affecting listed + entire string data with more information about the case affecting + :return: The celex id's of case affecting listed + entire string data with + more information about the case affecting """ phrase = 'Case affecting:' try: @@ -523,6 +465,5 @@ def get_citations_with_extra_info(text): data_list.append(fixed_line) else: return ";".join(data_list) - except: - return '' + return '' \ No newline at end of file diff --git a/cellar/cellar_extractor/fulltext_saving.py b/cellar/cellar_extractor/fulltext_saving.py index 8bb03c3..57f76b4 100644 --- a/cellar/cellar_extractor/fulltext_saving.py +++ b/cellar/cellar_extractor/fulltext_saving.py @@ -1,22 +1,22 @@ -import pandas as pd +import json import threading +import time +import pandas as pd from cellar_extractor.eurlex_scraping import * -import json from tqdm import tqdm -import time - -""" -This is the method executed by individual threads by the add_sections method. - -The big dataset is divided in parts, each thread gets its portion of work to do. 
-They add their portions of columns to corresponding lists, -after all the threads are done the individual parts are put together. -""" -def execute_sections_threads(celex, eclis, start, list_sum, list_key, list_full, list_codes, list_eurovoc, list_adv, - list_judge, list_affecting_id, list_affecting_str,list_citations_extra, progress_bar): - sum = pd.Series([], dtype='string') +def execute_sections_threads(celex, eclis, start, list_sum, list_key, list_full, + list_codes, list_eurovoc, list_adv, list_judge, + list_affecting_id, list_affecting_str, list_citations_extra, + progress_bar): + """ + This is the method executed by individual threads by the add_sections method. + The big dataset is divided in parts, each thread gets its portion of work to do. + They add their portions of columns to corresponding lists; + after all the threads are done the individual parts are put together. + """ + _sum = pd.Series([], dtype='string') key = pd.Series([], dtype='string') full = list() case_codes = pd.Series([], dtype='string') @@ -28,34 +28,34 @@ def execute_sections_threads(celex, eclis, start, list_sum, list_key, list_full, citations_extra = pd.Series([], dtype='string') for i in range(len(celex)): j = start + i - id = celex[j] + _id = celex[j] ecli = eclis[j] - html = get_html_text_by_celex_id(id) + html = get_html_text_by_celex_id(_id) if html != "404": text = get_full_text_from_html(html) json_text = { - 'celex': str(id), + 'celex': str(_id), 'ecli': ecli, 'text': text } full.append(json_text) else: json_text = { - 'celex': str(id), + 'celex': str(_id), 'ecli': ecli, 'text': "" } full.append(json_text) - summary = get_summary_html(id) + summary = get_summary_html(_id) if summary != "No summary available": - text = get_keywords_from_html(summary, id[0]) - text2 = get_summary_from_html(summary, id[0]) + text = get_keywords_from_html(summary, _id[0]) + text2 = get_summary_from_html(summary, _id[0]) key[j] = text - sum[j] = text2 + _sum[j] = text2 else: key[j] = "" - sum[j] = "" - entire_page = get_entire_page(id) + _sum[j] = "" + entire_page = get_entire_page(_id) text = get_full_text_from_html(entire_page) if entire_page != "No data available": code = get_codes(text) @@ -82,7 +82,7 @@ def execute_sections_threads(celex, eclis, start, list_sum, list_key, list_full, citations_extra[j] = citation_extra progress_bar.update(1) - list_sum.append(sum) + list_sum.append(_sum) list_key.append(key) list_full.append(full) list_codes.append(case_codes) @@ -93,51 +93,53 @@ def execute_sections_threads(celex, eclis, start, list_sum, list_key, list_full, list_affecting_str.append(affecting_str) list_citations_extra.append(citations_extra) -""" -This method adds the following sections to a pandas dataframe, as separate columns: - -Full Text -Case law directory codes -Keywords -Summary -Advocate General -Judge Rapporteur -Case affecting (CELEX ID) -Case affecting string (entire str with more info) - -Method is cellar-specific, scraping html from https://eur-lex.europa.eu/homepage.html. -It operates with multiple threads, using that feature is recommended as it speeds up the entire process. -""" +def add_sections(data, threads, json_filepath=None): + """ + This method adds the following sections to a pandas dataframe, as separate columns: + Full Text + Case law directory codes + Keywords + Summary + Advocate General + Judge Rapporteur + Case affecting (CELEX ID) + Case affecting string (entire str with more info) -def add_sections(data, threads, json_filepath=None): + Method is cellar-specific, scraping html from https://eur-lex.europa.eu/homepage.html. 
+ It operates with multiple threads, using that feature is recommended as it speeds up + the entire process. + """ celex = data.loc[:, 'CELEX IDENTIFIER'] eclis = data.loc[:, 'ECLI'] length = celex.size time.sleep(1) - bar = tqdm(total=length, colour="GREEN", miniters=int(length/100), position=0, leave=True, maxinterval=10000) + _bar = tqdm(total=length, colour="GREEN", miniters=int(length/100), + position=0, leave=True, maxinterval=10000) if length > threads: # to avoid getting problems with small files at_once_threads = int(length / threads) else: at_once_threads = length threads = [] - list_sum = list() - list_key = list() - list_full = list() - list_codes = list() - list_eurovoc = list() - list_adv = list() - list_judge = list() - list_affecting_id = list() - list_affecting_str = list() - list_citations_extra = list() + list_sum = [] + list_key = [] + list_full = [] + list_codes = [] + list_eurovoc = [] + list_adv = [] + list_judge = [] + list_affecting_id = [] + list_affecting_str = [] + list_citations_extra = [] for i in range(0, length, at_once_threads): curr_celex = celex[i:(i + at_once_threads)] curr_ecli = eclis[i:(i + at_once_threads)] t = threading.Thread(target=execute_sections_threads, args=( - curr_celex, curr_ecli, i, list_sum, list_key, list_full, list_codes, list_eurovoc, - list_adv, list_judge, list_affecting_id, list_affecting_str,list_citations_extra, bar)) + curr_celex, curr_ecli, i, list_sum, list_key, list_full, + list_codes, list_eurovoc, list_adv, list_judge, + list_affecting_id, list_affecting_str, + list_citations_extra, _bar)) threads.append(t) for t in threads: t.start() @@ -164,13 +166,10 @@ def add_sections(data, threads, json_filepath=None): json_file.extend(l) return json_file - -""" -Used for adding columns easier to a dataframe for add_sections(). -""" - - def add_column_frow_list(data, name, list): + """ + Used for adding columns easier to a dataframe for add_sections(). 
+ """ column = pd.Series([], dtype='string') for l in list: column = pd.concat([column,l]) diff --git a/cellar/cellar_extractor/json_to_csv.py b/cellar/cellar_extractor/json_to_csv.py index 2781ae2..7d757ee 100644 --- a/cellar/cellar_extractor/json_to_csv.py +++ b/cellar/cellar_extractor/json_to_csv.py @@ -1,41 +1,45 @@ import csv import re +import sys import warnings import logging +from io import StringIO from bs4 import BeautifulSoup -import sys import pandas as pd -from io import StringIO + warnings.filterwarnings("ignore") -X = ['WORK IS CREATED BY AGENT (AU)', 'CASE LAW COMMENTED BY AGENT', 'CASE LAW HAS A TYPE OF PROCEDURE', - 'LEGAL RESOURCE USES ORIGINALLY LANGUAGE', 'CASE LAW USES LANGUAGE OF PROCEDURE', - 'CASE LAW HAS A JUDICIAL PROCEDURE TYPE', 'WORK HAS RESOURCE TYPE', 'LEGAL RESOURCE BASED ON TREATY CONCEPT', +X = ['WORK IS CREATED BY AGENT (AU)', 'CASE LAW COMMENTED BY AGENT', + 'CASE LAW HAS A TYPE OF PROCEDURE', 'LEGAL RESOURCE USES ORIGINALLY LANGUAGE', + 'CASE LAW USES LANGUAGE OF PROCEDURE', 'CASE LAW HAS A JUDICIAL PROCEDURE TYPE', + 'WORK HAS RESOURCE TYPE', 'LEGAL RESOURCE BASED ON TREATY CONCEPT', 'CASE LAW ORIGINATES IN COUNTRY OR USES A ROLE QUALIFIER', 'CASE LAW ORIGINATES IN COUNTRY', - 'CASE LAW DELIVERED BY COURT FORMATION', 'LEGAL RESOURCE IS ABOUT SUBJECT MATTER', 'RELATED JOURNAL ARTICLE', - 'CASE LAW DELIVERED BY ADVOCATE GENERAL', 'CASE LAW DELIVERED BY JUDGE', 'ECLI', - 'CASE LAW INTERPRETS LEGAL RESOURCE', 'NATIONAL JUDGEMENT', 'DATE_CREATION_LEGACY', 'DATETIME NEGOTIATION', - 'SEQUENCE OF VALUES', 'DATE OF REQUEST FOR AN OPINION', 'CELEX IDENTIFIER', 'SECTOR IDENTIFIER', - 'NATURAL NUMBER (CELEX)', 'TYPE OF LEGAL RESOURCE', 'YEAR OF THE LEGAL RESOURCE', 'WORK CITES WORK. CI / CJ', - 'LEGACY DATE OF CREATION OF WORK', 'DATE OF DOCUMENT', 'IDENTIFIER OF DOCUMENT', 'WORK VERSION', + 'CASE LAW DELIVERED BY COURT FORMATION', 'LEGAL RESOURCE IS ABOUT SUBJECT MATTER', + 'RELATED JOURNAL ARTICLE', 'CASE LAW DELIVERED BY ADVOCATE GENERAL', + 'CASE LAW DELIVERED BY JUDGE', 'ECLI', 'CASE LAW INTERPRETS LEGAL RESOURCE', + 'NATIONAL JUDGEMENT', 'DATE_CREATION_LEGACY', 'DATETIME NEGOTIATION', + 'SEQUENCE OF VALUES', 'DATE OF REQUEST FOR AN OPINION', 'CELEX IDENTIFIER', + 'SECTOR IDENTIFIER', 'NATURAL NUMBER (CELEX)', 'TYPE OF LEGAL RESOURCE', + 'YEAR OF THE LEGAL RESOURCE', 'WORK CITES WORK. 
CI / CJ', 'LEGACY DATE OF CREATION OF WORK', + 'DATE OF DOCUMENT', 'IDENTIFIER OF DOCUMENT', 'WORK VERSION', 'LAST CMR MODIFICATION DATE', 'CASE LAW HAS CONCLUSIONS'] Y = ['LEGAL RESOURCE HAS TYPE OF ACT', 'WORK HAS RESOURCE TYPE', 'CASE LAW ORIGINATES IN COUNTRY', 'LEGAL RESOURCE IS ABOUT SUBJECT MATTER', 'ECLI', 'REFERENCE TO PROVISIONS OF NATIONAL LAW', - 'PUBLICATION REFERENCE OF COURT DECISION', 'CELEX IDENTIFIER', 'LOCAL IDENTIFIER', 'SECTOR IDENTIFIER', - 'TYPE OF LEGAL RESOURCE', 'YEAR OF THE LEGAL RESOURCE', 'WORK IS CREATED BY AGENT (AU)', - 'LEGACY DATE OF CREATION OF WORK', 'DATE OF DOCUMENT', 'IDENTIFIER OF DOCUMENT', 'WORK TITLE', 'CMR CREATION DATE', - 'LAST CMR MODIFICATION DATE', 'CASE LAW DELIVERED BY NATIONAL COURT', 'REFERENCE TO A EUROPEAN ACT IN FREE TEXT', + 'PUBLICATION REFERENCE OF COURT DECISION', 'CELEX IDENTIFIER', 'LOCAL IDENTIFIER', + 'SECTOR IDENTIFIER', 'TYPE OF LEGAL RESOURCE', 'YEAR OF THE LEGAL RESOURCE', + 'WORK IS CREATED BY AGENT (AU)', 'LEGACY DATE OF CREATION OF WORK', 'DATE OF DOCUMENT', + 'IDENTIFIER OF DOCUMENT', 'WORK TITLE', 'CMR CREATION DATE', + 'LAST CMR MODIFICATION DATE', 'CASE LAW DELIVERED BY NATIONAL COURT', + 'REFERENCE TO A EUROPEAN ACT IN FREE TEXT', 'CASE LAW BASED ON A LEGAL INSTRUMENT', 'PARTIES OF THE CASE LAW'] COLS = set(X + Y) COLS = sorted(COLS) -""" -Method used after the json to csv conversion, to save the file in the processed directory. -""" - - def create_csv(filepath, encoding="UTF8", data=None): + """ + Method used after the json to csv conversion, to save the file in the processed directory. + """ if data != "": csv_file = open(filepath, 'w', encoding=encoding) csv_writer = csv.writer(csv_file) @@ -43,14 +47,11 @@ def create_csv(filepath, encoding="UTF8", data=None): csv_writer.writerows(data) csv_file.close() - -""" -Method used to transform the json file received from cellar_extraction to a csv file. -Cellar specific, sets specific columns with names defined at the beginning of file as COLS. -""" - - def json_to_csv(json_data): + """ + Method used to transform the json file received from cellar_extraction to a csv file. + Cellar specific, sets specific columns with names defined at the beginning of file as COLS. + """ final_data = [] for data in json_data: ecli_data = json_data[data] diff --git a/cellar/cellar_extractor/nodes_and_edges.py b/cellar/cellar_extractor/nodes_and_edges.py index 087c798..9cc4b0f 100644 --- a/cellar/cellar_extractor/nodes_and_edges.py +++ b/cellar/cellar_extractor/nodes_and_edges.py @@ -1,7 +1,9 @@ import pandas as pd + def extract_containing_subject_matter(df,phrase): returner = df[df["LEGAL RESOURCE IS ABOUT SUBJECT MATTER"].str.contains(phrase, na=False)] return returner + def get_df_with_celexes(df,celexes): returner = df[df['CELEX IDENTIFIER'].isin(celexes)] return returner @@ -12,7 +14,7 @@ def get_edges_list(df,only_local): vals = extraction['citing'].tolist() nodes = set() edges = list() - for i in range(len(keys)): + for i, _ in enumerate(keys): k = keys[i] val = vals[i] if val != val: diff --git a/cellar/cellar_extractor/operative_extractions.py b/cellar/cellar_extractor/operative_extractions.py new file mode 100644 index 0000000..2414a1a --- /dev/null +++ b/cellar/cellar_extractor/operative_extractions.py @@ -0,0 +1,353 @@ +import csv +import json +import os +import requests +from bs4 import BeautifulSoup + +class Analyzer(): + """ + This class returns a list of the operative part for a given celex id. + Celex id is initialized through a constructor. 
+ """ + celex: str="" # declare celex as a string + # declare url as a string + url:str="" + def __init__(self, celex): + # Initialize Celex id as a constructor, passed when calling the class + self.celex = celex + self.url = f"https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX%3A{self.celex}&from=EN" + + def html_page_structure_one(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's. + This function scrapes/parse the operative part from a nested + table structure . The relevant text lies inside the coj-bold class of the span tag. + """ + website = requests.get(self.url, timeout=60).text + parser = BeautifulSoup(website, 'lxml') + div = parser.find_all('table') # Find all tables tag from the website + one = [] + for divs in div: + # Find each nested table within the table + table = divs.find('table') + if table is not None: + # Find all p under the nested table with the coj-normal class + p = table.find_all('p', class_="coj-normal") + for x in p: + # Span class of coj-bold under the p tag + span = x.find_all('span', class_="coj-bold") + for y in span: + if x is not None and y is not None: + # append text from span onto a list + one.append(y.text) + return one + + def html_page_structure_two(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's. + This function scrapes/parse the operative part from a paragraph + (p) structure . The relevant text lies inside the normal class of the p tag which + comes after the keyword operative of the previous span tag. + """ + website = requests.get(self.url, timeout=60).text + parser = BeautifulSoup(website, 'lxml') + p = parser.find_all('p') + two = [] + for para in p: + span = para.find('span') + if span is not None: + if "operative" in span.text.lower(): + normal = span.find_all_next('p', class_="normal") + for op in normal: + two.append(op.text) + return two + + def structure_three(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's. + This function scrapes/parse the operative part from a nested + table structure. The relevant text lies inside the coj-bold class of the span tag. + """ + website = requests.get(self.url, timeout=60).text + parser = BeautifulSoup(website, 'lxml') + table = parser.find_all('table') + three = [] + for tables in table: + interior = tables.find_all('table') + for interiors in interior: + if interiors is not None: + p = interiors.find_all('p', class_="coj-normal") + for x in p: + span = x.find_all('span', class_="coj-bold") + for y in span: + if x is not None and y is not None: + three.append(y.text) + return three + + def structure_four(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's. + This function scrapes/parse the operative part from a paragraph + (p) structure . The relevant text lies inside the p tag which comes after the + keyword operative of the previous span tag. 
+ """ + website = requests.get(self.url, timeout=60).text + parser = BeautifulSoup(website, 'lxml') + p = parser.find_all('p') + four = [] + for para in p: + span = para.find('span') + if span is not None: + if "operative" in span.text.lower(): + normal = span.find_all_next('table') + for op in normal: + tbody = op.find('tbody') + new_p = tbody.find_all('p', class_="oj-normal") + for subsequent in new_p: + if subsequent is not None: + four.append(subsequent.text) + return four + + def structure_five(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's. + This function scrapes/parse the operative part from a paragraph + (p) structure. The relevant text lies inside the normal class of the p tag which + comes after the keyword operative of the previous span tag. + """ + website = requests.get(self.url, timeout=60).text + parser = BeautifulSoup(website, 'lxml') + p = parser.find_all('p') + five = [] + for para in p: + + span = para.find('span') + if span is not None: + if "operative" in span.text.lower(): + normal = span.find_all_next('table') + for op in normal: + tbody = op.find('tbody') + new_p = tbody.find_all('p', class_="normal") + for subsequent in new_p: + if subsequent is not None: + five.append(subsequent.text) + + return five + + def structure_six(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's. + This function scrapes/parse the operative part from a h2 (header) structure. + The relevant text lies inside thee p tag which comes after the keyword operative + part of the respective h2 tag. + """ + website = requests.get(self.url, timeout=60).text + parser = BeautifulSoup(website, 'lxml') + div = parser.find_all('h2') + six = [] + for h2 in div: + # print(h2.text) + if h2.text == "Operative part": + operatives = h2.find_all_next('p') + for operative in operatives: + + six.append(operative.text) + return six + + def structure_seven(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's. + This function scrapes/parse the operative part from a table + (table) structure. The relevant text lies inside the span tag which comes after + the p tag , with the class name=normal. + """ + website = requests.get(self.url, timeout=60).text + parser = BeautifulSoup(website, 'lxml') + div = parser.find_all('table') + seven = [] + for divs in div: + # find tbody within the table + table = divs.find_all('tbody') + for tables in table: + if tables is not None: + # find tr within the tbody + p = tables.find_all('tr') + for x in p: + if x is not None: + # find td within the tr + td = x.find_all('td') + for y in td: + if y is not None: + p = y.find_all('p', class_="normal") + for _all in p: + if _all is not None: + # find operative part within the span + span = _all.find_all( + 'span', class_="bold") + for spans in span: + # Append it into a list and return the + # list when the function is called + seven.append(spans.text) + return seven + + def structure_eight(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's. + The text is extracted from the span tag nested inside + the tbody tag.Returns a list as output. 
+ """ + website = requests.get(self.url, timeout=60).text + parser = BeautifulSoup(website, 'lxml') + + tbody = parser.find_all('tbody') + eight = [] + for _all in tbody: + if _all is not None: + tr = _all.find_all('tr') + for trs in tr: + if trs is not None: + p = parser.find_all('p', class_="normal") + for paras in p: + if paras is not None: + if "on those grounds" in paras.text.lower(): + span = paras.find_all_next( + 'span', class_="bold") + for spans in span: + if spans is not None: + eight.append(spans.text) + return eight + + def structure_nine(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's. + The operative part is under the bold(b) + tag after the p tag where the keywords "on those grounds" exist. + """ + website = requests.get(self.url, timeout=60).text + parser = BeautifulSoup(website, 'lxml') + nine = [] + div = parser.find_all('p') + for divs in div: + if divs is not None: + if "on those grounds" in divs.text.lower(): + b = divs.find_all_next('b') + for bolds in b: + nine.append(bolds.text) + return nine + + def structure_eleven(self) -> list: + """ + This function retreives operative part from documents of the respected celex id's. + The operative part is under the paragraph(p) + tag after the b tag where the keywords "operative part" exist. + """ + website = requests.get(self.url, timeout=60).text + parser = BeautifulSoup(website, 'lxml') + bold = parser.find_all('b') + + eleven = [] + + for b in bold: + if b is not None: + if "operative part" in b.text.lower(): + tables = b.find_all_next('p') + for table in tables: + if table is not None: + eleven.append(table.text) + return eleven + + def structure_ten(self): + """ + This function retreives operative part from documents of the respected celex id's. + Since the ocntent is preloaded using js/clients + server side functions , the text from the current page is retrieved and the + operative part is scraped after the occurence of the phrase + "On those grounds". + """ + website = requests.get(self.url, timeout=60).text + parser = BeautifulSoup(website, 'lxml') + appender = [] + for string in parser.stripped_strings: + + appender.append(string) + + found = False + after_grounds = [] + + for x in appender: + + if "on those grounds" in x.lower(): + found = True + + if found: + if len(x.split(" ")) > 3: + after_grounds.append(x) + return after_grounds + + def __call__(self) -> list: + """ + This inbuilt __call__ function loops through all the methods in the class + `Analyzer` and returns the list , with values of the operative part . 
+ """ + + container = [self.html_page_structure_one(), self.html_page_structure_two(), + self.structure_three(), self.structure_four(), self.structure_five(), + self.structure_six(), self.structure_seven(), self.structure_eight(), + self.structure_nine(), self.structure_ten(), self.structure_eleven()] + + one: list + for funcs in range(len(container)): + one = container[funcs] + if one: + if (len(one) != 0 or one[0] != "\n"): + + return one + +class Writing(): + """ + This class has different methods, for the purpose of writing the operative part + into different file formats.(Csv,txt,json) + """ + instance: str + x: str + parameter: str + + current_dir = os.getcwd() + + txt_dir = os.path.join(current_dir, "txt") + csv_dir = os.path.join(current_dir, "csv") + json_dir = os.path.join(current_dir, "json") + + if not os.path.exists(txt_dir): + os.makedirs(txt_dir) + if not os.path.exists(csv_dir): + os.makedirs(csv_dir) + + if not os.path.exists(json_dir): + os.makedirs(json_dir) + + def __init__(self, celex: str): + self.celex = celex + self.instance = Analyzer(self.celex) + self.x = self.instance() + + + def to_csv(self): + _file = open("csv/output.csv", "a+", encoding="utf-8") + writer = csv.writer(_file) + if self.x is not None: + writer.writerow([self.celex, self.x]) + + def to_json(self): + if self.x is not None: + data = {'Celex': self.celex, "Operative part": self.x} + _file = open('json/data.json', 'a+', encoding='utf-8') + json.dump(data, _file) + _file.close() + + def to_txt(self): + if self.x is not None: + _file = open(f"txt/{self.celex}.txt", "a", encoding="utf-8") + for w in self.x: + _file.write(w+"\n") + _file.close() diff --git a/cellar/cellar_extractor/sparql.py b/cellar/cellar_extractor/sparql.py index de989ce..c41a49c 100644 --- a/cellar/cellar_extractor/sparql.py +++ b/cellar/cellar_extractor/sparql.py @@ -1,5 +1,6 @@ from SPARQLWrapper import SPARQLWrapper, JSON, CSV, POST import requests + def run_eurlex_webservice_query(query_input,username,password): target = "https://eur-lex.europa.eu/EURLexWebService?wsdl" query = ''' @@ -22,15 +23,12 @@ def run_eurlex_webservice_query(query_input,username,password): ''' % (username, password,query_input) return requests.request("POST", target, data=query, allow_redirects=True) - -""" -Method acquired from a different law and tech project for getting the citations of a source_celex. -Unlike get_citations_csv, only works for one source celex at once. Returns a set containing all the works cited by -the source celex. -""" - def get_citations(source_celex, cites_depth=1, cited_depth=1): """ + Method acquired from a different law and tech project for getting the citations of a + source_celex. + Unlike get_citations_csv, only works for one source celex at once. Returns a set + containing all the works cited by the source celex. Gets all the citations one to X steps away. Hops can be specified as either the source document citing another (defined by `cites_depth`) or another document citing it (`cited_depth`). Any numbers higher than 1 denote that new source document @@ -69,18 +67,16 @@ def get_citations(source_celex, cites_depth=1, cited_depth=1): for bind in ret['results']['bindings']: target = bind['name2']['value'] targets.add(target) - targets = set([el for el in list(targets)]) # Filters the list. Filter type: '3'=legislation, '6'=case law. - + # Filters the list. Filter type: '3'=legislation, '6'=case law. 
+ targets = set([el for el in list(targets)]) return targets - -""" -Method sending a query to the endpoint, which asks for cited works for each celex. -The celex variable in the method is a list of all the celex identifiers of the cases we need the citations of. -The query returns a csv, containing all of the data needed.""" - - def get_citations_csv(celex): + """ + Method sending a query to the endpoint, which asks for cited works for each celex. + The celex variable in the method is a list of all the celex identifiers of the + cases we need the citations of. + The query returns a csv, containing all of the data needed.""" endpoint = 'https://publications.europa.eu/webapi/rdf/sparql' input_celex = '", "'.join(celex) query = ''' diff --git a/cellar/setup.py b/cellar/setup.py index eec4dce..50bab1b 100644 --- a/cellar/setup.py +++ b/cellar/setup.py @@ -9,7 +9,7 @@ setup( name='cellar_extractor', - packages=find_packages(include=['cellar_extractor']), + packages=find_packages(include=['cellar_extractor', 'cellar_extractor.operative_extractions']), version='1.0.61', description='Library for extracting cellar data', author='LawTech Lab', @@ -23,4 +23,4 @@ "Bug Tracker": "https://github.com/maastrichtlawtech/extraction_libraries", "Build Source": "https://github.com/maastrichtlawtech/extraction_libraries", }, -) \ No newline at end of file +) diff --git a/tests.py b/tests.py index 4732cdb..cea29f8 100644 --- a/tests.py +++ b/tests.py @@ -1,5 +1,7 @@ +import random from cellar_extractor import * + def cellar_csv_n(): get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=100) @@ -70,3 +72,80 @@ def test_cellar_json_n(): assert True except Exception: assert False, "Downloading cellar as json failed." + +def operative_part_csv(celex): + csv_store=Writing(celex) + try: + csv_store.to_csv() + assert True + except Exception: + assert False, "Downloading and storing as csv failed for operative part" + +def operative_part_json(celex): + json_store=Writing(celex) + try: + json_store.to_json() + assert True + except Exception: + assert False, "Downloading and storing as json failed for operative part" + +def operative_part_txt(celex): + txt_store=Writing(celex) + try: + txt_store.to_txt() + assert True + except Exception: + assert False, "Downloading and storing as txt failed for operative part" + +def for_operative_part(celex): + try: + test_output=Analyzer(celex) + test_output() + assert True + except Exception: + assert False, "Cannot extract for celex" + +def test_operative_part_txt(): + celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] + celex:str + choice=random.randint(0,len(celex_store)-1) + celex=celex_store[choice] + try: + operative_part_txt(celex) + assert True + except Exception: + assert False, "Cannot extract operative text" + +def test_operative_part_json(): + celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] + celex:str + choice=random.randint(0,len(celex_store)-1) + celex=celex_store[choice] + try: + operative_part_json(celex) + assert True + except Exception: + assert False, "Cannot extract operative text" + +def test_operative_part_csv(): + celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] + celex:str + choice=random.randint(0,len(celex_store)-1) + celex=celex_store[choice] + try: + operative_part_csv(celex) + assert True + except Exception: + assert False, "Cannot extract operative 
text" + +def test_for_operative_part(): + celex_store=["61983CJ0207","61988CJ0360","62005CJ0168","62008CJ0484","62010CJ0014","62005CJ0343","62000CJ0154"] + celex:str + choice=random.randint(0,len(celex_store)-1) + celex=celex_store[choice] + try: + for_operative_part(celex) + assert True + except Exception: + assert False, "Cannot extract operative part" +