Merge pull request #8 from maastrichtlawtech/extraction_operative_key…

…words Added new methods for extracting operative part for a given CELEX id.
maastrichtlawtech · May 1, 2024 · 1807938 · 1807938
2 parents 560fc4d + 6368651
commit 1807938
Show file tree

Hide file tree

Showing 19 changed files with 811 additions and 398 deletions.
diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml
@@ -17,7 +17,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install cellar-extractor
+          pip install -e cellar/
      # pip install echr-extractor
       - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner."
       - run: echo "🖥️ The workflow is now ready to test your code on the runner."

diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,4 @@
-venv
+.venv*
 .idea
 data
 rechtspraak/rechtspraak_extractor/tests/data
@@ -20,4 +20,6 @@ rechtspraak.zip
 build.bat
 echr_extractor-whl.zip
 echr_extractor-whl
-echr_extractor.egg-info
+echr_extractor.egg-info
+
+.*DS_Store
diff --git a/cellar/README.md b/cellar/README.md
@@ -37,6 +37,13 @@ Python 3.9
             <sub><b>gijsvd</b></sub>
         </a>
     </td>
+       <td align="center">
+        <a href="https://github.com/venvis">
+            <img src="https://avatars.githubusercontent.com/venvis" width="100;" alt="venvis"/>
+            <br />
+            <sub><b>venvis</b></sub>
+        </a>
+    </td>
 </tr>
 </table>
 <!-- readme: contributors,gijsvd -end -->
@@ -59,6 +66,16 @@ Python 3.9
     Allows the creation of a network graph of the citations. Can only be returned in-memory.
     <li><code>filter_subject_matter</code></li>
     Returns a dataframe of cases only containing a certain phrase in the column containing the subject of cases.
+    <li><code>Analyzer</code></li>
+    A class whose instance, when called, returns a list of all the text contained within the operative part of a judgement of the Court of Justice of the European Union (CJEU, formerly known as the European Court of Justice (ECJ)) (English only).
+    <li><code>Writing</code></li>
+    A class which writes the text of the operative part of a European case-law case (English only) into csv, json and txt files (generated upon initialization).<br>
+    The <code>Writing</code> class has three methods: <br><br>
+    <ul>
+        <li><code>to_csv()</code> - Writes the operative part along with celex id into a csv file</li>
+        <li><code>to_json()</code> - Writes the operative part along with celex id into a json file</li>
+        <li><code>to_txt()</code> - Writes the operative part along with celex id into a txt file</li>
+    </ul>
     <br>
 </ol>
 
@@ -115,11 +132,22 @@ Python 3.9
         <li><strong>phrase: string, required, default None</strong></li>
         The phrase which has to be present in the subject matter of cases. Case insensitive.
     </ul>
+     <li><code>Analyzer</code></li>
+    <ul>
+        <li><strong>celex id: str, required</strong></li>
+        <li>Pass as a constructor argument when initializing the class</li>
+    </ul>
+    <li><code>Writing</code></li>
+        <ul>
+        <li><strong>celex id: str, required</strong></li>
+            <li>Pass as a constructor argument when initializing the class</li>
+    </ul>
+
 </ol>
 
 
 ## Examples
-```
+```python
 import cellar_extractor as cell
 
 Below are examples for in-file saving:
@@ -132,7 +160,26 @@ Below are examples for in-memory saving:
 df = cell.get_cellar(save_file='n', file_format='csv', sd='2022-01-01', max_ecli=1000)
 df,json = cell.get_cellar_extra(save_file='n', max_ecli=100, sd='2022-01-01', threads=10)
 ```
+<p>Create an instance of the class with a celex id and call the instance; the call returns the operative part as a list.</p>
+
+```python
+import cellar_extractor as cell
+instance=cell.Analyzer(celex_id:str)
+output_list=instance()
+print(output_list) # prints operative part of the Case as a list
+```
+
 
+<p>The Writing class also takes a celex id through its constructor when the class is initialized, and writes the content of the operative part into different files, depending on the method called.</p>
+
+```python
+import cellar_extractor as cell
+instance=cell.Writing(celex_id:str)
+output=instance.to_csv()#for csv
+output=instance.to_txt()#for txt
+output=instance.to_json()#for json
+
+```
 
 ## License
 [![License: Apache 2.0](https://img.shields.io/github/license/maastrichtlawtech/extraction_libraries)](https://opensource.org/licenses/Apache-2.0)

diff --git a/cellar/__init__.py b/cellar/__init__.py
@@ -0,0 +1,2 @@
+from cellar_extractor import *
+
diff --git a/cellar/cellar_extractor/Testing_file.py b/cellar/cellar_extractor/Testing_file.py
diff --git a/cellar/cellar_extractor/__init__.py b/cellar/cellar_extractor/__init__.py
@@ -2,5 +2,7 @@
 from cellar_extractor.cellar import get_cellar_extra
 from cellar_extractor.cellar import get_nodes_and_edges_lists
 from cellar_extractor.cellar import filter_subject_matter
+from cellar_extractor.operative_extractions import Analyzer
+from cellar_extractor.operative_extractions import Writing
 import logging
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.INFO)
diff --git a/cellar/cellar_extractor/cellar.py b/cellar/cellar_extractor/cellar.py
@@ -11,7 +11,6 @@
 from cellar_extractor.json_to_csv import json_to_csv_main, json_to_csv_returning
 from cellar_extractor.nodes_and_edges import get_nodes_and_edges
 
-
 def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_format='csv'):
     if not ed:
         ed = datetime.now().isoformat(timespec='seconds')
@@ -40,7 +39,7 @@ def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_forma
             json_to_csv_main(all_eclis, file_path)
         else:
             file_path = os.path.join('data', file_name + '.json')
-            with open(file_path, "w") as f:
+            with open(file_path, "w", encoding="utf-8") as f:
                 json.dump(all_eclis, f)
     else:
         if file_format == 'csv':
@@ -51,7 +50,8 @@ def get_cellar(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", file_forma
     logging.info("\n--- DONE ---")
 
 
-def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", threads=10, username="", password=""):
+def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01",
+                     threads=10, username="", password=""):
     if not ed:
         ed = datetime.now().isoformat(timespec='seconds')
     data = get_cellar(ed=ed, save_file='n', max_ecli=max_ecli, sd=sd, file_format='csv')
@@ -64,15 +64,16 @@ def get_cellar_extra(ed=None, save_file='y', max_ecli=100, sd="2022-05-01", thre
     file_path = os.path.join('data', file_name + '.csv')
     if save_file == 'y':
         Path('data').mkdir(parents=True, exist_ok=True)
-        extra_cellar(data=data, filepath=file_path, threads=threads, username=username, password=password)
+        extra_cellar(data=data, filepath=file_path, threads=threads,
+                     username=username, password=password)
         logging.info("\n--- DONE ---")
 
     else:
-        data, json = extra_cellar(data=data, threads=threads, username=username, password=password)
+        data, json_data = extra_cellar(data=data, threads=threads,
+                                 username=username, password=password)
         logging.info("\n--- DONE ---")
 
-        return data, json
-
+        return data,json_data
 
 def get_nodes_and_edges_lists(df=None, only_local=False):
     if df is None:

diff --git a/cellar/cellar_extractor/cellar_extra_extract.py b/cellar/cellar_extractor/cellar_extra_extract.py
@@ -4,17 +4,44 @@
 
 
 def extra_cellar(data=None, filepath=None, threads=10, username="", password=""):
+    """
+    Extracts information from a cellar dataset.
+
+    Args:
+        data (pandas.DataFrame, optional): The input dataset. If not provided, 
+        it will be read from the specified filepath.
+        filepath (str, optional): The path to the dataset CSV file. It is read
+        as the input when `data` is None, and the modified dataset is saved to it.
+        threads (int, optional): The number of threads to use for parallel 
+        processing. Default is 10.
+        username (str, optional): The username for accessing a separate 
+        webservice. Default is an empty string.
+        password (str, optional): The password for accessing a separate 
+        webservice. Default is an empty string.
+
+    Returns:
+        tuple or None: A tuple containing the modified dataset and a JSON
+        object when `filepath` is not provided; otherwise nothing is returned.
+
+    If `data` is not provided, the dataset will be read from the specified
+    `filepath`. 
+
+    If `username` and `password` are provided, the function will add
+    citations using a separate webservice.
+
+    The function will add sections to the dataset using the specified 
+    number of `threads`. If `filepath` is provided,
+    the modified dataset will be saved to the same file. Otherwise, the 
+    modified dataset and a JSON object will be returned.
+    """
     if data is None:
         data = read_csv(filepath)
     if filepath:
         if username !="" and password !="":
             add_citations_separate_webservice(data, username, password)
-            #print("Citations successfully added. The rest of additional extraction will now happen.")
         add_sections(data, threads, filepath.replace(".csv", "_fulltext.json"))
         data.to_csv(filepath, index=False)
     else:
         if username != "" and password != "":
             add_citations_separate_webservice(data, username, password)
-            #print("Citations successfully added. The rest of additional extraction will now happen.")
         json = add_sections(data, threads)
         return data, json
diff --git a/cellar/cellar_extractor/cellar_queries.py b/cellar/cellar_extractor/cellar_queries.py
@@ -48,18 +48,23 @@ def get_all_eclis(starting_date=None, ending_date=None):
     return eclis
 
 
-def get_raw_cellar_metadata(eclis, get_labels=True, force_readable_cols=True, force_readable_vals=False):
+def get_raw_cellar_metadata(eclis, get_labels=True, force_readable_cols=True,
+                            force_readable_vals=False):
     """Gets cellar metadata
 
     :param eclis: The ECLIs for which to retrieve metadata
     :type eclis: list[str]
-    :param get_labels: Flag to get human-readable labels for the properties, defaults to True
+    :param get_labels: Flag to get human-readable labels for the properties, 
+    defaults to True
     :type get_labels: bool, optional
-    :param force_readable_cols: Flag to remove any non-labelled properties from the resulting dict, defaults to True
+    :param force_readable_cols: Flag to remove any non-labelled properties 
+    from the resulting dict, defaults to True
     :type force_readable_cols: bool, optional
-    :param force_readable_vals: Flag to remove any non-labelled values from the resulting dict, defaults to False
+    :param force_readable_vals: Flag to remove any non-labelled values from 
+    the resulting dict, defaults to False
     :type force_readable_vals: bool, optional
-    :return: Dictionary containing metadata. Top-level keys are ECLIs, second level are property names
+    :return: Dictionary containing metadata. Top-level keys are ECLIs, second 
+    level are property names
     :rtype: Dict[str, Dict[str, list[str]]]
     """
 
@@ -100,8 +105,8 @@ def get_raw_cellar_metadata(eclis, get_labels=True, force_readable_cols=True, fo
     for ecli in eclis:
         metadata[ecli] = {}
 
-    # Take each triple, check which source doc it belongs to, key/value pair into its dict derived from the p and o in
-    # the query
+    # Take each triple, check which source doc it belongs to, key/value pair
+    # into its dict derived from the p and o in the query
     for res in ret['results']['bindings']:
         ecli = res['ecli']['value']
         # We only want cdm predicates
@@ -125,8 +130,9 @@ def get_raw_cellar_metadata(eclis, get_labels=True, force_readable_cols=True, fo
         else:
             val = res['o']['value']
 
-        # We store the values for each property in a list. For some properties this is not necessary,
-        # but if a property can be assigned multiple times, this is important. Notable, for example is citations.b
+        # We store the values for each property in a list. For some properties
+        # this is not necessary, but if a property can be assigned multiple
+        # times, this is important. A notable example is citations.
         if key in metadata[ecli]:
             metadata[ecli][key].append(val)
         else: