Merge pull request #161 from trigaten/Experiments-Update

fix: update experiments
trigaten · May 29, 2024 · 07bd0f6
2 parents f74460a + 84fb279
commit 07bd0f6
Show file tree

Hide file tree

Showing 6 changed files with 60 additions and 40 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,4 +3,5 @@ repos:
     rev: 23.10.1
     hooks:
     -   id: black
-        exclude: src/prompt_systematic_review/experiments/find_internal_reference_count.py
+        exclude: src/prompt_systematic_review/experiments/find_internal_reference_count.py
+        exclude: src/prompt_systematic_review/experiments/graph_internal_references.py
diff --git a/README.md b/README.md
@@ -13,6 +13,7 @@ For HF: https://huggingface.co/docs/hub/security-tokens, also run `huggingface-c
 Put your key in like:
 
 `OPENAI_API_KEY=sk-...`
+`SEMANTIC_SCHOLAR_API_KEY=...`
 `HF_TOKEN=...`
 
 Then to load the .env file, type:
@@ -24,25 +25,21 @@ py.test --envfile path/to/.env
 In the case that you have several .env files, create a new env_files in the pytest config folder and type:
 
 env_files =
-    .env
-    .test.env
-    .deploy.env
-    
+.env
+.test.env
+.deploy.env
+
 ## blacklist.csv
 
 Papers we should not include due to being poorly written or AI generated
 
-
-
 ## Notes
 
 - Sometimes a paper title may appear differently on the arXiv API. For example, "Visual Attention-Prompted Prediction and Learning" (arXiv:2310.08420), according to arXiv API is titled "A visual encoding model based on deep neural networks and transfer learning"
 
 - When testing APIs, there may be latency and aborted connections
 
 - Publication dates of papers from IEEE are missing the day about half the time. They also may come in any of the following formats
-    - "April 1988"
-    - "2-4 April 2002"
-    - "29 Nov.-2 Dec. 2022"
-
-
+  - "April 1988"
+  - "2-4 April 2002"
+  - "29 Nov.-2 Dec. 2022"
diff --git a/src/prompt_systematic_review/experiments/__init__.py b/src/prompt_systematic_review/experiments/__init__.py
@@ -12,7 +12,9 @@
 from . import graph_gpt_3_5_benchmarks
 from . import run_tomotopy
 from . import topicgpt
-
+from . import download_mmlu
+from . import graph_internal_references
+from . import graph
 
 experiments = [
     count_tool_mentions.Experiment,
@@ -28,4 +30,7 @@
     graph_gpt_3_5_benchmarks.Experiment,
     run_tomotopy.Experiment,
     topicgpt.Experiment,
+    download_mmlu.Experiment,
+    graph_internal_references.Experiment,
+    graph.Experiment,
 ]
diff --git a/src/prompt_systematic_review/experiments/download_mmlu.py b/src/prompt_systematic_review/experiments/download_mmlu.py
@@ -30,26 +30,34 @@ def move_and_rename_extracted_contents(extracted_folder, final_folder, new_folde
     return mmlu_folder
 
 
-# URL of the .tar file
-url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar"
+def download_mmlu():
+    # URL of the .tar file
+    url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar"
 
-# Temporary paths
-download_path = "./data.tar"
-extract_path = "./extracted"
+    # Temporary paths
+    download_path = "./data.tar"
+    extract_path = "./extracted"
 
-# Final path
-final_data_folder = "./data"
-final_folder_name = "mmlu"
+    # Final path
+    final_data_folder = "./data"
+    final_folder_name = "mmlu"
 
-# Download and extract the file
-download_and_extract(url, download_path)
-extract_tar(download_path, extract_path)
+    # Download and extract the file
+    download_and_extract(url, download_path)
+    extract_tar(download_path, extract_path)
+
+    # Move and rename the contents of the extracted folder
+    move_and_rename_extracted_contents(
+        extract_path, final_data_folder, final_folder_name
+    )
+
+    # Cleanup
+    if os.path.exists(download_path):
+        os.remove(download_path)
+    if os.path.exists(extract_path):
+        shutil.rmtree(extract_path)
 
-# Move and rename the contents of the extracted folder
-move_and_rename_extracted_contents(extract_path, final_data_folder, final_folder_name)
 
-# Cleanup
-if os.path.exists(download_path):
-    os.remove(download_path)
-if os.path.exists(extract_path):
-    shutil.rmtree(extract_path)
+class Experiment:
+    def run():
+        download_mmlu()
diff --git a/src/prompt_systematic_review/experiments/graph.py b/src/prompt_systematic_review/experiments/graph.py
@@ -102,7 +102,7 @@ def run(self, csv_file_path, technique_to_title):
         )
 
 
-if __name__ == "__main__":
+def run_graph():
     main = Main()
     titles = [
         "Bounding the Capabilities of Large Language Models in Open Text Generation with Prompt Constraints",
@@ -208,3 +208,8 @@ def run(self, csv_file_path, technique_to_title):
 
     csv_file_path = "path_to_your_csv.csv"
     main.run(csv_file_path, technique_to_title)
+
+
+class Experiment:
+    def run():
+        run_graph()
diff --git a/src/prompt_systematic_review/experiments/graph_internal_references.py b/src/prompt_systematic_review/experiments/graph_internal_references.py
@@ -7,7 +7,6 @@
 from dotenv import load_dotenv
 import csv
 import random
-import scipy
 import networkx as nx
 import matplotlib.pyplot as plt
 import textwrap
@@ -188,19 +187,19 @@ def process_papers(self, csv_file_path):
                             arxiv_paper_id
                         )
                     else:
-                        unmatched_papers[
-                            row.get("title", "").strip()
-                        ] = "Source not supported"
+                        unmatched_papers[row.get("title", "").strip()] = (
+                            "Source not supported"
+                        )
                         continue
 
                     if paper_id:
                         references = self.semantic_scholar_api.get_references(paper_id)
                         if references is not None:
                             paper_references[paper_id] = references
                         else:
-                            unmatched_papers[
-                                row["title"]
-                            ] = "No references found or error occurred"
+                            unmatched_papers[row["title"]] = (
+                                "No references found or error occurred"
+                            )
                     else:
                         print(f"Paper Id Could not be found for: {row}")
         else:
@@ -428,7 +427,7 @@ def visualize_chart(self, technique_to_title):
         )
 
 
-if __name__ == "__main__":
+def graph_internal_references():
     main = Main()
 
     titles = [
@@ -533,3 +532,8 @@ def visualize_chart(self, technique_to_title):
         "Rephrase and Respond: Let Large Language Models Ask Better Questions for Themselves": "Rephrase and Respond",
     }
     main.visualize_chart(technique_to_title)
+
+
+class Experiment:
+    def run():
+        graph_internal_references()