
feat: fixed data going to non-existent folder. #155


Merged: 7 commits, May 1, 2024
25 changes: 15 additions & 10 deletions src/prompt_systematic_review/experiments/benchmarking.py
@@ -173,7 +173,7 @@ def evaluate_prompts(
config_name: str,
split: str,
model_name: str,
examples: None or int = 1,
examples: int = 1 or None,
start_index: int = 0,
log_interval: int = 25,
max_tokens: int = 5000,
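
Neither the old annotation (`examples: None or int = 1`) nor the new one (`examples: int = 1 or None`) expresses an int-or-None union: `None or int` evaluates to plain `int`, and `1 or None` evaluates to the default `1`. A minimal sketch of the conventional spelling, assuming the parameter is meant to accept an int or `None` (the helper name and trimmed signature are illustrative only, not from the repo):

from typing import Optional

# Illustration only, not part of the diff. `Optional[int]` (or `int | None`
# on Python 3.10+) is the standard way to say "an int or None".
def evaluate_prompts_sketch(
    examples: Optional[int] = 1,
    start_index: int = 0,
    log_interval: int = 25,
    max_tokens: int = 5000,
) -> None:
    ...
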
@@ -310,12 +310,14 @@ def evaluate_prompts(
{
"prompt_name": prompt.name,
"prompt": chosen_prompt,
"question": "Question: "
+ multiple_choice_question
+ "\n\n"
+ multiple_choice_question
if reread
else multiple_choice_question,
"question": (
"Question: "
+ multiple_choice_question
+ "\n\n"
+ multiple_choice_question
if reread
else multiple_choice_question
),
"correct_answer": correct_answer,
"response": response_dict,
"mark": eval_result,
@@ -423,7 +425,7 @@ def write_to_file(data, count, log_interval=25):
DataFolderPath,
"experiments_output"
+ os.sep
+ f"data/benchmarking/eval_results_{current_datetime}_part_{((count//log_interval))}.json",
+ f"eval_results_{current_datetime}_part_{((count//log_interval))}.json",
)

with open(file_path, "w") as json_file:
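
This hunk is presumably the fix the PR title describes: the old filename embedded `data/benchmarking/`, so the joined path pointed into a nested folder under `experiments_output`, which `open(..., "w")` does not create. A sketch of the difference, with `DataFolderPath` and the counters set to placeholder values (the real ones come from the surrounding code):

import os
from datetime import datetime

# Illustration only, not part of the diff; placeholder values throughout.
DataFolderPath = "data"
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
count, log_interval = 50, 25

old_path = os.path.join(
    DataFolderPath,
    "experiments_output"
    + os.sep
    + f"data/benchmarking/eval_results_{current_datetime}_part_{count // log_interval}.json",
)
new_path = os.path.join(
    DataFolderPath,
    "experiments_output"
    + os.sep
    + f"eval_results_{current_datetime}_part_{count // log_interval}.json",
)
# On POSIX: data/experiments_output/data/benchmarking/eval_results_..._part_2.json
# versus:   data/experiments_output/eval_results_..._part_2.json
print(old_path)
print(new_path)
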
@@ -553,7 +555,7 @@ def gen(self, category: str or None = None) -> str:
): # if an MMLU category is provided, category only provided for few-shot prompts
all_shots = {
"vanilla": {
1: { # few-shot prompts with format 1
"1": { # few-shot prompts with format 1
"STEM": [
"Problem \n\tA 0.217 g sample of HgO (molar mass = 217 g) reacts with excess iodide ions according to the reaction shown above. Titration of the resulting solution requires how many mL of 0.10 M HCl to reach equivalence point?\nOptions \n\t\n(A)::1.0 mL -- (B)::10 mL -- (C)::20 mL -- (D)::50 mL\n Answer\n\t(C)",
"Problem \n\tMany Web browsers allow users to open anonymous windows. During a browsing session in an anonymous window, the browser does not record a browsing history or a list of downloaded files. When the anonymous window is exited, cookies created during the session are deleted. Which of the following statements about browsing sessions in an anonymous window is true?\nOptions \n\t\n(A)::The activities of a user browsing in an anonymous window will not be visible to people who monitor the user's network, such as the system administrator. -- (B)::Items placed in a Web store's shopping cart for future purchase during the anonymous browsing session will not be saved on the user's computer. -- (C)::A user will not be able to log in to e-mail or social media accounts during the anonymous browsing session. -- (D)::A user browsing in an anonymous window will be protected from viruses launched from any web sites visited or files downloaded.\n Answer\n\t(B)",
@@ -583,7 +585,7 @@ def gen(self, category: str or None = None) -> str:
"Problem \n\tWhich of these songs was a Top 10 hit for the rock band The Police?\nOptions \n\t\n(A)::'Radio Ga-Ga' -- (B)::'Ob-la-di Ob-la-da' -- (C)::'De Do Do Do De Da Da Da' -- (D)::'In-a-Gadda-Da-Vida'\n Answer\n\t(C)",
],
},
2: { # same few-shot prompts with format 2
"2": { # same few-shot prompts with format 2
"STEM": [
"PROBLEM::A 0.217 g sample of HgO (molar mass = 217 g) reacts with excess iodide ions according to the reaction shown above. Titration of the resulting solution requires how many mL of 0.10 M HCl to reach equivalence point?, OPTIONS:: \n(A): 1.0 mL\n(B): 10 mL\n(C): 20 mL\n(D): 50 mL, ANSWER::(C)",
"PROBLEM::Many Web browsers allow users to open anonymous windows. During a browsing session in an anonymous window, the browser does not record a browsing history or a list of downloaded files. When the anonymous window is exited, cookies created during the session are deleted. Which of the following statements about browsing sessions in an anonymous window is true?, OPTIONS:: \n(A): The activities of a user browsing in an anonymous window will not be visible to people who monitor the user's network, such as the system administrator.\n(B): Items placed in a Web store's shopping cart for future purchase during the anonymous browsing session will not be saved on the user's computer.\n(C): A user will not be able to log in to e-mail or social media accounts during the anonymous browsing session.\n(D): A user browsing in an anonymous window will be protected from viruses launched from any web sites visited or files downloaded., ANSWER::(B)",
@@ -677,6 +679,9 @@ def gen(self, category: str or None = None) -> str:
},
},
}
# print("Format number: ", self.format_num)
# print("Category: ", mmlu_split[category])
# print("CoT: ", self.CoT)
shots = all_shots["CoT" if self.CoT else "vanilla"][str(self.format_num)][
mmlu_split[category]
] # get the specific few-shot prompt set for the MMLU category group of the question
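
The format keys change from integers (`1:`, `2:`) to strings (`"1":`, `"2":`) because the lookup at the end of this hunk indexes with `str(self.format_num)`; integer keys would raise a `KeyError`. A stripped-down sketch of that lookup:

# Illustration only, not part of the diff: prompt text elided and the
# self.* attributes shortened to plain names.
all_shots = {
    "vanilla": {
        "1": {"STEM": ["<few-shot prompt, format 1>"]},
        "2": {"STEM": ["<few-shot prompt, format 2>"]},
    },
}
CoT, format_num, category_group = False, 1, "STEM"

shots = all_shots["CoT" if CoT else "vanilla"][str(format_num)][category_group]
# With the old integer keys, str(format_num) == "1" would not match the key 1,
# and this line would raise KeyError.
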
18 changes: 9 additions & 9 deletions src/prompt_systematic_review/experiments/eval_prompts.py
@@ -122,7 +122,7 @@


prompts_to_test = [
# zero_shot_baseline1_format1,
zero_shot_baseline1_format1,
# zero_shot_baseline1_format2,
# zero_shot_baseline2_format1,
# zero_shot_baseline2_format2,
@@ -131,19 +131,19 @@
# few_shot_baseline1_format1,
# few_shot_baseline1_format2
# zero_shot_CoT
# zero_shot_CoT1_format1,
zero_shot_CoT1_format1,
# zero_shot_CoT1_format2,
# zero_shot_CoT2_format1,
# zero_shot_CoT2_format2,
# zero_shot_CoT3_format1,
# zero_shot_CoT3_format2,
# few_shot_baseline1_format1,
few_shot_baseline1_format1,
# few_shot_baseline1_format2,
# few_shot_baseline2_format1,
# few_shot_baseline2_format2,
# few_shot_baseline3_format1,
# few_shot_baseline3_format2,
# few_shot_CoT1_format1,
few_shot_CoT1_format1,
# few_shot_CoT1_format2,
# few_shot_CoT2_format1,
# few_shot_CoT2_format2,
@@ -154,16 +154,16 @@
dataset = "mmlu" # mmlu or gsm8k
config_name = None # main if gsm8k, None if mmlu
split = "test"
# model = "gpt-4-1106-preview"
model = "gpt-3.5-turbo"
examples = 2800 # number of examples to test
model = "gpt-4-1106-preview"
# model = "gpt-3.5-turbo"
examples = 50 # number of examples to test
start = 0 # start index for dataset
log_interval = 200 # log interval for creating JSONs of results by query
max_toks = 700 # max tokens for query
rereading = False # if true, will "reread" the question to the LM at query time
return_json = False
SEED = 42
temp = 0.5
temp = 0.0


def eval_prompts():
@@ -187,7 +187,7 @@ def eval_prompts():
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# File path for the JSON file
d_path = f"data/benchmarking/eval_results_{current_datetime}.json"
d_path = f"eval_results_{current_datetime}.json"

file_path = os.path.join(
DataFolderPath,
6 changes: 3 additions & 3 deletions src/prompt_systematic_review/get_papers/paperSource.py
@@ -56,9 +56,9 @@ def to_dict(self):
"title": self.title,
"authors": self.authors,
"url": self.url,
"dateSubmitted": self.dateSubmitted.isoformat()
if self.dateSubmitted
else None,
"dateSubmitted": (
self.dateSubmitted.isoformat() if self.dateSubmitted else None
),
"keyWords": self.keywords,
"abstract": self.abstract,
"paperId": self.paperId,
@@ -50,12 +50,14 @@ def getPapers(self, count: int, key_words: List[str]) -> List[Paper]:
)
paper = Paper(
title=paper_data["title"],
authors=[
author["name"]
for author in paper_data.get("authors", [])
]
if paper_data["authors"]
else "",
authors=(
[
author["name"]
for author in paper_data.get("authors", [])
]
if paper_data["authors"]
else ""
),
url=open_access_pdf_url,
dateSubmitted=publication_date,
keyWords=None,
106 changes: 0 additions & 106 deletions tests/test_benchmarking.py
@@ -17,97 +17,6 @@
import json


@pytest.fixture(scope="module")
def api_key():
load_dotenv(dotenv_path="./.env") # Adjust the path as needed
return os.getenv("OPENAI_API_KEY")


@pytest.mark.API_test
def test_json_output(api_key):
openai.api_key = api_key
prompts = [
"You are a brilliant math professor. Solve the following problem and put your answer after four hashtags like the following example: \nQuestion: What is 4 + 4?\nAnswer: 4 + 4 is ####8\n\n Make your response as short as possible.",
"You are a foolish high-school student. Solve the following problem and put your answer after four hashtags like the following example: \nQuestion: What is 4 + 4?\nAnswer: 4 + 4 is ####8\n\n Make your response as short as possible.",
]

model = "gpt-3.5-turbo-1106"

response = query_model(
prompts[0],
"What is 4 + 4?",
model_name=model,
output_tokens=150,
)

response_dict = response_to_dict(response)

# Check the main keys
assert "id" in response_dict
assert "model" in response_dict
assert "object" in response_dict
assert "created" in response_dict
assert "system_fingerprint" in response_dict
assert "choices" in response_dict
assert "usage" in response_dict

# Check the types of the main keys
assert isinstance(response_dict["id"], str)
assert isinstance(response_dict["model"], str)
assert isinstance(response_dict["object"], str)
assert isinstance(response_dict["created"], int)
assert isinstance(response_dict["system_fingerprint"], str)
assert isinstance(response_dict["choices"], list)
assert isinstance(response_dict["usage"], dict)

# Check the structure and types of the 'choices' key
assert len(response_dict["choices"]) > 0
for choice in response_dict["choices"]:
assert "finish_reason" in choice
assert "index" in choice
assert "message" in choice
assert isinstance(choice["finish_reason"], str)
assert isinstance(choice["index"], int)
assert isinstance(choice["message"], dict)
assert "content" in choice["message"]
assert "role" in choice["message"]
assert isinstance(choice["message"]["content"], str)
assert isinstance(choice["message"]["role"], str)

# Check the structure and types of the 'usage' key
assert "completion_tokens" in response_dict["usage"]
assert "prompt_tokens" in response_dict["usage"]
assert "total_tokens" in response_dict["usage"]
assert isinstance(response_dict["usage"]["completion_tokens"], int)
assert isinstance(response_dict["usage"]["prompt_tokens"], int)
assert isinstance(response_dict["usage"]["total_tokens"], int)


@pytest.mark.API_test
def test_query_model(api_key):
openai.api_key = api_key
prompt = "You are a brilliant math professor. Solve the following problem and put your answer after four hashtags like the following example: \nQuestion: What is 4 + 4?\nAnswer: 4 + 4 is ####8\n\n Make your response as short as possible."
question = "What is 4 + 4?"
model_name = "gpt-3.5-turbo-1106"
output_tokens = 150
response = query_model(prompt, question, model_name, output_tokens)
assert isinstance(response.choices[0].message.content, str)
assert len(response.choices[0].message.content) > 0
assert "8" in response.choices[0].message.content

prompt = 'You are a brilliant math professor. Solve the following problem and return a JSON with the first entry being the reasoning behind the choice labeled as "reasoning", and the second entry being the answer to the question containing only the letter "A", "B", "C" or "D", labeled as "answer". Try to keep your reasoning concise.'
question = "What is 4 + 4? A. 8 B. 9 C. 10 D. 11"
model_name = "gpt-3.5-turbo-1106"
output_tokens = 150
json_mode = True
response = query_model(
prompt, question, model_name, output_tokens, return_json=json_mode
)
json_response = json.loads(response.choices[0].message.content)
assert isinstance(json_response, dict)
assert json_response["answer"] == "A"


def test_with_commas_and_dollar_sign():
assert extract_numbers("####$1,000") == [1000]
assert extract_numbers("####$1,000.00") == [1000]
Expand Down Expand Up @@ -148,21 +57,6 @@ def test_multiple_numbers():
assert extract_numbers("####1000 ####2000") == [1000, 2000]


# def test_load_mmlu():
# with open("data/mmlu_configs.json", "r") as file:
# mmlu_configs = json.load(file)["configs"]
# df = load_mmlu(mmlu_configs, "test")

# assert df.iloc[0]["input"] == "When was the telescope invented by Galileo?"
# assert df.iloc[0].A == "1409"
# assert df.iloc[0].B == "1509"
# assert df.iloc[0].C == "1609"
# assert df.iloc[0].D == "1709"
# assert df.iloc[0].answer == "C"
# assert df.iloc[0].config == "astronomy"
# assert len(df) == 13911


def test_modular_prompts():
with open("data/prompts.json", "r") as file:
prompts = json.load(file)