
feat: fixed data going to non-existent folder. #155


Merged: 7 commits, May 1, 2024
25 changes: 15 additions & 10 deletions src/prompt_systematic_review/experiments/benchmarking.py
@@ -173,7 +173,7 @@ def evaluate_prompts(
config_name: str,
split: str,
model_name: str,
examples: None or int = 1,
examples: int = 1 or None,
start_index: int = 0,
log_interval: int = 25,
max_tokens: int = 5000,
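
Neither the old annotation (`examples: None or int = 1`) nor the new one (`examples: int = 1 or None`) expresses an int-or-None union: `None or int` evaluates to plain `int`, and `1 or None` evaluates to the default `1`. A minimal sketch of the conventional spelling, assuming the parameter is meant to accept an int or `None` (the helper name and trimmed signature are illustrative only, not from the repo):

from typing import Optional

# Illustration only, not part of the diff. `Optional[int]` (or `int | None`
# on Python 3.10+) is the standard way to say "an int or None".
def evaluate_prompts_sketch(
    examples: Optional[int] = 1,
    start_index: int = 0,
    log_interval: int = 25,
    max_tokens: int = 5000,
) -> None:
    ...
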
@@ -310,12 +310,14 @@ def evaluate_prompts(
{
"prompt_name": prompt.name,
"prompt": chosen_prompt,
"question": "Question: "
+ multiple_choice_question
+ "\n\n"
+ multiple_choice_question
if reread
else multiple_choice_question,
"question": (
"Question: "
+ multiple_choice_question
+ "\n\n"
+ multiple_choice_question
if reread
else multiple_choice_question
),
"correct_answer": correct_answer,
"response": response_dict,
"mark": eval_result,
@@ -423,7 +425,7 @@ def write_to_file(data, count, log_interval=25):
DataFolderPath,
"experiments_output"
+ os.sep
+ f"data/benchmarking/eval_results_{current_datetime}_part_{((count//log_interval))}.json",
+ f"eval_results_{current_datetime}_part_{((count//log_interval))}.json",
)

with open(file_path, "w") as json_file:
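
This hunk is presumably the fix the PR title describes: the old filename embedded `data/benchmarking/`, so the joined path pointed into a nested folder under `experiments_output`, which `open(..., "w")` does not create. A sketch of the difference, with `DataFolderPath` and the counters set to placeholder values (the real ones come from the surrounding code):

import os
from datetime import datetime

# Illustration only, not part of the diff; placeholder values throughout.
DataFolderPath = "data"
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
count, log_interval = 50, 25

old_path = os.path.join(
    DataFolderPath,
    "experiments_output"
    + os.sep
    + f"data/benchmarking/eval_results_{current_datetime}_part_{count // log_interval}.json",
)
new_path = os.path.join(
    DataFolderPath,
    "experiments_output"
    + os.sep
    + f"eval_results_{current_datetime}_part_{count // log_interval}.json",
)
# On POSIX: data/experiments_output/data/benchmarking/eval_results_..._part_2.json
# versus:   data/experiments_output/eval_results_..._part_2.json
print(old_path)
print(new_path)
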
@@ -553,7 +555,7 @@ def gen(self, category: str or None = None) -> str:
): # if an MMLU category is provided, category only provided for few-shot prompts
all_shots = {
"vanilla": {
1: { # few-shot prompts with format 1
"1": { # few-shot prompts with format 1
"STEM": [
"Problem \n\tA 0.217 g sample of HgO (molar mass = 217 g) reacts with excess iodide ions according to the reaction shown above. Titration of the resulting solution requires how many mL of 0.10 M HCl to reach equivalence point?\nOptions \n\t\n(A)::1.0 mL -- (B)::10 mL -- (C)::20 mL -- (D)::50 mL\n Answer\n\t(C)",
"Problem \n\tMany Web browsers allow users to open anonymous windows. During a browsing session in an anonymous window, the browser does not record a browsing history or a list of downloaded files. When the anonymous window is exited, cookies created during the session are deleted. Which of the following statements about browsing sessions in an anonymous window is true?\nOptions \n\t\n(A)::The activities of a user browsing in an anonymous window will not be visible to people who monitor the user's network, such as the system administrator. -- (B)::Items placed in a Web store's shopping cart for future purchase during the anonymous browsing session will not be saved on the user's computer. -- (C)::A user will not be able to log in to e-mail or social media accounts during the anonymous browsing session. -- (D)::A user browsing in an anonymous window will be protected from viruses launched from any web sites visited or files downloaded.\n Answer\n\t(B)",
@@ -583,7 +585,7 @@ def gen(self, category: str or None = None) -> str:
"Problem \n\tWhich of these songs was a Top 10 hit for the rock band The Police?\nOptions \n\t\n(A)::'Radio Ga-Ga' -- (B)::'Ob-la-di Ob-la-da' -- (C)::'De Do Do Do De Da Da Da' -- (D)::'In-a-Gadda-Da-Vida'\n Answer\n\t(C)",
],
},
2: { # same few-shot prompts with format 2
"2": { # same few-shot prompts with format 2
"STEM": [
"PROBLEM::A 0.217 g sample of HgO (molar mass = 217 g) reacts with excess iodide ions according to the reaction shown above. Titration of the resulting solution requires how many mL of 0.10 M HCl to reach equivalence point?, OPTIONS:: \n(A): 1.0 mL\n(B): 10 mL\n(C): 20 mL\n(D): 50 mL, ANSWER::(C)",
"PROBLEM::Many Web browsers allow users to open anonymous windows. During a browsing session in an anonymous window, the browser does not record a browsing history or a list of downloaded files. When the anonymous window is exited, cookies created during the session are deleted. Which of the following statements about browsing sessions in an anonymous window is true?, OPTIONS:: \n(A): The activities of a user browsing in an anonymous window will not be visible to people who monitor the user's network, such as the system administrator.\n(B): Items placed in a Web store's shopping cart for future purchase during the anonymous browsing session will not be saved on the user's computer.\n(C): A user will not be able to log in to e-mail or social media accounts during the anonymous browsing session.\n(D): A user browsing in an anonymous window will be protected from viruses launched from any web sites visited or files downloaded., ANSWER::(B)",
@@ -677,6 +679,9 @@ def gen(self, category: str or None = None) -> str:
},
},
}
# print("Format number: ", self.format_num)
# print("Category: ", mmlu_split[category])
# print("CoT: ", self.CoT)
shots = all_shots["CoT" if self.CoT else "vanilla"][str(self.format_num)][
mmlu_split[category]
] # get the specific few-shot prompt set for the MMLU category group of the question
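
The format keys change from integers (`1:`, `2:`) to strings (`"1":`, `"2":`) because the lookup at the end of this hunk indexes with `str(self.format_num)`; integer keys would raise a `KeyError`. A stripped-down sketch of that lookup:

# Illustration only, not part of the diff: prompt text elided and the
# self.* attributes shortened to plain names.
all_shots = {
    "vanilla": {
        "1": {"STEM": ["<few-shot prompt, format 1>"]},
        "2": {"STEM": ["<few-shot prompt, format 2>"]},
    },
}
CoT, format_num, category_group = False, 1, "STEM"

shots = all_shots["CoT" if CoT else "vanilla"][str(format_num)][category_group]
# With the old integer keys, str(format_num) == "1" would not match the key 1,
# and this line would raise KeyError.
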
18 changes: 9 additions & 9 deletions src/prompt_systematic_review/experiments/eval_prompts.py
@@ -122,7 +122,7 @@


prompts_to_test = [
# zero_shot_baseline1_format1,
zero_shot_baseline1_format1,
# zero_shot_baseline1_format2,
# zero_shot_baseline2_format1,
# zero_shot_baseline2_format2,
@@ -131,19 +131,19 @@
# few_shot_baseline1_format1,
# few_shot_baseline1_format2
# zero_shot_CoT
# zero_shot_CoT1_format1,
zero_shot_CoT1_format1,
# zero_shot_CoT1_format2,
# zero_shot_CoT2_format1,
# zero_shot_CoT2_format2,
# zero_shot_CoT3_format1,
# zero_shot_CoT3_format2,
# few_shot_baseline1_format1,
few_shot_baseline1_format1,
# few_shot_baseline1_format2,
# few_shot_baseline2_format1,
# few_shot_baseline2_format2,
# few_shot_baseline3_format1,
# few_shot_baseline3_format2,
# few_shot_CoT1_format1,
few_shot_CoT1_format1,
# few_shot_CoT1_format2,
# few_shot_CoT2_format1,
# few_shot_CoT2_format2,
@@ -154,16 +154,16 @@
dataset = "mmlu" # mmlu or gsm8k
config_name = None # main if gsm8k, None if mmlu
split = "test"
# model = "gpt-4-1106-preview"
model = "gpt-3.5-turbo"
examples = 2800 # number of examples to test
model = "gpt-4-1106-preview"
# model = "gpt-3.5-turbo"
examples = 50 # number of examples to test
start = 0 # start index for dataset
log_interval = 200 # log interval for creating JSONs of results by query
max_toks = 700 # max tokens for query
rereading = False # if true, will "reread" the question to the LM at query time
return_json = False
SEED = 42
temp = 0.5
temp = 0.0


def eval_prompts():
@@ -187,7 +187,7 @@ def eval_prompts():
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# File path for the JSON file
d_path = f"data/benchmarking/eval_results_{current_datetime}.json"
d_path = f"eval_results_{current_datetime}.json"

file_path = os.path.join(
DataFolderPath,
6 changes: 3 additions & 3 deletions src/prompt_systematic_review/get_papers/paperSource.py
@@ -56,9 +56,9 @@ def to_dict(self):
"title": self.title,
"authors": self.authors,
"url": self.url,
"dateSubmitted": self.dateSubmitted.isoformat()
if self.dateSubmitted
else None,
"dateSubmitted": (
self.dateSubmitted.isoformat() if self.dateSubmitted else None
),
"keyWords": self.keywords,
"abstract": self.abstract,
"paperId": self.paperId,
@@ -50,12 +50,14 @@ def getPapers(self, count: int, key_words: List[str]) -> List[Paper]:
)
paper = Paper(
title=paper_data["title"],
authors=[
author["name"]
for author in paper_data.get("authors", [])
]
if paper_data["authors"]
else "",
authors=(
[
author["name"]
for author in paper_data.get("authors", [])
]
if paper_data["authors"]
else ""
),
url=open_access_pdf_url,
dateSubmitted=publication_date,
keyWords=None,
106 changes: 0 additions & 106 deletions tests/test_benchmarking.py
@@ -17,97 +17,6 @@
import json


@pytest.fixture(scope="module")
def api_key():
load_dotenv(dotenv_path="./.env") # Adjust the path as needed
return os.getenv("OPENAI_API_KEY")


@pytest.mark.API_test
def test_json_output(api_key):
openai.api_key = api_key
prompts = [
"You are a brilliant math professor. Solve the following problem and put your answer after four hashtags like the following example: \nQuestion: What is 4 + 4?\nAnswer: 4 + 4 is ####8\n\n Make your response as short as possible.",
"You are a foolish high-school student. Solve the following problem and put your answer after four hashtags like the following example: \nQuestion: What is 4 + 4?\nAnswer: 4 + 4 is ####8\n\n Make your response as short as possible.",
]

model = "gpt-3.5-turbo-1106"

response = query_model(
prompts[0],
"What is 4 + 4?",
model_name=model,
output_tokens=150,
)

response_dict = response_to_dict(response)

# Check the main keys
assert "id" in response_dict
assert "model" in response_dict
assert "object" in response_dict
assert "created" in response_dict
assert "system_fingerprint" in response_dict
assert "choices" in response_dict
assert "usage" in response_dict

# Check the types of the main keys
assert isinstance(response_dict["id"], str)
assert isinstance(response_dict["model"], str)
assert isinstance(response_dict["object"], str)
assert isinstance(response_dict["created"], int)
assert isinstance(response_dict["system_fingerprint"], str)
assert isinstance(response_dict["choices"], list)
assert isinstance(response_dict["usage"], dict)

# Check the structure and types of the 'choices' key
assert len(response_dict["choices"]) > 0
for choice in response_dict["choices"]:
assert "finish_reason" in choice
assert "index" in choice
assert "message" in choice
assert isinstance(choice["finish_reason"], str)
assert isinstance(choice["index"], int)
assert isinstance(choice["message"], dict)
assert "content" in choice["message"]
assert "role" in choice["message"]
assert isinstance(choice["message"]["content"], str)
assert isinstance(choice["message"]["role"], str)

# Check the structure and types of the 'usage' key
assert "completion_tokens" in response_dict["usage"]
assert "prompt_tokens" in response_dict["usage"]
assert "total_tokens" in response_dict["usage"]
assert isinstance(response_dict["usage"]["completion_tokens"], int)
assert isinstance(response_dict["usage"]["prompt_tokens"], int)
assert isinstance(response_dict["usage"]["total_tokens"], int)


@pytest.mark.API_test
def test_query_model(api_key):
openai.api_key = api_key
prompt = "You are a brilliant math professor. Solve the following problem and put your answer after four hashtags like the following example: \nQuestion: What is 4 + 4?\nAnswer: 4 + 4 is ####8\n\n Make your response as short as possible."
question = "What is 4 + 4?"
model_name = "gpt-3.5-turbo-1106"
output_tokens = 150
response = query_model(prompt, question, model_name, output_tokens)
assert isinstance(response.choices[0].message.content, str)
assert len(response.choices[0].message.content) > 0
assert "8" in response.choices[0].message.content

prompt = 'You are a brilliant math professor. Solve the following problem and return a JSON with the first entry being the reasoning behind the choice labeled as "reasoning", and the second entry being the answer to the question containing only the letter "A", "B", "C" or "D", labeled as "answer". Try to keep your reasoning concise.'
question = "What is 4 + 4? A. 8 B. 9 C. 10 D. 11"
model_name = "gpt-3.5-turbo-1106"
output_tokens = 150
json_mode = True
response = query_model(
prompt, question, model_name, output_tokens, return_json=json_mode
)
json_response = json.loads(response.choices[0].message.content)
assert isinstance(json_response, dict)
assert json_response["answer"] == "A"


def test_with_commas_and_dollar_sign():
assert extract_numbers("####$1,000") == [1000]
assert extract_numbers("####$1,000.00") == [1000]
Expand Down Expand Up @@ -148,21 +57,6 @@ def test_multiple_numbers():
assert extract_numbers("####1000 ####2000") == [1000, 2000]


# def test_load_mmlu():
# with open("data/mmlu_configs.json", "r") as file:
# mmlu_configs = json.load(file)["configs"]
# df = load_mmlu(mmlu_configs, "test")

# assert df.iloc[0]["input"] == "When was the telescope invented by Galileo?"
# assert df.iloc[0].A == "1409"
# assert df.iloc[0].B == "1509"
# assert df.iloc[0].C == "1609"
# assert df.iloc[0].D == "1709"
# assert df.iloc[0].answer == "C"
# assert df.iloc[0].config == "astronomy"
# assert len(df) == 13911


def test_modular_prompts():
with open("data/prompts.json", "r") as file:
prompts = json.load(file)