-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathevaluation_data_generator.py
executable file
·86 lines (68 loc) · 2.45 KB
/
evaluation_data_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import json
import os
from time import sleep
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm.auto import tqdm
# Populate the process environment (python-dotenv) before GITHUB_TOKEN is
# read below.
load_dotenv()

# OpenAI-compatible client pointed at the inference endpoint below and
# authenticated with the GITHUB_TOKEN environment variable. A missing
# variable raises KeyError here, at import time.
client = OpenAI(
    api_key=os.environ["GITHUB_TOKEN"],
    base_url="https://models.inference.ai.azure.com",
)

# Facility records: one dict per CSV row, keyed by column name. The path is
# relative to the current working directory.
df = pd.read_csv('../data/kenya_health_facilities_clean.csv')
documents = df.to_dict(orient='records')
# Prompt sent to the model once per facility record. The {placeholders} are
# filled with str.format(**doc) in generate_questions, so every name used
# here must be a key of the record dict. The model is asked to answer with a
# bare JSON list of five questions, which generate_questions parses.
# The trailing .strip() removes the newline right after the opening quotes
# and the one before the closing quotes.
# NOTE(review): "Use as fewer words as possible" reads like a typo for
# "as few words as possible"; left unchanged because editing the prompt
# changes what the model is sent.
prompt_template = """
You emulate a user of our healthcare facility assistant application.
Formulate 5 diverse questions this user might ask based on the provided healthcare facility. The record
should contain the answer to the questions, and the questions should be complete and not too short.
Ensure the questions are diverse and not similar to each other. Use as fewer words as possible from the record.
The record:
name: {name}
keph_level: {keph_level}
facility_type: {facility_type}
owner: {owner}
regulatory_body: {regulatory_body}
beds: {beds}
cots: {cots}
county: {county}
constituency: {constituency}
sub_county: {sub_county}
ward: {ward}
operation_status: {operation_status}
open_whole_day: {open_whole_day}
open_public_holidays: {open_public_holidays}
open_weekends: {open_weekends}
open_late_night: {open_late_night}
approved: {approved}
public_visible: {public_visible}
closed: {closed}
Provide the output in parsable JSON without using code blocks:
["question1", "question2", ..., "question5"]
""".strip()
def generate_questions(doc):
    """Ask the chat model for questions about one facility record.

    Fills ``prompt_template`` with the fields of ``doc``, sends it as a single
    user message to the ``gpt-4o`` model through ``client``, and decodes the
    reply as JSON.

    Args:
        doc: Mapping holding every field named in ``prompt_template``
            (extra keys are ignored by ``str.format``).

    Returns:
        The decoded JSON value. The prompt requests a list of five question
        strings, but the shape is not validated here.

    Raises:
        KeyError: If ``doc`` lacks a field used by the prompt.
        json.JSONDecodeError: If the reply is empty or is not valid JSON
            even after removing an enclosing Markdown code fence.
    """
    prompt = prompt_template.format(**doc)
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}],
    )

    text = (response.choices[0].message.content or "").strip()

    # The prompt asks for JSON "without using code blocks", but the model can
    # still wrap its answer in a fence such as ```json ... ```. Remove the
    # opening fence line and the closing fence so the payload parses instead
    # of aborting the run.
    if text.startswith("```"):
        _, _, text = text.partition("\n")
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]

    return json.loads(text)
def main(*, calls_per_pause=10, pause_seconds=60):
    """Generate questions for facilities not yet covered and write the merged CSV.

    Reads previously generated rows from
    ``../data/ground_truth_retrieval.csv``, calls ``generate_questions`` for
    every record in ``documents`` whose ``id`` is not already in that file,
    and writes the existing rows plus the new ``(id, question)`` rows to
    ``../data/ground_truth_data.csv``.

    Args:
        calls_per_pause: Positive number of model requests to send between
            pauses.
        pause_seconds: Length of each pause, in seconds.

    Any exception raised while generating (API failure, unparsable reply,
    keyboard interrupt) still propagates, but the questions collected up to
    that point are written to the output file first so completed requests
    are not lost.
    """
    results = {}

    # Load existing results from CSV file
    # NOTE(review): this reads ground_truth_retrieval.csv while the output
    # below goes to ground_truth_data.csv, so a rerun only skips documents
    # that are in the former. Confirm the two names are meant to differ.
    existing_results = pd.read_csv('../data/ground_truth_retrieval.csv')
    existing_doc_ids = set(existing_results['id'])

    calls_made = 0
    try:
        for doc in tqdm(documents):
            doc_id = doc['id']
            if doc_id in existing_doc_ids:
                continue

            # Throttle on requests actually sent, not on the position in
            # `documents`: skipped records cost nothing, and counting them
            # could trigger a pause after one request or never. Pausing
            # before the next request also avoids a useless wait after the
            # final one.
            if calls_made and calls_made % calls_per_pause == 0:
                sleep(pause_seconds)

            results[doc_id] = generate_questions(doc)
            calls_made += 1
    finally:
        # One output row per question; runs on failure too (see docstring).
        final_results = [
            (doc_id, q) for doc_id, questions in results.items() for q in questions
        ]
        df_results = pd.DataFrame(final_results, columns=['id', 'question'])
        existing_results = pd.concat([existing_results, df_results], ignore_index=True)
        existing_results.to_csv('../data/ground_truth_data.csv', index=False)
# Run the generation only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()