diff --git a/experiments/conf/config.yaml b/experiments/conf/config.yaml index 28949ad..cf01287 100644 --- a/experiments/conf/config.yaml +++ b/experiments/conf/config.yaml @@ -4,7 +4,7 @@ defaults: - dataset: medqa # [options: usmle, medmcqa, mmlu, pubmedqa, medqa, ciar, cosmosqa, gpqa] - _self_ -max_eval_count: 100 +max_eval_count: None num_eval_workers: 1 # Each worker receives a full batch of questions. eval_batch_size: 10 # Defaults to batch_size=1. verbose: False diff --git a/experiments/conf/system/chateval.yaml b/experiments/conf/system/chateval.yaml index 244f22d..8c2ad4d 100644 --- a/experiments/conf/system/chateval.yaml +++ b/experiments/conf/system/chateval.yaml @@ -13,13 +13,13 @@ agreement_intensity: -1 # -1 for default behavior, otherwise {0, ..., 10} for ag agents: # options: [gpt, palm] # Agent 1 - - "${system.gpt}" - - engine: "gpt-4" # gpt uses gpt-3.5 engine + - engine: "mixtral-8x7b-instruct" # gpt uses gpt-3.5 engine - prompt: "${system.agent_prompts.cot}" # Agent 2 - - "${system.gpt}" - - engine: "gpt-4" # gpt uses gpt-3.5 engine + - engine: "mixtral-8x7b-instruct" # gpt uses gpt-3.5 engine - prompt: "${system.agent_prompts.cot}" # Summarizer - - "${system.gpt}" - - engine: "gpt-4" # gpt uses gpt-3.5 engine + - engine: "mixtral-8x7b-instruct" # gpt uses gpt-3.5 engine - prompt: "${system.agent_prompts.summarizer}" diff --git a/experiments/conf/system/debateqa.yaml b/experiments/conf/system/debateqa.yaml index a5334eb..25bd063 100644 --- a/experiments/conf/system/debateqa.yaml +++ b/experiments/conf/system/debateqa.yaml @@ -15,9 +15,9 @@ agents: # options: [gpt, palm] # - prompt: "${system.agent_prompts.cot}" # GPT-3.5 agent - - - "${system.gpt}" - - engine: "gpt-3.5-turbo-0613" # gpt uses gpt-3.5 engine - - prompt: "${system.agent_prompts.cot}" + # - - "${system.gpt}" + # - engine: "gpt-3.5-turbo-0613" # gpt uses gpt-3.5 engine + # - prompt: "${system.agent_prompts.cot}" # PaLM agent - - "${system.palm}" # palm uses default setup diff --git a/experiments/conf/system/ensemble_refinement.yaml b/experiments/conf/system/ensemble_refinement.yaml index 3d26c8c..e862d22 100644 --- a/experiments/conf/system/ensemble_refinement.yaml +++ b/experiments/conf/system/ensemble_refinement.yaml @@ -15,7 +15,7 @@ num_aggregation_steps: 1 agents: # options: [gpt, palm] # GPT-3.5 agent - - "${system.gpt}" - - engine: "gpt-4" # gpt uses gpt-3.5 engine + - engine: "mixtral-8x7b-instruct" # gpt uses gpt-3.5 engine - prompt: "${system.agent_prompts.er_simple}" - few_shot_examples: ${system.medpalm_examples.few_shot} # False, ${system.medpalm_examples.few_shot} or ${system.medpalm_examples.cot_few_shot} - sampling: diff --git a/experiments/conf/system/google_mad.yaml b/experiments/conf/system/google_mad.yaml index 119052d..79fcb9c 100644 --- a/experiments/conf/system/google_mad.yaml +++ b/experiments/conf/system/google_mad.yaml @@ -14,18 +14,18 @@ agreement_intensity: -1 # -1 for default behavior, otherwise {0, ..., 10} for ag agents: # options: [gpt, palm] # GPT-3.5 agent - - "${system.gpt}" - - engine: "gpt-4" # gpt uses gpt-3.5 engine + - engine: "mixtral-8x7b-instruct" # gpt uses gpt-3.5 engine - prompt: "${system.agent_prompts.simple}" - few_shot_examples: ${system.medpalm_examples.few_shot} # None, ${system.medpalm_examples.few_shot} or ${system.medpalm_examples.cot_few_shot} # GPT-3.5 agent - - "${system.gpt}" - - engine: "gpt-4" # gpt uses gpt-3.5 engine + - engine: "mixtral-8x7b-instruct" # gpt uses gpt-3.5 engine - prompt: "${system.agent_prompts.simple}" - few_shot_examples: ${system.medpalm_examples.few_shot} # None, ${system.medpalm_examples.few_shot} or ${system.medpalm_examples.cot_few_shot} # GPT-3.5 agent - - "${system.gpt}" - - engine: "gpt-4" # gpt uses gpt-3.5 engine + - engine: "mixtral-8x7b-instruct" # gpt uses gpt-3.5 engine - prompt: "${system.agent_prompts.simple}" - few_shot_examples: ${system.medpalm_examples.few_shot} # None, ${system.medpalm_examples.few_shot} or ${system.medpalm_examples.cot_few_shot} diff --git a/experiments/conf/system/gpt.yaml b/experiments/conf/system/gpt.yaml index 138c72f..b2577a2 100644 --- a/experiments/conf/system/gpt.yaml +++ b/experiments/conf/system/gpt.yaml @@ -17,7 +17,7 @@ defaults: # options: [simple, cot, letter, explain...]: any prompt used in 'agen gpt: _target_: debatellm.agents.GPT prompt: ${system.agent_prompts.simple} - engine: "gpt-3.5-turbo-0613" + engine: "mixtral-8x7b-instruct" few_shot_examples: False # Options include: [False, ${system.medpalm_examples.few_shot}, ${system.medpalm_examples.cot_few_shot}] mock: False sampling: diff --git a/experiments/conf/system/medprompt.yaml b/experiments/conf/system/medprompt.yaml index db02d5b..753a6fc 100644 --- a/experiments/conf/system/medprompt.yaml +++ b/experiments/conf/system/medprompt.yaml @@ -13,7 +13,7 @@ name: medprompt agents: # options: [gpt, palm] # GPT-3 agent - - "${system.gpt}" - - engine: "gpt-4" # gpt uses gpt-4 engine + - engine: "mixtral-8x7b-instruct" # gpt uses gpt-4 engine - cost_per_prompt_token: 0.03 # dollar costs per 1000 prompt token - cost_per_response_token: 0.06 # dollar costs per 1000 response token - prompt: "${system.agent_prompts.cot_medprompt}" diff --git a/experiments/conf/system/multi_agent_debate.yaml b/experiments/conf/system/multi_agent_debate.yaml index 005cff3..e289252 100644 --- a/experiments/conf/system/multi_agent_debate.yaml +++ b/experiments/conf/system/multi_agent_debate.yaml @@ -11,9 +11,9 @@ num_rounds: 2 agents: # options: [gpt, palm] # GPT-3.5 agent - - "${system.gpt}" - - engine: "gpt-4" # gpt uses gpt-3.5 engine + - engine: "mixtral-8x7b-instruct" # gpt uses gpt-3.5 engine - prompt: "${system.agent_prompts.cot}" # GPT-3.5 agent - - "${system.gpt}" - - engine: "gpt-4" # gpt uses gpt-3.5 engine + - engine: "mixtral-8x7b-instruct" # gpt uses gpt-3.5 engine - prompt: "${system.agent_prompts.cot}" diff --git a/experiments/conf/system/spp_synergy.yaml b/experiments/conf/system/spp_synergy.yaml index e289810..86ce6d7 100644 --- a/experiments/conf/system/spp_synergy.yaml +++ b/experiments/conf/system/spp_synergy.yaml @@ -10,5 +10,5 @@ name: spp_synergy # Used for distinguishing between single agent GPT. agents: # options: [gpt, palm] # Agent 1 - - "${system.gpt}" - - engine: "gpt-4" # gpt uses gpt-3.5 engine + - engine: "mixtral-8x7b-instruct" # gpt uses gpt-3.5 engine - prompt: "${system.agent_prompts.spp_original}" # options: [spp_expert, spp_original] diff --git a/experiments/conf/system/tsinghua_mad.yaml b/experiments/conf/system/tsinghua_mad.yaml index b0b95c3..d66434d 100644 --- a/experiments/conf/system/tsinghua_mad.yaml +++ b/experiments/conf/system/tsinghua_mad.yaml @@ -14,21 +14,21 @@ agreement_intensity: -1 # -1: fallback to default prompt. [0, 1, ..., 10]: agree agents: # options: [gpt, palm] # Agent 1 - - "${system.gpt}" - - engine: "gpt-3.5-turbo-0613" # gpt uses gpt-3.5 engine + - engine: "mixtral-8x7b-instruct" # gpt uses gpt-3.5 engine - prompt: "${system.agent_prompts.angel}" - prompt_from_history: "tsinghua_mad" - few_shot_examples: None # None, ${system.medpalm_examples.few_shot} or ${system.medpalm_examples.cot_few_shot} # Agent 2 - - "${system.gpt}" - - engine: "gpt-3.5-turbo-0613" # gpt uses gpt-3.5 engine + - engine: "mixtral-8x7b-instruct" # gpt uses gpt-3.5 engine - prompt: "${system.agent_prompts.devil}" - prompt_from_history: "tsinghua_mad" - few_shot_examples: None # None, ${system.medpalm_examples.few_shot} or ${system.medpalm_examples.cot_few_shot} # Judge - - "${system.gpt}" - - engine: "gpt-3.5-turbo-0613" # gpt uses gpt-3.5 engine + - engine: "mixtral-8x7b-instruct" # gpt uses gpt-3.5 engine - prompt: "${system.agent_prompts.judge_tsinghua}" - prompt_from_history: "tsinghua_judge" - few_shot_examples: None # None, ${system.medpalm_examples.few_shot} or ${system.medpalm_examples.cot_few_shot} diff --git a/scripts/launch_experiments.py b/scripts/launch_experiments.py index f82c076..07b54a8 100644 --- a/scripts/launch_experiments.py +++ b/scripts/launch_experiments.py @@ -204,9 +204,9 @@ # Add all 3 datasets to the experiments for exp in exp_table: exp["dataset"] = [ - # "cosmosqa", - # "ciar", - # "gpqa", + "cosmosqa", + "ciar", + "gpqa", "medqa", "pubmedqa", "mmlu",