
Commit 2743aad
- Add comprehensive_bench.sh, test_all_datasets.py, test_token_comparison.py
- Add new dataset implementations: AQUA-RAT, DROP, GSM8K, MATH, OpenBookQA, SciQ, StrategyQA
- Update router_reason_bench_multi_dataset.py with adaptive max tokens
- Improve answer extraction and evaluation logic for multiple answer formats

Signed-off-by: Huamin Chen <hchen@redhat.com>
1 parent ede160f commit 2743aad

12 files changed: +2,052 −136 lines
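The adaptive max-token change itself lands in router_reason_bench_multi_dataset.py rather than in the shell diff below. As a rough illustration of the idea — larger completion budgets for datasets whose answers need multi-step derivations — here is a hypothetical sketch; the function name and the budget values are illustrative only, not the harness's actual numbers:

```bash
# Hypothetical sketch of adaptive max-token selection per dataset.
# Function name and budgets are illustrative, not the harness's values.
adaptive_max_tokens() {
    case "$1" in
        gsm8k|aqua-rat|drop) echo 1024 ;;  # multi-step derivations need room
        gpqa|strategyqa)     echo 512  ;;  # shorter reasoning chains
        *)                   echo 256  ;;  # plain multiple-choice answers
    esac
}

max_tokens=$(adaptive_max_tokens "gsm8k")  # -> 1024
```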

bench/comprehensive_bench.sh (167 additions, 44 deletions)
@@ -12,6 +12,7 @@ ROUTER_ENDPOINT="http://127.0.0.1:8801/v1"
 VLLM_ENDPOINT="http://127.0.0.1:8000/v1"
 VLLM_MODEL=""  # Will be auto-detected from endpoint if not specified
 ROUTER_MODEL="auto"
+CONCURRENT_REQUESTS=8
 OUTPUT_BASE="results/comprehensive_research_$(date +%Y%m%d_%H%M%S)"
 
 # Parse command line arguments
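`VLLM_MODEL` is documented as auto-detected when left empty. Against an OpenAI-compatible server such as vLLM, that detection usually amounts to querying `GET /v1/models` and taking the first entry; a minimal sketch, assuming curl and python3 are on PATH and exactly one model is served:

```bash
# Sketch: fill in VLLM_MODEL from the endpoint's model list.
# Assumes an OpenAI-compatible GET /v1/models response with one model loaded.
if [[ -z "$VLLM_MODEL" ]]; then
    VLLM_MODEL=$(curl -s "${VLLM_ENDPOINT}/models" |
        python3 -c 'import json, sys; print(json.load(sys.stdin)["data"][0]["id"])')
    echo "Auto-detected vLLM model: $VLLM_MODEL"
fi
```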
@@ -105,12 +106,28 @@ PERSISTENT_RESEARCH_CSV="results/research_results_master.csv"
 # Dataset configurations (dataset_name:samples_per_category)
 # Balanced for statistical significance vs runtime
 declare -A DATASET_CONFIGS=(
-    ["mmlu"]=10           # 57 subjects × 10 = 570 samples
-    ["arc"]=15            # 1 category × 15 = 15 samples
-    ["gpqa"]=20           # 1 category × 20 = 20 samples
-    ["truthfulqa"]=15     # 1 category × 15 = 15 samples
-    ["commonsenseqa"]=20  # 1 category × 20 = 20 samples
-    ["hellaswag"]=8       # ~50 activities × 8 = ~400 samples
+    # Core proven datasets
+    ["gpqa"]=20        # 1 category × 20 = 20 samples - OUTSTANDING reasoning differentiation
+    ["mmlu"]=10        # 57 subjects × 10 = 570 samples - EXCELLENT reasoning differentiation
+    ["truthfulqa"]=15  # Truthfulness evaluation - some reasoning differentiation (60% → 73.3%)
+
+    # Mathematical reasoning datasets
+    # ["math"]=15      # Competition mathematics - DISABLED: dataset not available on HF Hub
+    ["gsm8k"]=25       # Elementary math word problems - EXPECTED good reasoning differentiation
+    ["aqua-rat"]=20    # Algebraic word problems with rationales - EXPECTED good differentiation
+
+    # Multi-step reasoning datasets
+    ["drop"]=20        # Reading comprehension with discrete reasoning - EXPECTED excellent differentiation
+    ["strategyqa"]=20  # Multi-step implicit reasoning - EXPECTED good differentiation
+
+    # Scientific reasoning datasets
+    ["sciq"]=25        # Science questions requiring reasoning - EXPECTED moderate differentiation
+    ["openbookqa"]=20  # Elementary science with fact reasoning - EXPECTED moderate differentiation
+
+    # Disabled datasets with poor reasoning differentiation:
+    # ["arc-challenge"]=15  # 100% accuracy across all modes, minimal benefit
+    # ["commonsenseqa"]=20  # Same accuracy across modes, small token difference
+    # ["hellaswag"]=2       # Minimal differentiation, not reasoning-focused
 )
 
 echo -e "${BLUE}🔬 COMPREHENSIVE MULTI-DATASET BENCHMARK FOR RESEARCH${NC}"
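Because `DATASET_CONFIGS` is a bash associative array, the sampling plan can be inspected before committing to a multi-hour run. A small sketch that prints each dataset's per-category count and a lower-bound total (lower bound because mmlu multiplies across 57 subjects):

```bash
# Sketch: summarize the configured sampling plan.
total=0
for ds in "${!DATASET_CONFIGS[@]}"; do
    printf '%-12s %3d samples/category\n' "$ds" "${DATASET_CONFIGS[$ds]}"
    total=$(( total + DATASET_CONFIGS[$ds] ))
done
echo "At least $total samples across ${#DATASET_CONFIGS[@]} datasets"
```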
@@ -136,14 +153,17 @@ source "$VENV_PATH/bin/activate"
 mkdir -p "$OUTPUT_BASE"
 mkdir -p "$(dirname "$PERSISTENT_RESEARCH_CSV")"
 
-# Initialize persistent research results CSV (create header only if file doesn't exist)
-if [[ ! -f "$PERSISTENT_RESEARCH_CSV" ]]; then
-    echo "Dataset,Mode,Model,Accuracy,Avg_Latency_ms,Avg_Total_Tokens,Sample_Count,Timestamp" > "$PERSISTENT_RESEARCH_CSV"
-    echo -e "${GREEN}📊 Created new master research CSV: $PERSISTENT_RESEARCH_CSV${NC}"
-else
-    echo -e "${BLUE}📊 Using existing master research CSV: $PERSISTENT_RESEARCH_CSV${NC}"
+# Backup and clear master research CSV for fresh results
+if [[ -f "$PERSISTENT_RESEARCH_CSV" ]]; then
+    BACKUP_CSV="${PERSISTENT_RESEARCH_CSV}.backup_$(date +%Y%m%d_%H%M%S)"
+    cp "$PERSISTENT_RESEARCH_CSV" "$BACKUP_CSV"
+    echo -e "${GREEN}📊 Backed up existing master CSV to: $BACKUP_CSV${NC}"
 fi
 
+# Create fresh master research CSV with header only
+echo "Dataset,Mode,Model,Accuracy,Avg_Latency_ms,Avg_Total_Tokens,Sample_Count,Timestamp" > "$PERSISTENT_RESEARCH_CSV"
+echo -e "${GREEN}📊 Created fresh master research CSV: $PERSISTENT_RESEARCH_CSV${NC}"
+
 # Also create a timestamped copy for this run
 RESEARCH_CSV="$OUTPUT_BASE/research_results.csv"
 cp "$PERSISTENT_RESEARCH_CSV" "$RESEARCH_CSV"
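The backup-then-truncate pattern keeps every previous run recoverable while guaranteeing each new run writes under a clean header. The timestamped backups do accumulate, though; a housekeeping sketch (GNU coreutils assumed, retention count arbitrary):

```bash
# Sketch: keep only the five most recent master-CSV backups.
ls -1t "${PERSISTENT_RESEARCH_CSV}".backup_* 2>/dev/null | tail -n +6 | xargs -r rm --
```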
@@ -225,9 +245,12 @@ try:
     model_name = '$VLLM_MODEL'
 
     # For vLLM, we might have multiple modes (NR, NR_REASONING)
-    if '$mode' == 'vllm' and 'mode' in df.columns:
-        for mode_type in df['mode'].unique():
-            mode_df = df[df['mode'] == mode_type]
+    # Check both 'mode' and 'mode_label' columns for mode information
+    if '$mode' == 'vllm' and ('mode' in df.columns or 'mode_label' in df.columns):
+        # Use mode_label if available (more descriptive), otherwise use mode
+        mode_column = 'mode_label' if 'mode_label' in df.columns else 'mode'
+        for mode_type in df[mode_column].unique():
+            mode_df = df[df[mode_column] == mode_type]
 
             # Recalculate metrics for this specific mode using correct column names
             if 'is_correct' in mode_df.columns:
@@ -253,7 +276,17 @@ try:
 
             mode_samples = len(mode_df)
 
-            csv_line = f'$dataset,vLLM_{mode_type},{model_name},{mode_accuracy:.3f},{mode_latency:.1f},{mode_tokens:.1f},{mode_samples},$timestamp'
+            # Map technical mode names to descriptive names
+            if mode_type == 'VLLM_NR':
+                display_mode = 'vLLM_No_Reasoning'
+            elif mode_type == 'VLLM_NR_REASONING':
+                display_mode = 'vLLM_All_Reasoning'
+            elif mode_type == 'VLLM_XC':
+                display_mode = 'vLLM_CoT'
+            else:
+                display_mode = mode_type  # Use the mode_label as-is if not recognized
+
+            csv_line = f'$dataset,{display_mode},{model_name},{mode_accuracy:.3f},{mode_latency:.1f},{mode_tokens:.1f},{mode_samples},$timestamp'
             print(f'    📝 Writing to CSV: {csv_line}', file=sys.stderr)
             print(csv_line)
         else:
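The label mapping above lives in Python embedded in the script's heredoc. For massaging already-written CSVs from the shell, the same normalization can be expressed as a case statement; a sketch mirroring the Python branch for branch:

```bash
# Sketch: shell equivalent of the mode-name mapping above.
display_mode() {
    case "$1" in
        VLLM_NR)           echo "vLLM_No_Reasoning"  ;;
        VLLM_NR_REASONING) echo "vLLM_All_Reasoning" ;;
        VLLM_XC)           echo "vLLM_CoT"           ;;
        *)                 echo "$1" ;;  # pass unrecognized labels through
    esac
}
```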
@@ -283,14 +316,17 @@ run_dataset_benchmark() {
 
     echo -e "${GREEN}📊 Benchmarking $dataset dataset ($samples samples per category)...${NC}"
 
-    # Router benchmark
+    # Router benchmark (pass vLLM info for consistent token calculation)
     echo -e "${YELLOW}  🤖 Running router evaluation...${NC}"
     python3 -m vllm_semantic_router_bench.router_reason_bench_multi_dataset \
         --dataset "$dataset" \
         --samples-per-category "$samples" \
         --run-router \
         --router-endpoint "$ROUTER_ENDPOINT" \
         --router-models "$ROUTER_MODEL" \
+        --vllm-endpoint "$VLLM_ENDPOINT" \
+        --vllm-models "$VLLM_MODEL" \
+        --concurrent-requests "$CONCURRENT_REQUESTS" \
         --output-dir "$OUTPUT_BASE/router_$dataset" \
         --seed 42
 
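The new `--concurrent-requests` flag is consumed inside the Python harness, but the behavior it asks for — at most `$CONCURRENT_REQUESTS` prompts in flight at once — is ordinary bounded parallelism. A standalone shell sketch of the same pattern (the payload and prompts are illustrative, not what the harness sends):

```bash
# Sketch: bounded parallelism with xargs -P, analogous to what
# --concurrent-requests asks of the benchmark harness.
printf '%s\n' "2+2?" "3*3?" "10-4?" |
    xargs -P "$CONCURRENT_REQUESTS" -I{} \
        curl -s "$VLLM_ENDPOINT/chat/completions" \
            -H 'Content-Type: application/json' \
            -d '{"model":"'"$VLLM_MODEL"'","messages":[{"role":"user","content":"{}"}]}'
```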
@@ -307,41 +343,104 @@ run_dataset_benchmark() {
         --vllm-models "$VLLM_MODEL" \
         --vllm-exec-modes NR NR_REASONING \
         --output-dir "$OUTPUT_BASE/vllm_$dataset" \
+        --concurrent-requests "$CONCURRENT_REQUESTS" \
         --seed 42
 
     # Extract and save vLLM metrics immediately
     extract_and_save_metrics "$dataset" "vllm" "$OUTPUT_BASE/vllm_$dataset"
 
-    echo -e "${GREEN}  ✅ Completed $dataset benchmark${NC}"
+    # Generate updated comprehensive plots for current dataset
+    echo -e "${BLUE}  📈 Updating comprehensive plots with $dataset results...${NC}"
+    generate_comprehensive_plot "$dataset"
+
+    echo -e "${GREEN}  ✅ Completed $dataset benchmark and comprehensive plots updated${NC}"
+    echo -e "${GREEN}  📈 CSV data updated in: $PERSISTENT_RESEARCH_CSV${NC}"
     echo ""
 }
 
-# Function to generate comparison plots
-generate_plots() {
-    echo -e "${BLUE}📈 Generating comparison plots...${NC}"
+# Function to generate comprehensive plot with all completed datasets (called after each dataset completes)
+generate_comprehensive_plot() {
+    local current_dataset=$1
 
-    for dataset in "${!DATASET_CONFIGS[@]}"; do
-        echo -e "${YELLOW}  📊 Plotting $dataset results...${NC}"
+    if [[ -n "$current_dataset" ]]; then
+        echo -e "${YELLOW}  📊 Generating plot for current dataset: $current_dataset...${NC}"
+    else
+        echo -e "${YELLOW}  📊 Generating comprehensive plot with all completed datasets...${NC}"
+    fi
+
+    # Use the plot_comprehensive_results.py script to generate updated charts
+    if [[ -f "plot_comprehensive_results.py" ]]; then
+        echo -e "${BLUE}    Running comprehensive plotting script...${NC}"
+        # Use the current run's CSV instead of the master CSV to show only this run's results
+        PLOT_CMD="python3 plot_comprehensive_results.py \
+            --csv \"$RESEARCH_CSV\" \
+            --output-dir \"$OUTPUT_BASE\" \
+            --model-filter \"$VLLM_MODEL\""
+
+        # Add dataset filter if specified
+        if [[ -n "$current_dataset" ]]; then
+            PLOT_CMD="$PLOT_CMD --dataset-filter \"$current_dataset\""
+        fi
+
+        eval $PLOT_CMD
 
-        # Find the summary.json files
-        ROUTER_SUMMARY=$(find "$OUTPUT_BASE/router_$dataset" -name "summary.json" -type f | head -1)
-        VLLM_SUMMARY=$(find "$OUTPUT_BASE/vllm_$dataset" -name "summary.json" -type f | head -1)
+        echo -e "${GREEN}    ✅ Comprehensive plots updated in $OUTPUT_BASE${NC}"
+
+        # Print actual paths of generated charts
+        if [[ -f "$OUTPUT_BASE/accuracy_comparison.png" ]]; then
+            echo -e "${GREEN}    📊 Accuracy Chart: $OUTPUT_BASE/accuracy_comparison.png${NC}"
+        fi
+        if [[ -f "$OUTPUT_BASE/token_usage_comparison.png" ]]; then
+            echo -e "${GREEN}    📊 Token Usage Chart: $OUTPUT_BASE/token_usage_comparison.png${NC}"
+        fi
+        if [[ -f "$OUTPUT_BASE/efficiency_analysis.png" ]]; then
+            echo -e "${GREEN}    📊 Efficiency Chart: $OUTPUT_BASE/efficiency_analysis.png${NC}"
+        fi
+    else
+        echo -e "${RED}    ⚠️ plot_comprehensive_results.py not found, skipping comprehensive plots${NC}"
+    fi
+}
+
+# Function to generate plot for a single dataset (kept for compatibility)
+generate_dataset_plot() {
+    local dataset=$1
+
+    echo -e "${YELLOW}  📊 Plotting $dataset results...${NC}"
+
+    # Find the summary.json files
+    ROUTER_SUMMARY=$(find "$OUTPUT_BASE/router_$dataset" -name "summary.json" -type f | head -1)
+    VLLM_SUMMARY=$(find "$OUTPUT_BASE/vllm_$dataset" -name "summary.json" -type f | head -1)
 
-        if [[ -f "$VLLM_SUMMARY" ]]; then
-            PLOT_CMD="python3 -m vllm_semantic_router_bench.bench_plot --summary \"$VLLM_SUMMARY\" --out-dir \"$OUTPUT_BASE/plots_$dataset\""
+    if [[ -f "$VLLM_SUMMARY" ]]; then
+        PLOT_CMD="python3 -m vllm_semantic_router_bench.bench_plot --summary \"$VLLM_SUMMARY\" --out-dir \"$OUTPUT_BASE/plots_$dataset\""
 
-            if [[ -f "$ROUTER_SUMMARY" ]]; then
-                PLOT_CMD="$PLOT_CMD --router-summary \"$ROUTER_SUMMARY\""
-            fi
+        if [[ -f "$ROUTER_SUMMARY" ]]; then
+            PLOT_CMD="$PLOT_CMD --router-summary \"$ROUTER_SUMMARY\""
+        fi
+
+        echo -e "${BLUE}    Running: $PLOT_CMD${NC}"
+        eval $PLOT_CMD
+        echo -e "${GREEN}    $dataset plots generated in $OUTPUT_BASE/plots_$dataset${NC}"
+    else
+        echo -e "${RED}    ⚠️ No vLLM summary.json found for $dataset, skipping plots${NC}"
+    fi
+}
 
-            echo -e "${BLUE}    Running: $PLOT_CMD${NC}"
-            eval $PLOT_CMD
+# Function to generate comparison plots (now just calls individual dataset plots)
+generate_plots() {
+    echo -e "${BLUE}📈 Generating any remaining comparison plots...${NC}"
+
+    for dataset in "${!DATASET_CONFIGS[@]}"; do
+        # Check if plots already exist
+        if [[ ! -d "$OUTPUT_BASE/plots_$dataset" ]]; then
+            echo -e "${YELLOW}  📊 Generating missing plots for $dataset...${NC}"
+            generate_dataset_plot "$dataset"
         else
-            echo -e "${RED}    ⚠️ No vLLM summary.json found for $dataset, skipping plots${NC}"
+            echo -e "${GREEN}  ✅ Plots for $dataset already exist${NC}"
         fi
     done
 
-    echo -e "${GREEN}  ✅ All plots generated${NC}"
+    echo -e "${GREEN}  ✅ All plots verified/generated${NC}"
     echo ""
 }
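Both plotting helpers assemble `PLOT_CMD` as a string and execute it with `eval` so that optional flags can be appended conditionally. The same conditional construction works with a bash array and avoids `eval` re-parsing any quotes inside `$VLLM_MODEL` or the output paths; a sketch of the alternative:

```bash
# Sketch: conditional flags via an argument array instead of string + eval.
plot_cmd=(python3 plot_comprehensive_results.py
          --csv "$RESEARCH_CSV"
          --output-dir "$OUTPUT_BASE"
          --model-filter "$VLLM_MODEL")
[[ -n "$current_dataset" ]] && plot_cmd+=(--dataset-filter "$current_dataset")
"${plot_cmd[@]}"
```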
@@ -372,8 +471,8 @@ EOF
         "mmlu")
             echo "| MMLU | $samples | ~570 | 57 subjects | Academic Knowledge |" >> "$summary_file"
             ;;
-        "arc")
-            echo "| ARC | $samples | $samples | 1 (Science) | Scientific Reasoning |" >> "$summary_file"
+        "arc-challenge")
+            echo "| ARC-Challenge | $samples | $samples | 1 (Science) | Scientific Reasoning (Hard) |" >> "$summary_file"
             ;;
         "gpqa")
             echo "| GPQA | $samples | $samples | 1 (Graduate) | Graduate-level Q&A |" >> "$summary_file"
@@ -385,7 +484,7 @@ EOF
             echo "| CommonsenseQA | $samples | $samples | 1 (Common Sense) | Commonsense Reasoning |" >> "$summary_file"
             ;;
         "hellaswag")
-            echo "| HellaSwag | $samples | ~400 | ~50 activities | Commonsense NLI |" >> "$summary_file"
+            echo "| HellaSwag | $samples | ~100 | ~50 activities | Commonsense NLI |" >> "$summary_file"
             ;;
     esac
 done
@@ -398,8 +497,8 @@ EOF
 
 ### Accuracy Comparison
 - Router (auto model with reasoning): See research_results.csv
-- vLLM Direct (NR mode): See research_results.csv
-- vLLM Direct (NR_REASONING mode): See research_results.csv
+- vLLM Direct (No Reasoning): See research_results.csv
+- vLLM Direct (All Reasoning): See research_results.csv
 
 ### Token Usage Analysis
 - Average tokens per response by dataset and mode (in research_results.csv)
@@ -448,7 +547,7 @@ EOF
 
 - **Seed**: 42 (for reproducibility)
 - **Router Mode**: Auto model selection with reasoning
-- **vLLM Modes**: NR (neutral) and NR_REASONING (with reasoning)
+- **vLLM Modes**: No Reasoning and All Reasoning
 - **Sample Strategy**: Stratified sampling per category
 - **Evaluation**: Exact match accuracy and token usage
@@ -462,9 +561,24 @@ EOF
 echo -e "${BLUE}🚀 Starting comprehensive benchmark...${NC}"
 start_time=$(date +%s)
 
-# Run benchmarks for all datasets
-for dataset in "${!DATASET_CONFIGS[@]}"; do
+# Run benchmarks for reasoning-focused datasets (GPQA first for quick feedback)
+DATASET_ORDER=("gpqa" "truthfulqa" "gsm8k" "aqua-rat" "sciq" "openbookqa" "strategyqa" "drop" "mmlu")
+dataset_count=0
+total_datasets=${#DATASET_ORDER[@]}
+
+for dataset in "${DATASET_ORDER[@]}"; do
+    # Skip if dataset not configured
+    if [[ -z "${DATASET_CONFIGS[$dataset]}" ]]; then
+        echo -e "${YELLOW}⚠️ Dataset $dataset not configured, skipping...${NC}"
+        continue
+    fi
+
+    dataset_count=$((dataset_count + 1))
+    echo -e "${BLUE}🚀 Progress: Dataset $dataset_count/$total_datasets - Starting $dataset${NC}"
     run_dataset_benchmark "$dataset"
+    echo -e "${GREEN}🎉 Progress: Dataset $dataset_count/$total_datasets - Completed $dataset${NC}"
+    echo -e "${YELLOW}📊 Remaining datasets: $((total_datasets - dataset_count))${NC}"
+    echo ""
 done
 
 # Generate plots
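One subtlety in the skip check above: `[[ -z "${DATASET_CONFIGS[$dataset]}" ]]` cannot distinguish a missing key from a key deliberately set to the empty string. If that distinction ever matters, bash 4.3+ provides `-v` as a true existence test:

```bash
# Sketch: -v tests key existence (bash 4.3+); -z only tests emptiness.
if [[ ! -v "DATASET_CONFIGS[$dataset]" ]]; then
    echo "Dataset $dataset not configured, skipping..."
    continue
fi
```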
@@ -489,7 +603,16 @@ echo -e "${BLUE}📋 Next Steps:${NC}"
 echo "1. 📊 **Master research data**: $PERSISTENT_RESEARCH_CSV"
 echo "2. 📊 **This run's data**: $OUTPUT_BASE/research_results.csv"
 echo "3. 📋 Review research summary: $OUTPUT_BASE/RESEARCH_SUMMARY.md"
-echo "4. 📈 Examine plots for visual insights"
+echo "4. 📈 **View comprehensive charts**:"
+if [[ -f "$OUTPUT_BASE/accuracy_comparison.png" ]]; then
+    echo "   📊 Accuracy: $OUTPUT_BASE/accuracy_comparison.png"
+fi
+if [[ -f "$OUTPUT_BASE/token_usage_comparison.png" ]]; then
+    echo "   📊 Token Usage: $OUTPUT_BASE/token_usage_comparison.png"
+fi
+if [[ -f "$OUTPUT_BASE/efficiency_analysis.png" ]]; then
+    echo "   📊 Efficiency: $OUTPUT_BASE/efficiency_analysis.png"
+fi
 echo "5. 📄 Analyze detailed CSV files if needed"
 echo ""
 echo -e "${GREEN}🎓 Research CSV Format:${NC}"
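With the fixed header (`Dataset,Mode,Model,Accuracy,Avg_Latency_ms,Avg_Total_Tokens,Sample_Count,Timestamp`), quick summaries fall out of a single awk pass. For example, mean accuracy per mode across all datasets (a sketch, assuming no commas inside field values):

```bash
# Sketch: mean accuracy per mode from the master research CSV.
# Column 2 = Mode, column 4 = Accuracy, per the documented header.
awk -F, 'NR > 1 { sum[$2] += $4; n[$2]++ }
         END    { for (m in sum) printf "%-22s %.3f\n", m, sum[m] / n[m] }' \
    results/research_results_master.csv
```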
