@@ -12,6 +12,7 @@ ROUTER_ENDPOINT="http://127.0.0.1:8801/v1"
 VLLM_ENDPOINT="http://127.0.0.1:8000/v1"
 VLLM_MODEL="" # Will be auto-detected from endpoint if not specified
 ROUTER_MODEL="auto"
+CONCURRENT_REQUESTS=8
 OUTPUT_BASE="results/comprehensive_research_$(date +%Y%m%d_%H%M%S)"

 # Parse command line arguments
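The new `CONCURRENT_REQUESTS` knob above is a fixed assignment; if per-run tuning is wanted without editing the script, a parameter-expansion default would accept an environment override. A sketch, not part of this diff:

    # Use the caller's CONCURRENT_REQUESTS if set, otherwise default to 8
    CONCURRENT_REQUESTS="${CONCURRENT_REQUESTS:-8}"

Invocation would then look like `CONCURRENT_REQUESTS=16 ./run_bench.sh` (script name assumed for illustration).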
@@ -105,12 +106,28 @@ PERSISTENT_RESEARCH_CSV="results/research_results_master.csv"
 # Dataset configurations (dataset_name:samples_per_category)
 # Balanced for statistical significance vs runtime
 declare -A DATASET_CONFIGS=(
-    ["mmlu"]=10          # 57 subjects × 10 = 570 samples
-    ["arc"]=15           # 1 category × 15 = 15 samples
-    ["gpqa"]=20          # 1 category × 20 = 20 samples
-    ["truthfulqa"]=15    # 1 category × 15 = 15 samples
-    ["commonsenseqa"]=20 # 1 category × 20 = 20 samples
-    ["hellaswag"]=8      # ~50 activities × 8 = ~400 samples
+    # Core proven datasets
+    ["gpqa"]=20          # 1 category × 20 = 20 samples - OUTSTANDING reasoning differentiation
+    ["mmlu"]=10          # 57 subjects × 10 = 570 samples - EXCELLENT reasoning differentiation
+    ["truthfulqa"]=15    # Truthfulness evaluation - some reasoning differentiation (60% → 73.3%)
+
+    # Mathematical reasoning datasets
+    # ["math"]=15        # Competition mathematics - DISABLED: dataset not available on HF Hub
+    ["gsm8k"]=25         # Elementary math word problems - EXPECTED good reasoning differentiation
+    ["aqua-rat"]=20      # Algebraic word problems with rationales - EXPECTED good differentiation
+
+    # Multi-step reasoning datasets
+    ["drop"]=20          # Reading comprehension with discrete reasoning - EXPECTED excellent differentiation
+    ["strategyqa"]=20    # Multi-step implicit reasoning - EXPECTED good differentiation
+
+    # Scientific reasoning datasets
+    ["sciq"]=25          # Science questions requiring reasoning - EXPECTED moderate differentiation
+    ["openbookqa"]=20    # Elementary science with fact reasoning - EXPECTED moderate differentiation
+
+    # Disabled datasets with poor reasoning differentiation:
+    # ["arc-challenge"]=15 # 100% accuracy across all modes, minimal benefit
+    # ["commonsenseqa"]=20 # Same accuracy across modes, small token difference
+    # ["hellaswag"]=2      # Minimal differentiation, not reasoning-focused
 )

 echo -e "${BLUE}🔬 COMPREHENSIVE MULTI-DATASET BENCHMARK FOR RESEARCH${NC}"
@@ -136,14 +153,17 @@ source "$VENV_PATH/bin/activate"
 mkdir -p "$OUTPUT_BASE"
 mkdir -p "$(dirname "$PERSISTENT_RESEARCH_CSV")"

-# Initialize persistent research results CSV (create header only if file doesn't exist)
-if [[ ! -f "$PERSISTENT_RESEARCH_CSV" ]]; then
-    echo "Dataset,Mode,Model,Accuracy,Avg_Latency_ms,Avg_Total_Tokens,Sample_Count,Timestamp" > "$PERSISTENT_RESEARCH_CSV"
-    echo -e "${GREEN}📊 Created new master research CSV: $PERSISTENT_RESEARCH_CSV${NC}"
-else
-    echo -e "${BLUE}📊 Using existing master research CSV: $PERSISTENT_RESEARCH_CSV${NC}"
+# Back up and clear the master research CSV for fresh results
+if [[ -f "$PERSISTENT_RESEARCH_CSV" ]]; then
+    BACKUP_CSV="${PERSISTENT_RESEARCH_CSV}.backup_$(date +%Y%m%d_%H%M%S)"
+    cp "$PERSISTENT_RESEARCH_CSV" "$BACKUP_CSV"
+    echo -e "${GREEN}📊 Backed up existing master CSV to: $BACKUP_CSV${NC}"
 fi

+# Create a fresh master research CSV with header only
+echo "Dataset,Mode,Model,Accuracy,Avg_Latency_ms,Avg_Total_Tokens,Sample_Count,Timestamp" > "$PERSISTENT_RESEARCH_CSV"
+echo -e "${GREEN}📊 Created fresh master research CSV: $PERSISTENT_RESEARCH_CSV${NC}"
+
 # Also create a timestamped copy for this run
 RESEARCH_CSV="$OUTPUT_BASE/research_results.csv"
 cp "$PERSISTENT_RESEARCH_CSV" "$RESEARCH_CSV"
@@ -225,9 +245,12 @@ try:
     model_name = '$VLLM_MODEL'

     # For vLLM, we might have multiple modes (NR, NR_REASONING)
-    if '$mode' == 'vllm' and 'mode' in df.columns:
-        for mode_type in df['mode'].unique():
-            mode_df = df[df['mode'] == mode_type]
+    # Check both 'mode' and 'mode_label' columns for mode information
+    if '$mode' == 'vllm' and ('mode' in df.columns or 'mode_label' in df.columns):
+        # Use mode_label if available (more descriptive), otherwise use mode
+        mode_column = 'mode_label' if 'mode_label' in df.columns else 'mode'
+        for mode_type in df[mode_column].unique():
+            mode_df = df[df[mode_column] == mode_type]

             # Recalculate metrics for this specific mode using correct column names
             if 'is_correct' in mode_df.columns:
@@ -253,7 +276,17 @@ try:
             mode_samples = len(mode_df)

-            csv_line = f'$dataset,vLLM_{mode_type},{model_name},{mode_accuracy:.3f},{mode_latency:.1f},{mode_tokens:.1f},{mode_samples},$timestamp'
+            # Map technical mode names to descriptive names
+            if mode_type == 'VLLM_NR':
+                display_mode = 'vLLM_No_Reasoning'
+            elif mode_type == 'VLLM_NR_REASONING':
+                display_mode = 'vLLM_All_Reasoning'
+            elif mode_type == 'VLLM_XC':
+                display_mode = 'vLLM_CoT'
+            else:
+                display_mode = mode_type  # Use the mode_label as-is if not recognized
+
+            csv_line = f'$dataset,{display_mode},{model_name},{mode_accuracy:.3f},{mode_latency:.1f},{mode_tokens:.1f},{mode_samples},$timestamp'
             print(f'📝 Writing to CSV: {csv_line}', file=sys.stderr)
             print(csv_line)
         else:
@@ -283,14 +316,17 @@ run_dataset_benchmark() {
     echo -e "${GREEN}📊 Benchmarking $dataset dataset ($samples samples per category)...${NC}"

-    # Router benchmark
+    # Router benchmark (pass vLLM info for consistent token calculation)
     echo -e "${YELLOW}🤖 Running router evaluation...${NC}"
     python3 -m vllm_semantic_router_bench.router_reason_bench_multi_dataset \
         --dataset "$dataset" \
         --samples-per-category "$samples" \
         --run-router \
         --router-endpoint "$ROUTER_ENDPOINT" \
         --router-models "$ROUTER_MODEL" \
+        --vllm-endpoint "$VLLM_ENDPOINT" \
+        --vllm-models "$VLLM_MODEL" \
+        --concurrent-requests "$CONCURRENT_REQUESTS" \
         --output-dir "$OUTPUT_BASE/router_$dataset" \
         --seed 42
@@ -307,41 +343,104 @@ run_dataset_benchmark() {
         --vllm-models "$VLLM_MODEL" \
         --vllm-exec-modes NR NR_REASONING \
         --output-dir "$OUTPUT_BASE/vllm_$dataset" \
+        --concurrent-requests "$CONCURRENT_REQUESTS" \
         --seed 42

     # Extract and save vLLM metrics immediately
     extract_and_save_metrics "$dataset" "vllm" "$OUTPUT_BASE/vllm_$dataset"

-    echo -e "${GREEN}✅ Completed $dataset benchmark${NC}"
+    # Generate updated comprehensive plots for the current dataset
+    echo -e "${BLUE}📈 Updating comprehensive plots with $dataset results...${NC}"
+    generate_comprehensive_plot "$dataset"
+
+    echo -e "${GREEN}✅ Completed $dataset benchmark; comprehensive plots updated${NC}"
+    echo -e "${GREEN}📈 CSV data updated in: $PERSISTENT_RESEARCH_CSV${NC}"
     echo ""
 }

-# Function to generate comparison plots
-generate_plots() {
-    echo -e "${BLUE}📈 Generating comparison plots...${NC}"
+# Function to generate a comprehensive plot of all completed datasets (called after each dataset completes)
+generate_comprehensive_plot() {
+    local current_dataset=$1

-    for dataset in "${!DATASET_CONFIGS[@]}"; do
-        echo -e "${YELLOW}📊 Plotting $dataset results...${NC}"
+    if [[ -n "$current_dataset" ]]; then
+        echo -e "${YELLOW}📊 Generating plot for current dataset: $current_dataset...${NC}"
+    else
+        echo -e "${YELLOW}📊 Generating comprehensive plot with all completed datasets...${NC}"
+    fi
+
+    # Use the plot_comprehensive_results.py script to generate updated charts
+    if [[ -f "plot_comprehensive_results.py" ]]; then
+        echo -e "${BLUE}Running comprehensive plotting script...${NC}"
+        # Use the current run's CSV instead of the master CSV so plots show only this run's results
+        PLOT_CMD="python3 plot_comprehensive_results.py \
+            --csv \"$RESEARCH_CSV\" \
+            --output-dir \"$OUTPUT_BASE\" \
+            --model-filter \"$VLLM_MODEL\""
+
+        # Add a dataset filter if specified
+        if [[ -n "$current_dataset" ]]; then
+            PLOT_CMD="$PLOT_CMD --dataset-filter \"$current_dataset\""
+        fi
+
+        eval $PLOT_CMD

-        # Find the summary.json files
-        ROUTER_SUMMARY=$(find "$OUTPUT_BASE/router_$dataset" -name "summary.json" -type f | head -1)
-        VLLM_SUMMARY=$(find "$OUTPUT_BASE/vllm_$dataset" -name "summary.json" -type f | head -1)
+        echo -e "${GREEN}✅ Comprehensive plots updated in $OUTPUT_BASE${NC}"
+
+        # Print the actual paths of the generated charts
+        if [[ -f "$OUTPUT_BASE/accuracy_comparison.png" ]]; then
+            echo -e "${GREEN}📊 Accuracy Chart: $OUTPUT_BASE/accuracy_comparison.png${NC}"
+        fi
+        if [[ -f "$OUTPUT_BASE/token_usage_comparison.png" ]]; then
+            echo -e "${GREEN}📊 Token Usage Chart: $OUTPUT_BASE/token_usage_comparison.png${NC}"
+        fi
+        if [[ -f "$OUTPUT_BASE/efficiency_analysis.png" ]]; then
+            echo -e "${GREEN}📊 Efficiency Chart: $OUTPUT_BASE/efficiency_analysis.png${NC}"
+        fi
+    else
+        echo -e "${RED}⚠️ plot_comprehensive_results.py not found, skipping comprehensive plots${NC}"
+    fi
+}
+
+# Function to generate plots for a single dataset (kept for compatibility)
+generate_dataset_plot() {
+    local dataset=$1
+
+    echo -e "${YELLOW}📊 Plotting $dataset results...${NC}"
+
+    # Find the summary.json files
+    ROUTER_SUMMARY=$(find "$OUTPUT_BASE/router_$dataset" -name "summary.json" -type f | head -1)
+    VLLM_SUMMARY=$(find "$OUTPUT_BASE/vllm_$dataset" -name "summary.json" -type f | head -1)

-        if [[ -f "$VLLM_SUMMARY" ]]; then
-            PLOT_CMD="python3 -m vllm_semantic_router_bench.bench_plot --summary \"$VLLM_SUMMARY\" --out-dir \"$OUTPUT_BASE/plots_$dataset\""
+    if [[ -f "$VLLM_SUMMARY" ]]; then
+        PLOT_CMD="python3 -m vllm_semantic_router_bench.bench_plot --summary \"$VLLM_SUMMARY\" --out-dir \"$OUTPUT_BASE/plots_$dataset\""

-            if [[ -f "$ROUTER_SUMMARY" ]]; then
-                PLOT_CMD="$PLOT_CMD --router-summary \"$ROUTER_SUMMARY\""
-            fi
+        if [[ -f "$ROUTER_SUMMARY" ]]; then
+            PLOT_CMD="$PLOT_CMD --router-summary \"$ROUTER_SUMMARY\""
+        fi
+
+        echo -e "${BLUE}Running: $PLOT_CMD${NC}"
+        eval $PLOT_CMD
+        echo -e "${GREEN}✅ $dataset plots generated in $OUTPUT_BASE/plots_$dataset${NC}"
+    else
+        echo -e "${RED}⚠️ No vLLM summary.json found for $dataset, skipping plots${NC}"
+    fi
+}

-            echo -e "${BLUE}Running: $PLOT_CMD${NC}"
-            eval $PLOT_CMD
+# Function to generate comparison plots (now just fills in any missing per-dataset plots)
+generate_plots() {
+    echo -e "${BLUE}📈 Generating any remaining comparison plots...${NC}"
+
+    for dataset in "${!DATASET_CONFIGS[@]}"; do
+        # Check whether plots already exist
+        if [[ ! -d "$OUTPUT_BASE/plots_$dataset" ]]; then
+            echo -e "${YELLOW}📊 Generating missing plots for $dataset...${NC}"
+            generate_dataset_plot "$dataset"
         else
-            echo -e "${RED}⚠️ No vLLM summary.json found for $dataset, skipping plots${NC}"
+            echo -e "${GREEN}✅ Plots for $dataset already exist${NC}"
         fi
     done

-    echo -e "${GREEN}✅ All plots generated${NC}"
+    echo -e "${GREEN}✅ All plots verified/generated${NC}"
     echo ""
 }
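A side note on the `eval $PLOT_CMD` pattern used in both plotting functions: building the command in a bash array and expanding it with `"${cmd[@]}"` avoids eval's quoting pitfalls (paths with spaces, embedded quotes). A minimal sketch of the same call under that approach, assuming the flag names shown above:

    # Sketch: assemble arguments in an array, then expand element by element
    cmd=(python3 plot_comprehensive_results.py
         --csv "$RESEARCH_CSV"
         --output-dir "$OUTPUT_BASE"
         --model-filter "$VLLM_MODEL")
    if [[ -n "$current_dataset" ]]; then
        cmd+=(--dataset-filter "$current_dataset")   # optional filter, appended as its own element
    fi
    "${cmd[@]}"   # each element becomes one argument; no eval required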
…
" mmlu" )
373
472
echo " | MMLU | $samples | ~570 | 57 subjects | Academic Knowledge |" >> " $summary_file "
374
473
;;
375
- " arc" )
376
- echo " | ARC | $samples | $samples | 1 (Science) | Scientific Reasoning |" >> " $summary_file "
474
+ " arc-challenge " )
475
+ echo " | ARC-Challenge | $samples | $samples | 1 (Science) | Scientific Reasoning (Hard) |" >> " $summary_file "
377
476
;;
378
477
" gpqa" )
379
478
echo " | GPQA | $samples | $samples | 1 (Graduate) | Graduate-level Q&A |" >> " $summary_file "
385
484
echo " | CommonsenseQA | $samples | $samples | 1 (Common Sense) | Commonsense Reasoning |" >> " $summary_file "
386
485
;;
387
486
" hellaswag" )
388
- echo " | HellaSwag | $samples | ~400 | ~50 activities | Commonsense NLI |" >> " $summary_file "
487
+ echo " | HellaSwag | $samples | ~100 | ~50 activities | Commonsense NLI |" >> " $summary_file "
389
488
;;
390
489
esac
391
490
done
…
 ### Accuracy Comparison
 - Router (auto model with reasoning): See research_results.csv
-- vLLM Direct (NR mode): See research_results.csv
-- vLLM Direct (NR_REASONING mode): See research_results.csv
+- vLLM Direct (No Reasoning): See research_results.csv
+- vLLM Direct (All Reasoning): See research_results.csv

 ### Token Usage Analysis
 - Average tokens per response by dataset and mode (in research_results.csv)
…
 - **Seed**: 42 (for reproducibility)
 - **Router Mode**: Auto model selection with reasoning
-- **vLLM Modes**: NR (neutral) and NR_REASONING (with reasoning)
+- **vLLM Modes**: No Reasoning and All Reasoning
 - **Sample Strategy**: Stratified sampling per category
 - **Evaluation**: Exact match accuracy and token usage
…
echo -e " ${BLUE} 🚀 Starting comprehensive benchmark...${NC} "
463
562
start_time=$( date +%s)
464
563
465
- # Run benchmarks for all datasets
466
- for dataset in " ${! DATASET_CONFIGS[@]} " ; do
564
+ # Run benchmarks for reasoning-focused datasets (GPQA first for quick feedback)
565
+ DATASET_ORDER=(" gpqa" " truthfulqa" " gsm8k" " aqua-rat" " sciq" " openbookqa" " strategyqa" " drop" " mmlu" )
566
+ dataset_count=0
567
+ total_datasets=${# DATASET_ORDER[@]}
568
+
569
+ for dataset in " ${DATASET_ORDER[@]} " ; do
570
+ # Skip if dataset not configured
571
+ if [[ -z " ${DATASET_CONFIGS[$dataset]} " ]]; then
572
+ echo -e " ${YELLOW} ⚠️ Dataset $dataset not configured, skipping...${NC} "
573
+ continue
574
+ fi
575
+
576
+ dataset_count=$(( dataset_count + 1 ))
577
+ echo -e " ${BLUE} 🚀 Progress: Dataset $dataset_count /$total_datasets - Starting $dataset ${NC} "
467
578
run_dataset_benchmark " $dataset "
579
+ echo -e " ${GREEN} 🎉 Progress: Dataset $dataset_count /$total_datasets - Completed $dataset ${NC} "
580
+ echo -e " ${YELLOW} 📊 Remaining datasets: $(( total_datasets - dataset_count)) ${NC} "
581
+ echo " "
468
582
done
469
583
470
584
# Generate plots
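Since `start_time` is captured above with `date +%s`, the closing report (outside this diff) can derive wall-clock duration the usual way. A sketch:

    # Sketch: report elapsed wall-clock time at the end of the run
    end_time=$(date +%s)                     # seconds since epoch, same clock as start_time
    elapsed=$((end_time - start_time))
    echo "Total runtime: $((elapsed / 60))m $((elapsed % 60))s"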
@@ -489,7 +603,16 @@ echo -e "${BLUE}📋 Next Steps:${NC}"
 echo "1. 📊 **Master research data**: $PERSISTENT_RESEARCH_CSV"
 echo "2. 📊 **This run's data**: $OUTPUT_BASE/research_results.csv"
 echo "3. 📋 Review research summary: $OUTPUT_BASE/RESEARCH_SUMMARY.md"
-echo "4. 📈 Examine plots for visual insights"
+echo "4. 📈 **View comprehensive charts**:"
+if [[ -f "$OUTPUT_BASE/accuracy_comparison.png" ]]; then
+    echo "   📊 Accuracy: $OUTPUT_BASE/accuracy_comparison.png"
+fi
+if [[ -f "$OUTPUT_BASE/token_usage_comparison.png" ]]; then
+    echo "   📊 Token Usage: $OUTPUT_BASE/token_usage_comparison.png"
+fi
+if [[ -f "$OUTPUT_BASE/efficiency_analysis.png" ]]; then
+    echo "   📊 Efficiency: $OUTPUT_BASE/efficiency_analysis.png"
+fi
 echo "5. 📄 Analyze detailed CSV files if needed"
 echo ""
 echo -e "${GREEN}🎓 Research CSV Format:${NC}"