@@ -12,6 +12,7 @@ ROUTER_ENDPOINT="http://127.0.0.1:8801/v1"
 VLLM_ENDPOINT="http://127.0.0.1:8000/v1"
 VLLM_MODEL=""  # Will be auto-detected from endpoint if not specified
 ROUTER_MODEL="auto"
+CONCURRENT_REQUESTS=8
 OUTPUT_BASE="results/comprehensive_research_$(date +%Y%m%d_%H%M%S)"

 # Parse command line arguments
@@ -105,12 +106,28 @@ PERSISTENT_RESEARCH_CSV="results/research_results_master.csv"
 # Dataset configurations (dataset_name:samples_per_category)
 # Balanced for statistical significance vs runtime
 declare -A DATASET_CONFIGS=(
-    ["mmlu"]=10          # 57 subjects × 10 = 570 samples
-    ["arc"]=15           # 1 category × 15 = 15 samples
-    ["gpqa"]=20          # 1 category × 20 = 20 samples
-    ["truthfulqa"]=15    # 1 category × 15 = 15 samples
-    ["commonsenseqa"]=20 # 1 category × 20 = 20 samples
-    ["hellaswag"]=8      # ~50 activities × 8 = ~400 samples
+    # Core proven datasets
+    ["gpqa"]=20          # 1 category × 20 = 20 samples - OUTSTANDING reasoning differentiation
+    ["mmlu"]=10          # 57 subjects × 10 = 570 samples - EXCELLENT reasoning differentiation
+    ["truthfulqa"]=15    # Truthfulness evaluation - some reasoning differentiation (60% → 73.3%)
+
+    # Mathematical reasoning datasets
+    # ["math"]=15        # Competition mathematics - DISABLED: dataset not available on HF Hub
+    ["gsm8k"]=25         # Elementary math word problems - EXPECTED good reasoning differentiation
+    ["aqua-rat"]=20      # Algebraic word problems with rationales - EXPECTED good differentiation
+
+    # Multi-step reasoning datasets
+    ["drop"]=20          # Reading comprehension with discrete reasoning - EXPECTED excellent differentiation
+    ["strategyqa"]=20    # Multi-step implicit reasoning - EXPECTED good differentiation
+
+    # Scientific reasoning datasets
+    ["sciq"]=25          # Science questions requiring reasoning - EXPECTED moderate differentiation
+    ["openbookqa"]=20    # Elementary science with fact reasoning - EXPECTED moderate differentiation
+
+    # Disabled datasets with poor reasoning differentiation:
+    # ["arc-challenge"]=15  # 100% accuracy across all modes, minimal benefit
+    # ["commonsenseqa"]=20  # Same accuracy across modes, small token difference
+    # ["hellaswag"]=2       # Minimal differentiation, not reasoning-focused
 )
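+# Rough scale check (plain arithmetic from the per-category counts above):
+# 570 + 20 + 15 + 25 + 20 + 20 + 20 + 25 + 20 ≈ 735 samples per evaluation
+# pass (router, or a single vLLM mode)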

 echo -e "${BLUE}🔬 COMPREHENSIVE MULTI-DATASET BENCHMARK FOR RESEARCH${NC}"
@@ -136,14 +153,17 @@ source "$VENV_PATH/bin/activate"
 mkdir -p "$OUTPUT_BASE"
 mkdir -p "$(dirname "$PERSISTENT_RESEARCH_CSV")"

-# Initialize persistent research results CSV (create header only if file doesn't exist)
-if [[ ! -f "$PERSISTENT_RESEARCH_CSV" ]]; then
-    echo "Dataset,Mode,Model,Accuracy,Avg_Latency_ms,Avg_Total_Tokens,Sample_Count,Timestamp" > "$PERSISTENT_RESEARCH_CSV"
-    echo -e "${GREEN}📊 Created new master research CSV: $PERSISTENT_RESEARCH_CSV${NC}"
-else
-    echo -e "${BLUE}📊 Using existing master research CSV: $PERSISTENT_RESEARCH_CSV${NC}"
+# Backup and clear master research CSV for fresh results
+if [[ -f "$PERSISTENT_RESEARCH_CSV" ]]; then
+    BACKUP_CSV="${PERSISTENT_RESEARCH_CSV}.backup_$(date +%Y%m%d_%H%M%S)"
+    cp "$PERSISTENT_RESEARCH_CSV" "$BACKUP_CSV"
+    echo -e "${GREEN}📊 Backed up existing master CSV to: $BACKUP_CSV${NC}"
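+    # e.g. results/research_results_master.csv.backup_20250101_120000
+    # (illustrative timestamp, produced by the date pattern above)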
 fi

+# Create fresh master research CSV with header only
+echo "Dataset,Mode,Model,Accuracy,Avg_Latency_ms,Avg_Total_Tokens,Sample_Count,Timestamp" > "$PERSISTENT_RESEARCH_CSV"
+echo -e "${GREEN}📊 Created fresh master research CSV: $PERSISTENT_RESEARCH_CSV${NC}"
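+# Rows appended later follow the header shape, e.g. (illustrative values only):
+#   gsm8k,vLLM_All_Reasoning,<model>,0.880,2456.3,512.4,25,20250101_120000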
+
 # Also create a timestamped copy for this run
 RESEARCH_CSV="$OUTPUT_BASE/research_results.csv"
 cp "$PERSISTENT_RESEARCH_CSV" "$RESEARCH_CSV"
@@ -225,9 +245,12 @@ try:
     model_name = '$VLLM_MODEL'

     # For vLLM, we might have multiple modes (NR, NR_REASONING)
-    if '$mode' == 'vllm' and 'mode' in df.columns:
-        for mode_type in df['mode'].unique():
-            mode_df = df[df['mode'] == mode_type]
+    # Check both 'mode' and 'mode_label' columns for mode information
+    if '$mode' == 'vllm' and ('mode' in df.columns or 'mode_label' in df.columns):
+        # Use mode_label if available (more descriptive), otherwise use mode
+        mode_column = 'mode_label' if 'mode_label' in df.columns else 'mode'
+        for mode_type in df[mode_column].unique():
+            mode_df = df[df[mode_column] == mode_type]

             # Recalculate metrics for this specific mode using correct column names
             if 'is_correct' in mode_df.columns:
@@ -253,7 +276,17 @@ try:

                 mode_samples = len(mode_df)

-                csv_line = f'$dataset,vLLM_{mode_type},{model_name},{mode_accuracy:.3f},{mode_latency:.1f},{mode_tokens:.1f},{mode_samples},$timestamp'
+                # Map technical mode names to descriptive names
+                if mode_type == 'VLLM_NR':
+                    display_mode = 'vLLM_No_Reasoning'
+                elif mode_type == 'VLLM_NR_REASONING':
+                    display_mode = 'vLLM_All_Reasoning'
+                elif mode_type == 'VLLM_XC':
+                    display_mode = 'vLLM_CoT'
+                else:
+                    display_mode = mode_type  # Use the mode_label as-is if not recognized
+
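+                # The chain above is equivalent to a dict lookup (sketch):
+                #   display_mode = {'VLLM_NR': 'vLLM_No_Reasoning',
+                #                   'VLLM_NR_REASONING': 'vLLM_All_Reasoning',
+                #                   'VLLM_XC': 'vLLM_CoT'}.get(mode_type, mode_type)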
+                csv_line = f'$dataset,{display_mode},{model_name},{mode_accuracy:.3f},{mode_latency:.1f},{mode_tokens:.1f},{mode_samples},$timestamp'
                 print(f'  📝 Writing to CSV: {csv_line}', file=sys.stderr)
                 print(csv_line)
             else:
@@ -283,14 +316,17 @@ run_dataset_benchmark() {

     echo -e "${GREEN}📊 Benchmarking $dataset dataset ($samples samples per category)...${NC}"

-    # Router benchmark
+    # Router benchmark (pass vLLM info for consistent token calculation)
     echo -e "${YELLOW}🤖 Running router evaluation...${NC}"
     python3 -m vllm_semantic_router_bench.router_reason_bench_multi_dataset \
         --dataset "$dataset" \
         --samples-per-category "$samples" \
         --run-router \
         --router-endpoint "$ROUTER_ENDPOINT" \
         --router-models "$ROUTER_MODEL" \
+        --vllm-endpoint "$VLLM_ENDPOINT" \
+        --vllm-models "$VLLM_MODEL" \
+        --concurrent-requests "$CONCURRENT_REQUESTS" \
         --output-dir "$OUTPUT_BASE/router_$dataset" \
         --seed 42

@@ -307,41 +343,104 @@ run_dataset_benchmark() {
         --vllm-models "$VLLM_MODEL" \
         --vllm-exec-modes NR NR_REASONING \
         --output-dir "$OUTPUT_BASE/vllm_$dataset" \
+        --concurrent-requests "$CONCURRENT_REQUESTS" \
         --seed 42

     # Extract and save vLLM metrics immediately
     extract_and_save_metrics "$dataset" "vllm" "$OUTPUT_BASE/vllm_$dataset"

-    echo -e "${GREEN}✅ Completed $dataset benchmark${NC}"
+    # Generate updated comprehensive plots for current dataset
+    echo -e "${BLUE}📈 Updating comprehensive plots with $dataset results...${NC}"
+    generate_comprehensive_plot "$dataset"
+
+    echo -e "${GREEN}✅ Completed $dataset benchmark; comprehensive plots updated${NC}"
+    echo -e "${GREEN}📈 CSV data updated in: $PERSISTENT_RESEARCH_CSV${NC}"
     echo ""
 }

-# Function to generate comparison plots
-generate_plots() {
-    echo -e "${BLUE}📈 Generating comparison plots...${NC}"
+# Function to generate comprehensive plot with all completed datasets (called after each dataset completes)
+generate_comprehensive_plot() {
+    local current_dataset=$1

-    for dataset in "${!DATASET_CONFIGS[@]}"; do
-        echo -e "${YELLOW}📊 Plotting $dataset results...${NC}"
+    if [[ -n "$current_dataset" ]]; then
+        echo -e "${YELLOW}📊 Generating plot for current dataset: $current_dataset...${NC}"
+    else
+        echo -e "${YELLOW}📊 Generating comprehensive plot with all completed datasets...${NC}"
+    fi
+
+    # Use the plot_comprehensive_results.py script to generate updated charts
+    if [[ -f "plot_comprehensive_results.py" ]]; then
+        echo -e "${BLUE}Running comprehensive plotting script...${NC}"
+        # Use the current run's CSV instead of the master CSV to show only this run's results
+        PLOT_CMD="python3 plot_comprehensive_results.py \
+            --csv \"$RESEARCH_CSV\" \
+            --output-dir \"$OUTPUT_BASE\" \
+            --model-filter \"$VLLM_MODEL\""

+        # Add dataset filter if specified
+        if [[ -n "$current_dataset" ]]; then
+            PLOT_CMD="$PLOT_CMD --dataset-filter \"$current_dataset\""
+        fi
+
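+        # eval is used because PLOT_CMD is a string with embedded escaped quotes;
+        # building the arguments as a bash array would be a quoting-safe
+        # alternative (a possible refactor, not required here)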
+        eval $PLOT_CMD

-        # Find the summary.json files
-        ROUTER_SUMMARY=$(find "$OUTPUT_BASE/router_$dataset" -name "summary.json" -type f | head -1)
-        VLLM_SUMMARY=$(find "$OUTPUT_BASE/vllm_$dataset" -name "summary.json" -type f | head -1)
+        echo -e "${GREEN}✅ Comprehensive plots updated in $OUTPUT_BASE${NC}"
+
+        # Print actual paths of generated charts
+        if [[ -f "$OUTPUT_BASE/accuracy_comparison.png" ]]; then
+            echo -e "${GREEN}📊 Accuracy Chart: $OUTPUT_BASE/accuracy_comparison.png${NC}"
+        fi
+        if [[ -f "$OUTPUT_BASE/token_usage_comparison.png" ]]; then
+            echo -e "${GREEN}📊 Token Usage Chart: $OUTPUT_BASE/token_usage_comparison.png${NC}"
+        fi
+        if [[ -f "$OUTPUT_BASE/efficiency_analysis.png" ]]; then
+            echo -e "${GREEN}📊 Efficiency Chart: $OUTPUT_BASE/efficiency_analysis.png${NC}"
+        fi
+    else
+        echo -e "${RED}⚠️ plot_comprehensive_results.py not found, skipping comprehensive plots${NC}"
+    fi
+}
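+# Usage sketch: generate_comprehensive_plot "gsm8k" plots one dataset;
+# calling it with no argument plots all completed datasets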
+
+# Function to generate plot for a single dataset (kept for compatibility)
+generate_dataset_plot() {
+    local dataset=$1
+
+    echo -e "${YELLOW}📊 Plotting $dataset results...${NC}"
+
+    # Find the summary.json files
+    ROUTER_SUMMARY=$(find "$OUTPUT_BASE/router_$dataset" -name "summary.json" -type f | head -1)
+    VLLM_SUMMARY=$(find "$OUTPUT_BASE/vllm_$dataset" -name "summary.json" -type f | head -1)

-        if [[ -f "$VLLM_SUMMARY" ]]; then
-            PLOT_CMD="python3 -m vllm_semantic_router_bench.bench_plot --summary \"$VLLM_SUMMARY\" --out-dir \"$OUTPUT_BASE/plots_$dataset\""
+    if [[ -f "$VLLM_SUMMARY" ]]; then
+        PLOT_CMD="python3 -m vllm_semantic_router_bench.bench_plot --summary \"$VLLM_SUMMARY\" --out-dir \"$OUTPUT_BASE/plots_$dataset\""

-            if [[ -f "$ROUTER_SUMMARY" ]]; then
-                PLOT_CMD="$PLOT_CMD --router-summary \"$ROUTER_SUMMARY\""
-            fi
+        if [[ -f "$ROUTER_SUMMARY" ]]; then
+            PLOT_CMD="$PLOT_CMD --router-summary \"$ROUTER_SUMMARY\""
+        fi
+
+        echo -e "${BLUE}Running: $PLOT_CMD${NC}"
+        eval $PLOT_CMD
+        echo -e "${GREEN}✅ $dataset plots generated in $OUTPUT_BASE/plots_$dataset${NC}"
+    else
+        echo -e "${RED}⚠️ No vLLM summary.json found for $dataset, skipping plots${NC}"
+    fi
+}

-        echo -e "${BLUE}Running: $PLOT_CMD${NC}"
-        eval $PLOT_CMD
+# Function to generate comparison plots (now just calls individual dataset plots)
+generate_plots() {
+    echo -e "${BLUE}📈 Generating any remaining comparison plots...${NC}"
+
+    for dataset in "${!DATASET_CONFIGS[@]}"; do
+        # Check if plots already exist
+        if [[ ! -d "$OUTPUT_BASE/plots_$dataset" ]]; then
+            echo -e "${YELLOW}📊 Generating missing plots for $dataset...${NC}"
+            generate_dataset_plot "$dataset"
         else
-            echo -e "${RED}⚠️ No vLLM summary.json found for $dataset, skipping plots${NC}"
+            echo -e "${GREEN}✅ Plots for $dataset already exist${NC}"
         fi
     done

-    echo -e "${GREEN}✅ All plots generated${NC}"
+    echo -e "${GREEN}✅ All plots verified/generated${NC}"
     echo ""
 }

372471 " mmlu" )
373472 echo " | MMLU | $samples | ~570 | 57 subjects | Academic Knowledge |" >> " $summary_file "
374473 ;;
375- " arc" )
376- echo " | ARC | $samples | $samples | 1 (Science) | Scientific Reasoning |" >> " $summary_file "
474+ " arc-challenge " )
475+ echo " | ARC-Challenge | $samples | $samples | 1 (Science) | Scientific Reasoning (Hard) |" >> " $summary_file "
377476 ;;
378477 " gpqa" )
379478 echo " | GPQA | $samples | $samples | 1 (Graduate) | Graduate-level Q&A |" >> " $summary_file "
385484 echo " | CommonsenseQA | $samples | $samples | 1 (Common Sense) | Commonsense Reasoning |" >> " $summary_file "
386485 ;;
387486 " hellaswag" )
388- echo " | HellaSwag | $samples | ~400 | ~50 activities | Commonsense NLI |" >> " $summary_file "
487+ echo " | HellaSwag | $samples | ~100 | ~50 activities | Commonsense NLI |" >> " $summary_file "
389488 ;;
390489 esac
391490 done

 ### Accuracy Comparison
 - Router (auto model with reasoning): See research_results.csv
-- vLLM Direct (NR mode): See research_results.csv
-- vLLM Direct (NR_REASONING mode): See research_results.csv
+- vLLM Direct (No Reasoning): See research_results.csv
+- vLLM Direct (All Reasoning): See research_results.csv

 ### Token Usage Analysis
 - Average tokens per response by dataset and mode (in research_results.csv)

 - **Seed**: 42 (for reproducibility)
 - **Router Mode**: Auto model selection with reasoning
-- **vLLM Modes**: NR (neutral) and NR_REASONING (with reasoning)
+- **vLLM Modes**: No Reasoning and All Reasoning
 - **Sample Strategy**: Stratified sampling per category
 - **Evaluation**: Exact match accuracy and token usage

 echo -e "${BLUE}🚀 Starting comprehensive benchmark...${NC}"
 start_time=$(date +%s)

-# Run benchmarks for all datasets
-for dataset in "${!DATASET_CONFIGS[@]}"; do
+# Run benchmarks for reasoning-focused datasets (GPQA first for quick feedback)
+DATASET_ORDER=("gpqa" "truthfulqa" "gsm8k" "aqua-rat" "sciq" "openbookqa" "strategyqa" "drop" "mmlu")
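+# Note: only datasets listed in DATASET_ORDER run, and entries missing from
+# DATASET_CONFIGS are skipped by the check inside the loop below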
+dataset_count=0
+total_datasets=${#DATASET_ORDER[@]}
+
+for dataset in "${DATASET_ORDER[@]}"; do
+    # Skip if dataset not configured
+    if [[ -z "${DATASET_CONFIGS[$dataset]}" ]]; then
+        echo -e "${YELLOW}⚠️ Dataset $dataset not configured, skipping...${NC}"
+        continue
+    fi
+
+    dataset_count=$((dataset_count + 1))
+    echo -e "${BLUE}🚀 Progress: Dataset $dataset_count/$total_datasets - Starting $dataset${NC}"
     run_dataset_benchmark "$dataset"
+    echo -e "${GREEN}🎉 Progress: Dataset $dataset_count/$total_datasets - Completed $dataset${NC}"
+    echo -e "${YELLOW}📊 Remaining datasets: $((total_datasets - dataset_count))${NC}"
+    echo ""
 done

 # Generate plots
@@ -489,7 +603,16 @@ echo -e "${BLUE}📋 Next Steps:${NC}"
 echo "1. 📊 **Master research data**: $PERSISTENT_RESEARCH_CSV"
 echo "2. 📊 **This run's data**: $OUTPUT_BASE/research_results.csv"
 echo "3. 📋 Review research summary: $OUTPUT_BASE/RESEARCH_SUMMARY.md"
-echo "4. 📈 Examine plots for visual insights"
+echo "4. 📈 **View comprehensive charts**:"
+if [[ -f "$OUTPUT_BASE/accuracy_comparison.png" ]]; then
+    echo "   📊 Accuracy: $OUTPUT_BASE/accuracy_comparison.png"
+fi
+if [[ -f "$OUTPUT_BASE/token_usage_comparison.png" ]]; then
+    echo "   📊 Token Usage: $OUTPUT_BASE/token_usage_comparison.png"
+fi
+if [[ -f "$OUTPUT_BASE/efficiency_analysis.png" ]]; then
+    echo "   📊 Efficiency: $OUTPUT_BASE/efficiency_analysis.png"
+fi
 echo "5. 📄 Analyze detailed CSV files if needed"
 echo ""
 echo -e "${GREEN}🎓 Research CSV Format:${NC}"