Skip to content

Commit f64daac

Browse files
authored
Include multi-modal eval in eval scripts (#3133)
Summary: We added a multi-modal quality eval using the lmms_eval library; see the README for instructions. Test Plan: Tested locally: ``` sh eval.sh --eval_type mm_quality --model_ids google/gemma-3-12b-it --mm_tasks chartqa --model_type gemma3 --mm_eval_batch_size 32 ``` and ``` sh eval.sh --eval_type mm_quality --model_ids google/gemma-3-12b-it --mm_tasks chartqa --model_type gemma3 --mm_eval_batch_size 32 --use_cache ``` Reviewers: Subscribers: Tasks: Tags:
1 parent c7b8e13 commit f64daac

File tree

4 files changed

+92
-3
lines changed

4 files changed

+92
-3
lines changed

.github/scripts/torchao_model_releases/README.md

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,12 +140,36 @@ After environment is setup, we can run eval:
140140
sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag mmlu
141141
```
142142

143+
See https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks for all supported tasks.
144+
143145
Note: you can pass in `--use_cache` if the eval task failed during the middle of the run
144-
and you don't want to re-run all evals.
146+
and you don't want to re-run all evals and there is no change to the model checkpoint.
145147
```
146148
sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag mmlu --use_cache
147149
```
148150

151+
#### Multi-modal Model Quality Eval
152+
For multi-modal model quality eval, we need to install lmms-eval
153+
```
154+
uv pip install git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git
155+
```
156+
After environment is setup, we can run eval:
157+
```
158+
sh eval.sh --eval_type mm_quality --model_ids google/gemma-3-12b-it --mm_tasks chartqa --model_type gemma3 --mm_eval_batch_size 32
159+
```
160+
161+
See https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/models/simple for supported model types.
162+
See https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/tasks for supported multi-modal tasks.
163+
164+
Note: a larger `mm_eval_batch_size` can speed up eval but may cause OOM; when that happens, please reduce the batch size.
165+
166+
Note: you can pass in `--use_cache` if the eval task failed during the middle of the run
167+
and you don't want to re-run all evals and there is no change to the model checkpoint.
168+
```
169+
sh eval.sh --eval_type mm_quality --model_ids google/gemma-3-12b-it --mm_tasks chartqa --model_type gemma3 --mm_eval_batch_size 32 --use_cache
170+
```
171+
172+
Alternatively, please feel free to use the example scripts directly from the lmms-eval repo: https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/examples/models to run the evaluation.
149173

150174
#### Summarize results
151175
After we have finished all evals for each model, we can summarize the results with:

.github/scripts/torchao_model_releases/eval.sh

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@ MODEL_ID_ARRAY=()
1919
EVAL_TYPE="all"
2020
# these will be parsed in the other scripts
2121
BATCH_SIZES="1 256" # Default for latency eval
22+
MM_EVAL_BATCH_SIZE=1 # Default batch size for mm quality eval
2223
TASKS="mmlu" # Default for quality eval
24+
MM_TASKS="chartqa" # Default for multi-modal quality eval (not included in all)
25+
MODEL_TYPE=""
2326
USE_CACHE=false # default: do not use cache
2427
# Parse arguments
2528
while [[ $# -gt 0 ]]; do
@@ -50,6 +53,10 @@ while [[ $# -gt 0 ]]; do
5053
BATCH_SIZES="$1"
5154
shift
5255
;;
56+
--mm_eval_batch_size)
57+
MM_EVAL_BATCH_SIZE="$2"
58+
shift 2
59+
;;
5360
--tasks)
5461
shift
5562
if [[ $# -eq 0 ]]; then
@@ -59,6 +66,24 @@ while [[ $# -gt 0 ]]; do
5966
TASKS="$1"
6067
shift
6168
;;
69+
--model_type)
70+
shift
71+
if [[ $# -eq 0 ]]; then
72+
echo "Error: --model_type requires a value"
73+
exit 1
74+
fi
75+
MODEL_TYPE="$1"
76+
shift
77+
;;
78+
--mm_tasks)
79+
shift
80+
if [[ $# -eq 0 ]]; then
81+
echo "Error: --mm_tasks requires a value"
82+
exit 1
83+
fi
84+
MM_TASKS="$1"
85+
shift
86+
;;
6287
--use_cache)
6388
USE_CACHE=true
6489
shift
@@ -93,6 +118,16 @@ run_quality() {
93118
sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS
94119
fi
95120
}
121+
run_mm_quality() {
122+
check_lmms_eval
123+
local model_id="$1"
124+
echo "run_mm_quality" $model_id $MODEL_TYPE
125+
if $USE_CACHE; then
126+
sh eval_mm_quality.sh --model_ids "$model_id" --tasks $MM_TASKS --model_type $MODEL_TYPE --batch_size $MM_EVAL_BATCH_SIZE --use_cache
127+
else
128+
sh eval_mm_quality.sh --model_ids "$model_id" --tasks $MM_TASKS --model_type $MODEL_TYPE --batch_size $MM_EVAL_BATCH_SIZE
129+
fi
130+
}
96131
for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
97132
case "$EVAL_TYPE" in
98133
memory)
@@ -104,6 +139,9 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
104139
quality)
105140
run_quality "$MODEL_ID"
106141
;;
142+
mm_quality)
143+
run_mm_quality "$MODEL_ID"
144+
;;
107145
all)
108146
run_quality "$MODEL_ID"
109147
run_memory "$MODEL_ID"

.github/scripts/torchao_model_releases/eval_env_checks.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,10 @@ check_lm_eval() {
2424
exit 1
2525
fi
2626
}
27+
28+
check_lmms_eval() {
29+
if ! pip show lmms_eval > /dev/null 2>&1; then
30+
echo "Error: lmms_eval package is NOT installed. please install with `uv pip install git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git`" >&2
31+
exit 1
32+
fi
33+
}

.github/scripts/torchao_model_releases/summarize_results.sh

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
5151
PATTERN="pretrained=${MODEL_ID}"
5252
LAST_LINE=$(grep -n "$PATTERN" "$Q_LOG" | tail -1 | cut -d: -f1)
5353
if [ -n "$LAST_LINE" ]; then
54-
echo "--- Quality log: $Q_LOG (lines starting from $((LAST_LINE + 1))) ---"
55-
tail -n +"$((LAST_LINE + 1))" "$Q_LOG"
54+
echo "--- Quality log: $Q_LOG (lines starting from $((LAST_LINE))) ---"
55+
tail -n +"$((LAST_LINE))" "$Q_LOG"
5656
else
5757
echo "Pattern not found in $Q_LOG"
5858
fi
@@ -61,6 +61,26 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
6161
echo "--- No quality logs found matching pattern: $QUALITY_LOG_PATTERN"
6262
fi
6363

64+
MM_QUALITY_LOG_PATTERN="${SAFE_MODEL_ID}_mm_quality_*.log"
65+
# Multi-modal Quality logs (multiple files, one per task)
66+
MM_QUALITY_LOGS=( $MM_QUALITY_LOG_PATTERN )
67+
if [ -e "${MM_QUALITY_LOGS[0]}" ]; then
68+
for Q_LOG in "${MM_QUALITY_LOGS[@]}"; do
69+
# find last appearance of pretrained={MODEL_ID} and
70+
# extract all lines after that
71+
PATTERN="pretrained=${MODEL_ID}"
72+
LAST_LINE=$(grep -n "$PATTERN" "$Q_LOG" | tail -1 | cut -d: -f1)
73+
if [ -n "$LAST_LINE" ]; then
74+
echo "--- Multi-modal Quality log: $Q_LOG (lines starting from $((LAST_LINE))) ---"
75+
tail -n +"$((LAST_LINE))" "$Q_LOG"
76+
else
77+
echo "Pattern not found in $Q_LOG"
78+
fi
79+
done
80+
else
81+
echo "--- No quality logs found matching pattern: $MM_QUALITY_LOG_PATTERN"
82+
fi
83+
6484
MEMORY_LOG="${SAFE_MODEL_ID}_memory.log"
6585
if [ -f "$MEMORY_LOG" ]; then
6686
echo "--- Memory log (last 1 lines) ---"

0 commit comments

Comments
 (0)