Skip to content

Commit f64daac

Browse files
authored
Include multi-modal eval in eval scripts (#3133)
Summary: We added a multi-modal quality eval using the lmms_eval library; see the README for instructions. Test Plan: Tested locally: ``` sh eval.sh --eval_type mm_quality --model_ids google/gemma-3-12b-it --mm_tasks chartqa --model_type gemma3 --mm_eval_batch_size 32 ``` and ``` sh eval.sh --eval_type mm_quality --model_ids google/gemma-3-12b-it --mm_tasks chartqa --model_type gemma3 --mm_eval_batch_size 32 --use_cache ``` Reviewers: Subscribers: Tasks: Tags:
1 parent c7b8e13 commit f64daac

File tree

4 files changed

+92
-3
lines changed

4 files changed

+92
-3
lines changed

.github/scripts/torchao_model_releases/README.md

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,12 +140,36 @@ After environment is setup, we can run eval:
140140
sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag mmlu
141141
```
142142

143+
See https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks for all supported tasks.
144+
143145
Note: you can pass in `--use_cache` if the eval task failed during the middle of the run
144-
and you don't want to re-run all evals.
146+
and you don't want to re-run all evals and there is no change to the model checkpoint.
145147
```
146148
sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag mmlu --use_cache
147149
```
148150

151+
#### Multi-modal Model Quality Eval
152+
For multi-modal model quality eval, we need to install lmms-eval
153+
```
154+
uv pip install git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git
155+
```
156+
After environment is setup, we can run eval:
157+
```
158+
sh eval.sh --eval_type mm_quality --model_ids google/gemma-3-12b-it --mm_tasks chartqa --model_type gemma3 --mm_eval_batch_size 32
159+
```
160+
161+
See https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/models/simple for supported model types.
162+
See https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/tasks for supported multi-modal tasks.
163+
164+
Note: a larger `mm_eval_batch_size` can speed up eval but may cause OOM; when that happens, please reduce the batch size.
165+
166+
Note: you can pass in `--use_cache` if the eval task failed during the middle of the run
167+
and you don't want to re-run all evals and there is no change to the model checkpoint.
168+
```
169+
sh eval.sh --eval_type mm_quality --model_ids google/gemma-3-12b-it --mm_tasks chartqa --model_type gemma3 --mm_eval_batch_size 32 --use_cache
170+
```
171+
172+
Alternatively, please feel free to use the example scripts directly from the lmms-eval repo: https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/examples/models to run the evaluation.
149173

150174
#### Summarize results
151175
After we have finished all evals for each model, we can summarize the results with:

.github/scripts/torchao_model_releases/eval.sh

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@ MODEL_ID_ARRAY=()
1919
EVAL_TYPE="all"
2020
# these will be parsed in the other scripts
2121
BATCH_SIZES="1 256" # Default for latency eval
22+
MM_EVAL_BATCH_SIZE=1 # Default batch size for mm quality eval
2223
TASKS="mmlu" # Default for quality eval
24+
MM_TASKS="chartqa" # Default for multi-modal quality eval (not included in all)
25+
MODEL_TYPE=""
2326
USE_CACHE=false # default: do not use cache
2427
# Parse arguments
2528
while [[ $# -gt 0 ]]; do
@@ -50,6 +53,10 @@ while [[ $# -gt 0 ]]; do
5053
BATCH_SIZES="$1"
5154
shift
5255
;;
56+
--mm_eval_batch_size)
57+
MM_EVAL_BATCH_SIZE="$2"
58+
shift 2
59+
;;
5360
--tasks)
5461
shift
5562
if [[ $# -eq 0 ]]; then
@@ -59,6 +66,24 @@ while [[ $# -gt 0 ]]; do
5966
TASKS="$1"
6067
shift
6168
;;
69+
--model_type)
70+
shift
71+
if [[ $# -eq 0 ]]; then
72+
echo "Error: --model_type requires a value"
73+
exit 1
74+
fi
75+
MODEL_TYPE="$1"
76+
shift
77+
;;
78+
--mm_tasks)
79+
shift
80+
if [[ $# -eq 0 ]]; then
81+
echo "Error: --mm_tasks requires a value"
82+
exit 1
83+
fi
84+
MM_TASKS="$1"
85+
shift
86+
;;
6287
--use_cache)
6388
USE_CACHE=true
6489
shift
@@ -93,6 +118,16 @@ run_quality() {
93118
sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS
94119
fi
95120
}
121+
run_mm_quality() {
122+
check_lmms_eval
123+
local model_id="$1"
124+
echo "run_mm_quality" $model_id $MODEL_TYPE
125+
if $USE_CACHE; then
126+
sh eval_mm_quality.sh --model_ids "$model_id" --tasks $MM_TASKS --model_type $MODEL_TYPE --batch_size $MM_EVAL_BATCH_SIZE --use_cache
127+
else
128+
sh eval_mm_quality.sh --model_ids "$model_id" --tasks $MM_TASKS --model_type $MODEL_TYPE --batch_size $MM_EVAL_BATCH_SIZE
129+
fi
130+
}
96131
for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
97132
case "$EVAL_TYPE" in
98133
memory)
@@ -104,6 +139,9 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
104139
quality)
105140
run_quality "$MODEL_ID"
106141
;;
142+
mm_quality)
143+
run_mm_quality "$MODEL_ID"
144+
;;
107145
all)
108146
run_quality "$MODEL_ID"
109147
run_memory "$MODEL_ID"

.github/scripts/torchao_model_releases/eval_env_checks.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,10 @@ check_lm_eval() {
2424
exit 1
2525
fi
2626
}
27+
28+
check_lmms_eval() {
29+
if ! pip show lmms_eval > /dev/null 2>&1; then
30+
echo "Error: lmms_eval package is NOT installed. please install with `uv pip install git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git`" >&2
31+
exit 1
32+
fi
33+
}

.github/scripts/torchao_model_releases/summarize_results.sh

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
5151
PATTERN="pretrained=${MODEL_ID}"
5252
LAST_LINE=$(grep -n "$PATTERN" "$Q_LOG" | tail -1 | cut -d: -f1)
5353
if [ -n "$LAST_LINE" ]; then
54-
echo "--- Quality log: $Q_LOG (lines starting from $((LAST_LINE + 1))) ---"
55-
tail -n +"$((LAST_LINE + 1))" "$Q_LOG"
54+
echo "--- Quality log: $Q_LOG (lines starting from $((LAST_LINE))) ---"
55+
tail -n +"$((LAST_LINE))" "$Q_LOG"
5656
else
5757
echo "Pattern not found in $Q_LOG"
5858
fi
@@ -61,6 +61,26 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
6161
echo "--- No quality logs found matching pattern: $QUALITY_LOG_PATTERN"
6262
fi
6363

64+
MM_QUALITY_LOG_PATTERN="${SAFE_MODEL_ID}_mm_quality_*.log"
65+
# Multi-modal Quality logs (multiple files, one per task)
66+
MM_QUALITY_LOGS=( $MM_QUALITY_LOG_PATTERN )
67+
if [ -e "${MM_QUALITY_LOGS[0]}" ]; then
68+
for Q_LOG in "${MM_QUALITY_LOGS[@]}"; do
69+
# find last appearance of pretrained={MODEL_ID} and
70+
# extract all lines after that
71+
PATTERN="pretrained=${MODEL_ID}"
72+
LAST_LINE=$(grep -n "$PATTERN" "$Q_LOG" | tail -1 | cut -d: -f1)
73+
if [ -n "$LAST_LINE" ]; then
74+
echo "--- Multi-modal Quality log: $Q_LOG (lines starting from $((LAST_LINE))) ---"
75+
tail -n +"$((LAST_LINE))" "$Q_LOG"
76+
else
77+
echo "Pattern not found in $Q_LOG"
78+
fi
79+
done
80+
else
81+
echo "--- No quality logs found matching pattern: $MM_QUALITY_LOG_PATTERN"
82+
fi
83+
6484
MEMORY_LOG="${SAFE_MODEL_ID}_memory.log"
6585
if [ -f "$MEMORY_LOG" ]; then
6686
echo "--- Memory log (last 1 lines) ---"

0 commit comments

Comments
 (0)