Skip to content

Commit 0f4f019

Browse files
authored
[CI/Build] Replace lm-eval gsm8k tests with faster implementation (#23002)
Signed-off-by: mgoin <[email protected]>
1 parent a38b8af commit 0f4f019

12 files changed

+476
-3
lines changed

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -451,13 +451,11 @@ steps:
451451

452452
- label: LM Eval Small Models # 53min
453453
mirror_hardwares: [amdexperimental]
454-
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
455454
source_file_dependencies:
456455
- csrc/
457456
- vllm/model_executor/layers/quantization
458457
commands:
459-
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
460-
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
458+
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
461459

462460
- label: OpenAI API correctness
463461
mirror_hardwares: [amdexperimental]

tests/evals/gsm8k/README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# GSM8K Accuracy Evaluation
2+
3+
This directory contains a replacement for the lm-eval-harness GSM8K evaluation, using an isolated GSM8K script and vLLM server for better performance and control.
4+
5+
## Usage
6+
7+
### Run tests with pytest (like buildkite)
8+
9+
```bash
10+
pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \
11+
--config-list-file=configs/models-small.txt \
12+
--tp-size=1
13+
```
14+
15+
### Run standalone evaluation script
16+
17+
```bash
18+
# Start vLLM server first
19+
vllm serve Qwen/Qwen2.5-1.5B-Instruct --port 8000
20+
21+
# Run evaluation
22+
python tests/evals/gsm8k/gsm8k_eval.py --port 8000
23+
```
24+
25+
## Configuration Format
26+
27+
Model configs in `configs/` directory use this YAML format:
28+
29+
```yaml
30+
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
31+
accuracy_threshold: 0.54 # Minimum expected accuracy
32+
num_questions: 1319 # Number of questions (default: full test set)
33+
num_fewshot: 5 # Few-shot examples from train set
34+
max_model_len: 4096 # Model context length
35+
```

tests/evals/gsm8k/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
2+
accuracy_threshold: 0.74
3+
num_questions: 1319
4+
num_fewshot: 5
5+
max_model_len: 4096
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
2+
accuracy_threshold: 0.31
3+
num_questions: 1319
4+
num_fewshot: 5
5+
max_model_len: 4096
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
2+
accuracy_threshold: 0.45
3+
num_questions: 1319
4+
num_fewshot: 5
5+
max_model_len: 4096
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
2+
accuracy_threshold: 0.60
3+
num_questions: 1319
4+
num_fewshot: 5
5+
max_model_len: 4096
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
model_name: "Qwen/Qwen3-0.6B-FP8"
2+
accuracy_threshold: 0.375
3+
num_questions: 1319
4+
num_fewshot: 5
5+
max_model_len: 4096
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Qwen3-0.6B-FP8.yaml
2+
Llama-3.2-1B-Instruct-INT8-CT.yaml
3+
Llama-3-8B-Instruct-nonuniform-CT.yaml
4+
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
5+
Qwen1.5-MoE-W4A16-CT.yaml

tests/evals/gsm8k/conftest.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
4+
from pathlib import Path
5+
6+
7+
def pytest_addoption(parser):
    """Register this test suite's command line flags with pytest.

    Args:
        parser: The pytest option parser the flags are added to.
    """
    # Flag name -> keyword arguments forwarded verbatim to ``addoption``.
    option_specs = (
        (
            "--config-list-file",
            {
                "default": "configs/models-small.txt",
                "help": "File containing list of config files to test",
            },
        ),
        (
            "--tp-size",
            {
                "default": 1,
                "type": int,
                "help": "Tensor parallel size",
            },
        ),
    )
    for flag, settings in option_specs:
        parser.addoption(flag, **settings)
16+
17+
18+
def pytest_generate_tests(metafunc):
    """Parametrize tests requesting ``config_filename`` from a config list.

    Reads ``--config-list-file`` and ``--tp-size`` from the pytest config.
    A relative list path is looked up next to this file first and then
    relative to the current working directory. Every non-blank,
    non-``#`` line of the list names a config file located in the same
    directory as the list; entries that do not exist on disk are reported
    and dropped.

    The test is always parametrized over ``(config_filename, tp_size)``.
    When no config file is found the parameter list is empty, which lets
    pytest skip the test instead of erroring on an unresolved
    ``config_filename`` fixture.

    Args:
        metafunc: The pytest ``Metafunc`` for the test being collected.
    """
    if "config_filename" not in metafunc.fixturenames:
        return

    config_list_file = metafunc.config.getoption("--config-list-file")
    tp_size = int(metafunc.config.getoption("--tp-size"))

    # Handle both relative and absolute paths
    config_list_path = Path(config_list_file)
    if not config_list_path.is_absolute():
        # If relative, try relative to test directory first
        test_dir_path = Path(__file__).parent / config_list_file
        if test_dir_path.exists():
            config_list_path = test_dir_path
        else:
            # Try relative to current working directory
            config_list_path = Path.cwd() / config_list_file

    print(f"Looking for config list at: {config_list_path}")

    config_files = []
    if config_list_path.exists():
        # Config files live in the same directory as the list file
        config_dir = config_list_path.parent

        with open(config_list_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # Skip blank lines and comments
                if not line or line.startswith("#"):
                    continue
                config_path = config_dir / line
                print(f"Checking config file: {config_path}")
                if config_path.exists():
                    config_files.append(config_path)
                    print(f"  ✓ Found: {config_path}")
                else:
                    print(f"  ✗ Missing: {config_path}")
    else:
        print(f"Config list file not found: {config_list_path}")

    if not config_files:
        print("No config files found, test will be skipped")

    test_ids = [f"{config_file.stem}-tp{tp_size}" for config_file in config_files]

    # Parametrize even when the list is empty: pytest then marks the test
    # as skipped (its default for empty parameter sets) rather than failing
    # because the ``config_filename`` fixture cannot be resolved.
    metafunc.parametrize(
        ["config_filename", "tp_size"],
        [(config_file, tp_size) for config_file in config_files],
        ids=test_ids or None,
    )

0 commit comments

Comments
 (0)