Merged
Changes from all commits
Commits
26 commits
4e69ecc
Replace os.environ with monkeypatch in test suite
t-sibiraj Mar 9, 2025
0f9ba59
Replace os.environ with monkeypatch in test suite
t-sibiraj Mar 9, 2025
44c73bb
Update tests/basic_correctness/test_cumem.py
t-sibiraj Mar 9, 2025
5f28f2d
fix: Use `raising=True` in `monkeypatch.delenv` for safer environment…
t-sibiraj Mar 9, 2025
8436a5f
doing a signed commit
t-sibiraj Mar 12, 2025
b791b51
remove unnecessary spacing
t-sibiraj Mar 12, 2025
6971e83
Intentionally added and removed a space to test this commit, so I cou…
t-sibiraj Mar 12, 2025
5556ee1
add type annotations for monkeypatch in the function header and add …
t-sibiraj Mar 12, 2025
fcd4fac
changed the order of the parameter in function header
t-sibiraj Mar 12, 2025
1df0770
Resolved merge conflict
t-sibiraj Mar 12, 2025
d3b91d8
remove unnecessary arguments passed to function
t-sibiraj Mar 13, 2025
86f42e8
add necessary arguments to the check_full_graph_support function
t-sibiraj Mar 13, 2025
939200d
reverted to use os.environ since pytest cannot be passed to a m…
t-sibiraj Mar 13, 2025
c4cdf83
pre commit changes
t-sibiraj Mar 13, 2025
d11e7e6
add missing monkeypatch type annotations in function header
t-sibiraj Mar 14, 2025
ebff29c
To resolve merge conflict
t-sibiraj Mar 14, 2025
83ca30b
move monkeypatch.setenv within monkeypatch.context
t-sibiraj Mar 15, 2025
34bc647
Merge branch 'main' into replace-os-environ-monkeypatch
t-sibiraj Mar 15, 2025
7139925
Apply patch before 4932bcd
t-sibiraj Mar 15, 2025
48b9bf4
fix: correct tests
aarnphm Mar 15, 2025
bb8540c
fix: correct types
aarnphm Mar 15, 2025
7430fc6
fix: ignore otel imports mypy
aarnphm Mar 15, 2025
9191625
fix: types
aarnphm Mar 15, 2025
e324fb2
fix: distributed tests with dispatch
aarnphm Mar 15, 2025
5266dff
chore: fix tests
aarnphm Mar 16, 2025
a46f271
fix: precommit error
aarnphm Mar 16, 2025
2 changes: 1 addition & 1 deletion .buildkite/test-pipeline.yaml
@@ -521,7 +521,7 @@ steps:
# TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py

- label: Plugin Tests (2 GPUs) # 40min
working_dir: "/vllm-workspace/tests"
115 changes: 63 additions & 52 deletions tests/basic_correctness/test_basic_correctness.py
@@ -47,6 +47,7 @@ def test_vllm_gc_ed():
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False])
def test_models(
monkeypatch: pytest.MonkeyPatch,
hf_runner,
model: str,
backend: str,
@@ -63,31 +64,33 @@ def test_models(
pytest.skip(
f"{backend} does not support gemma2 with full context length.")

os.environ["VLLM_ATTENTION_BACKEND"] = backend
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", backend)

# 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window
prompt = "The following numbers of the sequence " + ", ".join(
str(i) for i in range(1024)) + " are:"
example_prompts = [prompt]
# 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window
prompt = "The following numbers of the sequence " + ", ".join(
str(i) for i in range(1024)) + " are:"
example_prompts = [prompt]

with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

with VllmRunner(model,
max_model_len=8192,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(model,
max_model_len=8192,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)

check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)


@multi_gpu_test(num_gpus=2)
@@ -104,6 +107,7 @@ def test_models(
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
])
def test_models_distributed(
monkeypatch: pytest.MonkeyPatch,
hf_runner,
vllm_runner,
example_prompts,
@@ -116,34 +120,41 @@ def test_models_distributed(
if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}")

if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# test Ray Compiled Graph
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"

if attention_backend:
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend

dtype = "half"
max_tokens = 5

# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model,
dtype=dtype,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
with monkeypatch.context() as monkeypatch_context:
Collaborator (review comment): You have to add the whole block within this context manager...

if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# test Ray Compiled Graph
monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")

if attention_backend:
monkeypatch_context.setenv(
"VLLM_ATTENTION_BACKEND",
attention_backend,
)

dtype = "half"
max_tokens = 5

# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method
# (the default method).
with vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)

with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
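
To make the review comment above concrete, here is a minimal, hypothetical sketch (the `read_backend` helper stands in for the vLLM runner and is not from the PR): if only `setenv` sits inside the context manager, the variable is already restored by the time the consumer runs, so the runner setup and the assertions all have to live inside the `with` block, as the final version of `test_models_distributed` does.

```python
import os

import pytest


def read_backend() -> str:
    # Hypothetical stand-in for the vLLM runner reading its configuration.
    return os.environ.get("VLLM_ATTENTION_BACKEND", "<unset>")


def test_context_scoping(monkeypatch: pytest.MonkeyPatch):
    # Start from a clean slate for the illustration.
    monkeypatch.delenv("VLLM_ATTENTION_BACKEND", raising=False)

    # Too narrow: the variable is reverted as soon as the context exits,
    # so code running after the block never sees it.
    with monkeypatch.context() as m:
        m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
    assert read_backend() == "<unset>"

    # What the review asks for: everything that depends on the variable,
    # including the runner setup, executes inside the block.
    with monkeypatch.context() as m:
        m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
        assert read_backend() == "FLASHINFER"
```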