Commits (71)
67fe9c7
wip
abf149 Oct 7, 2024
de7e6c2
conftest cross-merge
abf149 Oct 7, 2024
3ed32e2
fallback test cross-merge
abf149 Oct 7, 2024
fe12a95
added assert which fails when beam search is performed with multi-ste…
abf149 Oct 7, 2024
b2de4aa
removed best_of argument from conftest
abf149 Oct 7, 2024
9e0a5d7
fewer test cases
abf149 Oct 7, 2024
1ac45ed
set up test
abf149 Oct 7, 2024
c22f5cd
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 9, 2024
9cd6b2e
SchedulerConfig allows multi-step override; scheduler updates override
abf149 Oct 9, 2024
f8e3d50
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 9, 2024
50dd68a
added force_single_step to ExecuteModelRequest
abf149 Oct 9, 2024
5aaff1d
format
abf149 Oct 9, 2024
68d5334
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 10, 2024
350f990
passing tests; format
abf149 Oct 10, 2024
b8c5a9a
formatting; passing eager mode & beam search
abf149 Oct 10, 2024
9f781a3
refactor & format
abf149 Oct 10, 2024
28e6303
test cases
abf149 Oct 10, 2024
7eb4f59
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 10, 2024
7852355
formatting
abf149 Oct 10, 2024
a9b2d01
checkout
abf149 Oct 10, 2024
154f3c2
refactor
abf149 Oct 10, 2024
9fea4e5
refactor; checkout
abf149 Oct 10, 2024
7d3a479
format
abf149 Oct 10, 2024
2c1afdd
refactor; format
abf149 Oct 10, 2024
16763d7
refactor
abf149 Oct 10, 2024
23c3538
comment
abf149 Oct 10, 2024
bb431a2
Update vllm/core/scheduler.py
afeldman-nm Oct 10, 2024
24f0fc1
wip async
abf149 Oct 11, 2024
2b5a60c
Merge branch 'afeldman-nm/multi_step_best_of' of https://github.com/n…
abf149 Oct 11, 2024
4010bd5
branch merge
abf149 Oct 11, 2024
a017fa9
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 11, 2024
f8218d4
Merge branch 'multi_step_best_of' into multi_step_best_of_impl
abf149 Oct 11, 2024
d1c80a7
wip
abf149 Oct 11, 2024
b90ed74
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 11, 2024
1139b5e
Merge branch 'multi_step_best_of' into multi_step_best_of_impl
abf149 Oct 11, 2024
80f9b4a
async fallback tests passing
abf149 Oct 11, 2024
3d453c1
formatting
abf149 Oct 11, 2024
e66b669
logprobs test case
abf149 Oct 11, 2024
3b3bae7
refactoring
abf149 Oct 11, 2024
8773fe3
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 13, 2024
d6b8a4e
wip
abf149 Oct 14, 2024
e3ddf92
wip
abf149 Oct 14, 2024
684614a
wip
abf149 Oct 14, 2024
f8bc7cf
Merge branch 'main' into multi_step_best_of
abf149 Oct 14, 2024
892cb3b
refactoring scheduling multi-step field names
abf149 Oct 14, 2024
ab1d499
small fixes; formatting
abf149 Oct 14, 2024
489dcd0
alternate approach to tracking best_of > 1 requests
abf149 Oct 14, 2024
6845404
small fix; format
abf149 Oct 14, 2024
84c47f6
formatting
abf149 Oct 14, 2024
d1612b9
test cleanup
abf149 Oct 14, 2024
af9ec9f
test disabling async output; format
abf149 Oct 14, 2024
718618f
refactor
abf149 Oct 14, 2024
8e4d2bc
Update vllm/config.py
afeldman-nm Oct 14, 2024
6aac776
scheduler num_lookahead_slots returns 0 if multi-step is disabled
abf149 Oct 14, 2024
ac72a50
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 14, 2024
4b707c7
upstream merge; format
abf149 Oct 14, 2024
22ab785
Merge branch 'afeldman-nm/multi_step_best_of' of https://github.com/n…
abf149 Oct 14, 2024
c964126
formatting
abf149 Oct 14, 2024
493ef31
different approach to num_lookahead_tokens
abf149 Oct 14, 2024
f2b226d
refactor
abf149 Oct 14, 2024
6ace01a
first pass at refactor
abf149 Oct 14, 2024
a067605
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 14, 2024
9b7e647
Merge branch 'multi_step_best_of' into multi_step_best_of_merge
abf149 Oct 14, 2024
2f8b889
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 15, 2024
c5b6983
Merge branch 'main' into multi_step_best_of_dbg
abf149 Oct 16, 2024
f9b8d55
NOT_GIVEN defaults
abf149 Oct 16, 2024
5c9ffe4
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 16, 2024
109f84c
test bug fixed
abf149 Oct 16, 2024
a89b776
merge
abf149 Oct 18, 2024
4fa817e
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 21, 2024
1238852
merge
abf149 Oct 24, 2024
133 changes: 132 additions & 1 deletion tests/multi_step/test_correctness_async_llm.py
@@ -164,7 +164,7 @@ async def test_multi_step_pp_smoke(
Args:
tp_size: degree of tensor-parallelism
pp_size: degree of pipeline-parallelism
eager_mode
monkeypatch: fixture which we use to temporarily override backend env var
"""

model = "JackFram/llama-160m"
@@ -223,3 +223,134 @@ async def test_multi_step_pp_smoke(
test_generations = get_client_text_generations(test_completions)

assert ref_generations == test_generations


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("pp_size", [1])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("max_output_len", [7])
@pytest.mark.parametrize("n,best_of", [
(1, 3),
(2, 2),
(2, 3),
])
@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"])
@pytest.mark.parametrize("is_async", [False, True])
@pytest.mark.parametrize("num_logprobs", [None, 5])
@pytest.mark.asyncio
async def test_multi_step_llm_best_of_fallback_async(
monkeypatch,
example_prompts,
model: str,
tp_size: int,
pp_size: int,
enforce_eager: bool,
num_scheduler_steps: int,
num_prompts: int,
max_output_len: int,
n: int,
best_of: int,
attention_backend: str,
is_async: bool,
num_logprobs: Optional[int],
) -> None:
"""Test vLLM server with multi-step & best_of > 1

Currently multi-step scheduling does not support best_of > 1 or beam
search; however, the default behavior is for the engine to fall back
on single-step scheduling rather than failing.

Args:
monkeypatch: fixture which we use to temporarily override backend env var
example_prompts: test fixture providing example prompts
model: model under test (same for single- and multi-step engines)
tp_size: degree of tensor-parallelism
pp_size: degree of pipeline-parallelism
enforce_eager: whether to enforce eager-mode execution (disables CUDA graphs)
num_scheduler_steps: for multi-step scheduling, GPU-side steps per
GPU -> CPU output transfer
num_prompts: number of example prompts under test
max_output_len: maximum number of tokens to generate per request
n: num seqs to output per :class:`SequenceGroup`
best_of: num seqs per :class:`SequenceGroup` from which to choose
attention_backend: name of the attention backend under test
is_async: if True, use async output processor
num_logprobs: number of logprobs to return per token
"""

override_backend_env_variable(monkeypatch, attention_backend)

prompts = example_prompts
if len(prompts) < num_prompts:
prompts = prompts * ((num_prompts // len(prompts)) + 1)
prompts = prompts[:num_prompts]
assert len(prompts) == num_prompts

server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
ms_server_args = DEFAULT_SERVER_ARGS + \
["--num-scheduler-steps", f"{num_scheduler_steps}"]

if not is_async:
ms_server_args += ["--disable-async-output-proc"]

if enforce_eager:
ms_server_args.append("--enforce-eager")

distributed_args = [
"--tensor-parallel-size",
str(tp_size),
"--pipeline-parallel-size",
str(pp_size),
]

# Requests will share a random seed
seed = 42

# Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically
# raised 5x to 1200 *just for this test* due to
# observed timeouts in GHA CI
ref_completions = await completions_with_server_args(
prompts,
model,
server_args + distributed_args,
num_logprobs,
max_wait_seconds=5 * 240,
best_of=best_of,
n=n,
max_tokens=max_output_len,
temperature=1.0,
seed=seed)
test_completions = await completions_with_server_args(
prompts,
model,
ms_server_args + distributed_args,
num_logprobs,
max_wait_seconds=5 * 240,
best_of=best_of,
n=n,
max_tokens=max_output_len,
temperature=1.0,
seed=seed)

# Assert multi-step scheduling produces identical tokens
# to single-step scheduling.
ref_generations = get_client_text_generations(ref_completions)
test_generations = get_client_text_generations(test_completions)
assert ref_generations == test_generations

# Assert multi-step scheduling produces nearly-identical logprobs
# to single-step scheduling.
ref_text_logprobs = get_client_text_logprob_generations(ref_completions)
test_text_logprobs = get_client_text_logprob_generations(test_completions)
check_logprobs_close(
outputs_0_lst=ref_text_logprobs,
outputs_1_lst=test_text_logprobs,
name_0="single-step",
name_1="multi-step",
)
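
For reference, each request issued by `completions_with_server_args` in the test above reduces to an OpenAI-client completion call of roughly the shape sketched below. This is an illustrative sketch only: the server address, API key, prompt, and concrete parameter values are placeholders rather than values taken from the test.

import asyncio

from openai import AsyncOpenAI

BASE_URL = "http://localhost:8000/v1"  # placeholder; the test fixture picks its own port
API_KEY = "EMPTY"                      # placeholder key for a local vLLM server
MODEL = "JackFram/llama-160m"


async def request_with_best_of(prompt: str):
    client = AsyncOpenAI(base_url=BASE_URL, api_key=API_KEY)
    # best_of > n: the server samples `best_of` candidate sequences per prompt
    # and returns the `n` best-scoring ones.
    return await client.completions.create(
        model=MODEL,
        prompt=prompt,
        n=2,
        best_of=3,
        max_tokens=7,
        temperature=1.0,
        seed=42,
        logprobs=5,
    )


if __name__ == "__main__":
    completion = asyncio.run(request_with_best_of("The president of the USA is"))
    print([choice.text for choice in completion.choices])
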
169 changes: 167 additions & 2 deletions tests/multi_step/test_correctness_llm.py
@@ -5,6 +5,9 @@

import pytest

from vllm import SamplingParams
from vllm.entrypoints.utils import STR_MULTI_STEP_BEAM_SEARCH_NOT_SUPPORTED

from ..models.utils import check_logprobs_close, check_outputs_equal

MODELS = [
@@ -192,11 +195,173 @@ def test_multi_step_llm_w_prompt_logprobs(
check_logprobs_close(
outputs_0_lst=single_step_vllm_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
name_0="single_step_vllm",
name_1="multi_step_vllm",
)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("enforce_eager", [False, True])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("max_output_len", [7])
@pytest.mark.parametrize("n,best_of", [
(1, 2),
(2, 2),
(2, 3),
])
@pytest.mark.parametrize("enable_chunked_prefill", [True, False])
@pytest.mark.parametrize("enable_prefix_caching", [True, False])
def test_multi_step_llm_best_of_fallback(
vllm_runner,
example_prompts,
model: str,
dtype: str,
tp_size: int,
enforce_eager: bool,
num_scheduler_steps: int,
num_prompts: int,
max_output_len: int,
n: int,
best_of: int,
enable_chunked_prefill: bool,
enable_prefix_caching: bool,
) -> None:
"""Test vLLM engine with multi-step & best_of > 1

Currently multi-step scheduling does not support best_of > 1 or beam search;
however, the default behavior is for the engine to fall back on single-step
scheduling rather than failing.

Two instantiations of the sync vLLM engine are tested, one with single-step
and one with multi-step scheduling.

Each instantiation of vLLM is tested in 3 phases:
1. Batch of requests without best_of > 1
2. Batch of requests with best_of > 1
3. Batch of requests without best_of > 1

For the instantiation of vLLM with multi-step scheduling, Phase 1 should use
multi-step scheduling, Phase 2 should fall back on single-step scheduling,
and Phase 3 should resume multi-step scheduling.

The other instantiation should use single-step scheduling for all phases.

Args:
vllm_runner: vLLM model runner fixture
example_prompts: test fixture providing example prompts
model: model under test (same for single- and multi-step engines)
dtype: tensor datatype for engine to utilize
tp_size: degree of tensor-parallelism
enforce_eager: whether to enforce eager-mode execution (disables CUDA graphs)
num_scheduler_steps: for multi-step scheduling, GPU-side steps per
GPU -> CPU output transfer
num_prompts: number of example prompts under test
max_output_len: the maximum number of tokens to generate
n: num seqs to output per :class:`SequenceGroup`
best_of: num seqs per :class:`SequenceGroup` from which to choose
enable_chunked_prefill: whether chunked prefill is enabled
enable_prefix_caching: whether automatic prefix caching is enabled
"""

prompts = example_prompts
if len(prompts) < num_prompts:
prompts = prompts * ((num_prompts // len(prompts)) + 1)
prompts = prompts[:num_prompts]
assert len(prompts) == num_prompts

# Sampling parameters with best_of > 1, which should cause a
# multi-step scheduler to fall back on single-step scheduling
sampling_params_best_of_gt_1 = SamplingParams(
max_tokens=max_output_len,
ignore_eos=True,
temperature=1.0,
n=n,
best_of=best_of,
seed=42,
)

with vllm_runner(
model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
use_v2_block_manager=True,
num_scheduler_steps=1,
enable_chunked_prefill=enable_chunked_prefill,
enable_prefix_caching=enable_prefix_caching,
) as vllm_model:
outputs_ss_best_of_gt_1 = vllm_model.generate(
prompts, sampling_params_best_of_gt_1)

with vllm_runner(
model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
use_v2_block_manager=True,
num_scheduler_steps=num_scheduler_steps,
enable_chunked_prefill=enable_chunked_prefill,
enable_prefix_caching=enable_prefix_caching,
) as vllm_model:
outputs_ms_best_of_gt_1 = (vllm_model.generate(
prompts, sampling_params_best_of_gt_1))

check_outputs_equal(
outputs_0_lst=outputs_ss_best_of_gt_1,
outputs_1_lst=outputs_ms_best_of_gt_1,
name_0="outputs_ss_best_of_gt_1",
name_1="outputs_ms_best_of_gt_1",
)
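
Outside of the test fixture, the same fallback scenario can be sketched with the offline `LLM` entrypoint. The snippet below is illustrative only: it assumes `num_scheduler_steps` is accepted as an engine keyword argument in the same way the `vllm_runner` fixture forwards it, and the model, prompt, and sampling values are placeholders mirroring the parametrization above.

from vllm import LLM, SamplingParams

# Multi-step engine; num_scheduler_steps > 1 enables multi-step scheduling
# (assumed to be forwarded to the engine the same way the fixture does).
llm = LLM(
    model="JackFram/llama-160m",
    num_scheduler_steps=8,
    gpu_memory_utilization=0.7,
)

# best_of > 1 is expected to make the engine fall back to single-step
# scheduling for this batch rather than raising an error.
params = SamplingParams(
    n=2,
    best_of=3,
    max_tokens=7,
    temperature=1.0,
    seed=42,
)

for request_output in llm.generate(["The capital of France is"], params):
    for completion in request_output.outputs:
        print(completion.text)
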


@pytest.mark.parametrize("model", ["JackFram/llama-160m"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("enforce_eager", [False, True])
@pytest.mark.parametrize("num_scheduler_steps", [8])
@pytest.mark.parametrize("max_output_len", [7])
def test_multi_step_beam_search_fail(
vllm_runner,
example_prompts,
model: str,
dtype: str,
enforce_eager: bool,
num_scheduler_steps: int,
max_output_len: int,
) -> None:
"""Test that vLLM engine with multi-step fails if beam search is enabled.

Beam search is not supported with multi-step.

Args:
vllm_runner: vLLM model runner fixture
example_prompts: test fixture providing example prompts
model: model under test (same for single- and multi-step engines)
dtype: tensor datatype for engine to utilize
enforce_eager: whether to enforce eager-mode execution (disables CUDA graphs)
num_scheduler_steps: for multi-step scheduling, GPU-side steps per
GPU -> CPU output transfer
max_output_len: maximum number of tokens to generate
"""

with pytest.raises(ValueError,
match=STR_MULTI_STEP_BEAM_SEARCH_NOT_SUPPORTED), \
vllm_runner(
model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=1,
use_v2_block_manager=True,
num_scheduler_steps=num_scheduler_steps,
) as vllm_model:
vllm_model.generate_beam_search(example_prompts, 2, max_output_len)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("tp_size", [1])
15 changes: 11 additions & 4 deletions tests/utils.py
@@ -13,6 +13,7 @@
import openai
import pytest
import requests
from openai._types import NOT_GIVEN, NotGiven
from openai.types.completion import Completion
from typing_extensions import ParamSpec, assert_never

@@ -615,6 +616,10 @@ async def completions_with_server_args(
num_logprobs: Optional[int],
max_wait_seconds: int = 240,
max_tokens: Union[int, list] = 5,
best_of: Union[int, NotGiven] = NOT_GIVEN,
n: Union[int, NotGiven] = NOT_GIVEN,
temperature: Union[float, NotGiven] = 0,
seed: Union[int, NotGiven] = NOT_GIVEN,
) -> List[Completion]:
'''Construct a remote OpenAI server, obtain an async client to the
server & invoke the completions API to obtain completions.
@@ -647,10 +652,13 @@ async def completions_with_server_args(
client = server.get_async_client()
outputs = [ client.completions.create(model=model_name,
prompt=[p],
temperature=0,
temperature=temperature,
stream=False,
max_tokens=max_tok,
logprobs=num_logprobs) \
logprobs=num_logprobs,
best_of=best_of,
n=n,
seed=seed) \
for p, max_tok in zip(prompts, max_tokens) ]
outputs = await asyncio.gather(*outputs)

@@ -663,8 +671,7 @@ def get_client_text_generations(completions: List[Completion]) -> List[str]:
'''Extract generated tokens from the output of a
request made to an Open-AI-protocol completions endpoint.
'''
assert all([len(x.choices) == 1 for x in completions])
return [x.choices[0].text for x in completions]
return [c.text for x in completions for c in x.choices]
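# Note (illustrative, not real API output): with n > 1 each Completion can
# carry multiple choices, so the helper now flattens every choice of every
# completion instead of asserting a single choice, e.g. two completions with
# two choices each yield four text strings:
#   get_client_text_generations([Completion(choices=[c00, c01]),
#                                Completion(choices=[c10, c11])])
#   -> [c00.text, c01.text, c10.text, c11.text]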


def get_client_text_logprob_generations(