Commits (71)
67fe9c7
wip
abf149 Oct 7, 2024
de7e6c2
conftest cross-merge
abf149 Oct 7, 2024
3ed32e2
fallback test cross-merge
abf149 Oct 7, 2024
fe12a95
added assert which fails when beam search is performed with multi-ste…
abf149 Oct 7, 2024
b2de4aa
removed best_of argument from conftest
abf149 Oct 7, 2024
9e0a5d7
fewer test cases
abf149 Oct 7, 2024
1ac45ed
set up test
abf149 Oct 7, 2024
c22f5cd
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 9, 2024
9cd6b2e
SchedulerConfig allows multi-step override; scheduler updates override
abf149 Oct 9, 2024
f8e3d50
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 9, 2024
50dd68a
added force_single_step to ExecuteModelRequest
abf149 Oct 9, 2024
5aaff1d
format
abf149 Oct 9, 2024
68d5334
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 10, 2024
350f990
passing tests; format
abf149 Oct 10, 2024
b8c5a9a
formatting; passing eager mode & beam search
abf149 Oct 10, 2024
9f781a3
refactor & format
abf149 Oct 10, 2024
28e6303
test cases
abf149 Oct 10, 2024
7eb4f59
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 10, 2024
7852355
formatting
abf149 Oct 10, 2024
a9b2d01
checkout
abf149 Oct 10, 2024
154f3c2
refactor
abf149 Oct 10, 2024
9fea4e5
refactor; checkout
abf149 Oct 10, 2024
7d3a479
format
abf149 Oct 10, 2024
2c1afdd
refactor; format
abf149 Oct 10, 2024
16763d7
refactor
abf149 Oct 10, 2024
23c3538
comment
abf149 Oct 10, 2024
bb431a2
Update vllm/core/scheduler.py
afeldman-nm Oct 10, 2024
24f0fc1
wip async
abf149 Oct 11, 2024
2b5a60c
Merge branch 'afeldman-nm/multi_step_best_of' of https://github.com/n…
abf149 Oct 11, 2024
4010bd5
branch merge
abf149 Oct 11, 2024
a017fa9
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 11, 2024
f8218d4
Merge branch 'multi_step_best_of' into multi_step_best_of_impl
abf149 Oct 11, 2024
d1c80a7
wip
abf149 Oct 11, 2024
b90ed74
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 11, 2024
1139b5e
Merge branch 'multi_step_best_of' into multi_step_best_of_impl
abf149 Oct 11, 2024
80f9b4a
async fallback tests passing
abf149 Oct 11, 2024
3d453c1
formatting
abf149 Oct 11, 2024
e66b669
logprobs test case
abf149 Oct 11, 2024
3b3bae7
refactoring
abf149 Oct 11, 2024
8773fe3
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 13, 2024
d6b8a4e
wip
abf149 Oct 14, 2024
e3ddf92
wip
abf149 Oct 14, 2024
684614a
wip
abf149 Oct 14, 2024
f8bc7cf
Merge branch 'main' into multi_step_best_of
abf149 Oct 14, 2024
892cb3b
refactoring scheduling multi-step field names
abf149 Oct 14, 2024
ab1d499
small fixes; formatting
abf149 Oct 14, 2024
489dcd0
alternate approach to tracking best_of > 1 requests
abf149 Oct 14, 2024
6845404
small fix; format
abf149 Oct 14, 2024
84c47f6
formatting
abf149 Oct 14, 2024
d1612b9
test cleanup
abf149 Oct 14, 2024
af9ec9f
test disabling async output; format
abf149 Oct 14, 2024
718618f
refactor
abf149 Oct 14, 2024
8e4d2bc
Update vllm/config.py
afeldman-nm Oct 14, 2024
6aac776
scheduler num_lookahead_slots returns 0 if multi-step is disabled
abf149 Oct 14, 2024
ac72a50
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 14, 2024
4b707c7
upstream merge; format
abf149 Oct 14, 2024
22ab785
Merge branch 'afeldman-nm/multi_step_best_of' of https://github.com/n…
abf149 Oct 14, 2024
c964126
formatting
abf149 Oct 14, 2024
493ef31
different approach to num_lookahead_tokens
abf149 Oct 14, 2024
f2b226d
refactor
abf149 Oct 14, 2024
6ace01a
first pass at refactor
abf149 Oct 14, 2024
a067605
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 14, 2024
9b7e647
Merge branch 'multi_step_best_of' into multi_step_best_of_merge
abf149 Oct 14, 2024
2f8b889
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 15, 2024
c5b6983
Merge branch 'main' into multi_step_best_of_dbg
abf149 Oct 16, 2024
f9b8d55
NOT_GIVEN defaults
abf149 Oct 16, 2024
5c9ffe4
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 16, 2024
109f84c
test bug fixed
abf149 Oct 16, 2024
a89b776
merge
abf149 Oct 18, 2024
4fa817e
Merge branch 'main' into multi_step_best_of_merge
abf149 Oct 21, 2024
1238852
merge
abf149 Oct 24, 2024
133 changes: 132 additions & 1 deletion tests/multi_step/test_correctness_async_llm.py
@@ -164,7 +164,7 @@ async def test_multi_step_pp_smoke(
Args:
tp_size: degree of tensor-parallelism
pp_size: degree of pipeline-parallelism
eager_mode
monkeypatch: fixture which we use to temporarily override backend env var
"""

model = "JackFram/llama-160m"
@@ -223,3 +223,134 @@ async def test_multi_step_pp_smoke(
test_generations = get_client_text_generations(test_completions)

assert ref_generations == test_generations


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("pp_size", [1])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("max_output_len", [7])
@pytest.mark.parametrize("n,best_of", [
(1, 3),
(2, 2),
(2, 3),
])
@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"])
@pytest.mark.parametrize("is_async", [False, True])
@pytest.mark.parametrize("num_logprobs", [None, 5])
@pytest.mark.asyncio
async def test_multi_step_llm_best_of_fallback_async(
monkeypatch,
example_prompts,
model: str,
tp_size: int,
pp_size: int,
enforce_eager: bool,
num_scheduler_steps: int,
num_prompts: int,
max_output_len: int,
n: int,
best_of: int,
attention_backend: str,
is_async: bool,
num_logprobs: Optional[int],
) -> None:
"""Test vLLM server with multi-step & best_of > 1

Currently multi-step scheduling does not support best_of > 1 or beam
search; however, the default behavior is for the engine to fall back
on single-step scheduling rather than failing.

Args:
monkeypatch: fixture which we use to temporarily override backend env var
example_prompts: test fixture providing example prompts
model: model under test (same for single- and multi-step engines)
tp_size: degree of tensor-parallelism
pp_size: degree of pipeline-parallelism
enforce_eager: whether to enforce eager-mode execution (disables CUDA graphs)
num_scheduler_steps: for multi-step scheduling, GPU-side steps per
GPU -> CPU output transfer
num_prompts: number of example prompts under test
max_output_len: maximum number of tokens to generate per request
n: num seqs to output per :class:`SequenceGroup`
best_of: num seqs per :class:`SequenceGroup` from which to choose
attention_backend: name of the attention backend under test
is_async: if True, use async output processor
num_logprobs: number of logprobs to return per token
"""

override_backend_env_variable(monkeypatch, attention_backend)

prompts = example_prompts
if len(prompts) < num_prompts:
prompts = prompts * ((num_prompts // len(prompts)) + 1)
prompts = prompts[:num_prompts]
assert len(prompts) == num_prompts

server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
ms_server_args = DEFAULT_SERVER_ARGS + \
["--num-scheduler-steps", f"{num_scheduler_steps}"]

if not is_async:
ms_server_args += ["--disable-async-output-proc"]

if enforce_eager:
ms_server_args.append("--enforce-eager")

distributed_args = [
"--tensor-parallel-size",
str(tp_size),
"--pipeline-parallel-size",
str(pp_size),
]

# Requests will share a random seed
seed = 42

# Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically
# raised 5x to 1200 *just for this test* due to
# observed timeouts in GHA CI
ref_completions = await completions_with_server_args(
prompts,
model,
server_args + distributed_args,
num_logprobs,
max_wait_seconds=5 * 240,
best_of=best_of,
n=n,
max_tokens=max_output_len,
temperature=1.0,
seed=seed)
test_completions = await completions_with_server_args(
prompts,
model,
ms_server_args + distributed_args,
num_logprobs,
max_wait_seconds=5 * 240,
best_of=best_of,
n=n,
max_tokens=max_output_len,
temperature=1.0,
seed=seed)

# Assert multi-step scheduling produces identical tokens
# to single-step scheduling.
ref_generations = get_client_text_generations(ref_completions)
test_generations = get_client_text_generations(test_completions)
assert ref_generations == test_generations

# Assert multi-step scheduling produces nearly-identical logprobs
# to single-step scheduling.
ref_text_logprobs = get_client_text_logprob_generations(ref_completions)
test_text_logprobs = get_client_text_logprob_generations(test_completions)
check_logprobs_close(
outputs_0_lst=ref_text_logprobs,
outputs_1_lst=test_text_logprobs,
name_0="single-step",
name_1="multi-step",
)
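
For reference, each request issued by `completions_with_server_args` in the test above reduces to an OpenAI-client completion call of roughly the shape sketched below. This is an illustrative sketch only: the server address, API key, prompt, and concrete parameter values are placeholders rather than values taken from the test.

import asyncio

from openai import AsyncOpenAI

BASE_URL = "http://localhost:8000/v1"  # placeholder; the test fixture picks its own port
API_KEY = "EMPTY"                      # placeholder key for a local vLLM server
MODEL = "JackFram/llama-160m"


async def request_with_best_of(prompt: str):
    client = AsyncOpenAI(base_url=BASE_URL, api_key=API_KEY)
    # best_of > n: the server samples `best_of` candidate sequences per prompt
    # and returns the `n` best-scoring ones.
    return await client.completions.create(
        model=MODEL,
        prompt=prompt,
        n=2,
        best_of=3,
        max_tokens=7,
        temperature=1.0,
        seed=42,
        logprobs=5,
    )


if __name__ == "__main__":
    completion = asyncio.run(request_with_best_of("The president of the USA is"))
    print([choice.text for choice in completion.choices])
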
169 changes: 167 additions & 2 deletions tests/multi_step/test_correctness_llm.py
@@ -5,6 +5,9 @@

import pytest

from vllm import SamplingParams
from vllm.entrypoints.utils import STR_MULTI_STEP_BEAM_SEARCH_NOT_SUPPORTED

from ..models.utils import check_logprobs_close, check_outputs_equal

MODELS = [
@@ -192,11 +195,173 @@ def test_multi_step_llm_w_prompt_logprobs(
check_logprobs_close(
outputs_0_lst=single_step_vllm_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
name_0="single_step_vllm",
name_1="multi_step_vllm",
)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("tp_size", [1])
@pytest.mark.parametrize("enforce_eager", [False, True])
@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("max_output_len", [7])
@pytest.mark.parametrize("n,best_of", [
(1, 2),
(2, 2),
(2, 3),
])
@pytest.mark.parametrize("enable_chunked_prefill", [True, False])
@pytest.mark.parametrize("enable_prefix_caching", [True, False])
def test_multi_step_llm_best_of_fallback(
vllm_runner,
example_prompts,
model: str,
dtype: str,
tp_size: int,
enforce_eager: bool,
num_scheduler_steps: int,
num_prompts: int,
max_output_len: int,
n: int,
best_of: int,
enable_chunked_prefill: bool,
enable_prefix_caching: bool,
) -> None:
"""Test vLLM engine with multi-step & best_of > 1

Currently multi-step scheduling does not support best_of > 1 or beam search;
however, the default behavior is for the engine to fall back on single-step
scheduling rather than failing.

Two instantiations of the sync vLLM engine are tested, one with single-step
and one with multi-step scheduling.

Each instantiation of vLLM is tested in 3 phases:
1. Batch of requests without best_of > 1
2. Batch of requests with best_of > 1
3. Batch of requests without best_of > 1

For the instantiation of vLLM with multi-step scheduling, Phase 1 should use
multi-step scheduling, Phase 2 should fall back on single-step scheduling,
and Phase 3 should resume multi-step scheduling.

The other instantiation should use single-step scheduling for all phases.

Args:
vllm_runner: vLLM model runner fixture
example_prompts: test fixture providing example prompts
model: model under test (same for single- and multi-step engines)
dtype: tensor datatype for engine to utilize
tp_size: degree of tensor-parallelism
enforce_eager: whether to enforce eager-mode execution (disables CUDA graphs)
num_scheduler_steps: for multi-step scheduling, GPU-side steps per
GPU -> CPU output transfer
num_prompts: number of example prompts under test
max_output_len: the maximum number of tokens to generate
n: num seqs to output per :class:`SequenceGroup`
best_of: num seqs per :class:`SequenceGroup` from which to choose
enable_chunked_prefill: whether chunked prefill is enabled
enable_prefix_caching: whether automatic prefix caching is enabled
"""

prompts = example_prompts
if len(prompts) < num_prompts:
prompts = prompts * ((num_prompts // len(prompts)) + 1)
prompts = prompts[:num_prompts]
assert len(prompts) == num_prompts

# Sampling parameters with best_of > 1, which should cause a
# multi-step scheduler to fall back on single-step scheduling
sampling_params_best_of_gt_1 = SamplingParams(
max_tokens=max_output_len,
ignore_eos=True,
temperature=1.0,
n=n,
best_of=best_of,
seed=42,
)

with vllm_runner(
model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
use_v2_block_manager=True,
num_scheduler_steps=1,
enable_chunked_prefill=enable_chunked_prefill,
enable_prefix_caching=enable_prefix_caching,
) as vllm_model:
outputs_ss_best_of_gt_1 = vllm_model.generate(
prompts, sampling_params_best_of_gt_1)

with vllm_runner(
model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
use_v2_block_manager=True,
num_scheduler_steps=num_scheduler_steps,
enable_chunked_prefill=enable_chunked_prefill,
enable_prefix_caching=enable_prefix_caching,
) as vllm_model:
outputs_ms_best_of_gt_1 = (vllm_model.generate(
prompts, sampling_params_best_of_gt_1))

check_outputs_equal(
outputs_0_lst=outputs_ss_best_of_gt_1,
outputs_1_lst=outputs_ms_best_of_gt_1,
name_0="outputs_ss_best_of_gt_1",
name_1="outputs_ms_best_of_gt_1",
)
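
Outside of the test fixture, the same fallback scenario can be sketched with the offline `LLM` entrypoint. The snippet below is illustrative only: it assumes `num_scheduler_steps` is accepted as an engine keyword argument in the same way the `vllm_runner` fixture forwards it, and the model, prompt, and sampling values are placeholders mirroring the parametrization above.

from vllm import LLM, SamplingParams

# Multi-step engine; num_scheduler_steps > 1 enables multi-step scheduling
# (assumed to be forwarded to the engine the same way the fixture does).
llm = LLM(
    model="JackFram/llama-160m",
    num_scheduler_steps=8,
    gpu_memory_utilization=0.7,
)

# best_of > 1 is expected to make the engine fall back to single-step
# scheduling for this batch rather than raising an error.
params = SamplingParams(
    n=2,
    best_of=3,
    max_tokens=7,
    temperature=1.0,
    seed=42,
)

for request_output in llm.generate(["The capital of France is"], params):
    for completion in request_output.outputs:
        print(completion.text)
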


@pytest.mark.parametrize("model", ["JackFram/llama-160m"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("enforce_eager", [False, True])
@pytest.mark.parametrize("num_scheduler_steps", [8])
@pytest.mark.parametrize("max_output_len", [7])
def test_multi_step_beam_search_fail(
vllm_runner,
example_prompts,
model: str,
dtype: str,
enforce_eager: bool,
num_scheduler_steps: int,
max_output_len: int,
) -> None:
"""Test that vLLM engine with multi-step fails if beam search is enabled.

Beam search is not supported with multi-step.

Args:
vllm_runner: vLLM model runner fixture
example_prompts: test fixture providing example prompts
model: model under test (same for single- and multi-step engines)
dtype: tensor datatype for engine to utilize
enforce_eager: whether to enforce eager-mode execution (disables CUDA graphs)
num_scheduler_steps: for multi-step scheduling, GPU-side steps per
GPU -> CPU output transfer
max_output_len: maximum number of tokens to generate
"""

with pytest.raises(ValueError,
match=STR_MULTI_STEP_BEAM_SEARCH_NOT_SUPPORTED), \
vllm_runner(
model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=1,
use_v2_block_manager=True,
num_scheduler_steps=num_scheduler_steps,
) as vllm_model:
vllm_model.generate_beam_search(example_prompts, 2, max_output_len)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("tp_size", [1])
15 changes: 11 additions & 4 deletions tests/utils.py
@@ -13,6 +13,7 @@
import openai
import pytest
import requests
from openai._types import NOT_GIVEN, NotGiven
from openai.types.completion import Completion
from typing_extensions import ParamSpec, assert_never

@@ -615,6 +616,10 @@ async def completions_with_server_args(
num_logprobs: Optional[int],
max_wait_seconds: int = 240,
max_tokens: Union[int, list] = 5,
best_of: Union[int, NotGiven] = NOT_GIVEN,
n: Union[int, NotGiven] = NOT_GIVEN,
temperature: Union[float, NotGiven] = 0,
seed: Union[int, NotGiven] = NOT_GIVEN,
) -> List[Completion]:
'''Construct a remote OpenAI server, obtain an async client to the
server & invoke the completions API to obtain completions.
@@ -647,10 +652,13 @@ async def completions_with_server_args(
client = server.get_async_client()
outputs = [ client.completions.create(model=model_name,
prompt=[p],
temperature=0,
temperature=temperature,
stream=False,
max_tokens=max_tok,
logprobs=num_logprobs) \
logprobs=num_logprobs,
best_of=best_of,
n=n,
seed=seed) \
for p, max_tok in zip(prompts, max_tokens) ]
outputs = await asyncio.gather(*outputs)

@@ -663,8 +671,7 @@ def get_client_text_generations(completions: List[Completion]) -> List[str]:
'''Extract generated tokens from the output of a
request made to an Open-AI-protocol completions endpoint.
'''
assert all([len(x.choices) == 1 for x in completions])
return [x.choices[0].text for x in completions]
return [c.text for x in completions for c in x.choices]
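# Note (illustrative, not real API output): with n > 1 each Completion can
# carry multiple choices, so the helper now flattens every choice of every
# completion instead of asserting a single choice, e.g. two completions with
# two choices each yield four text strings:
#   get_client_text_generations([Completion(choices=[c00, c01]),
#                                Completion(choices=[c10, c11])])
#   -> [c00.text, c01.text, c10.text, c11.text]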


def get_client_text_logprob_generations(