From 53e0fb4c11efdec3a071dd360fd5475f559009de Mon Sep 17 00:00:00 2001 From: vincent-4 Date: Thu, 27 Feb 2025 19:05:22 -0500 Subject: [PATCH 01/14] remove some Signed-off-by: vincent-4 --- vllm/entrypoints/llm.py | 1 + vllm/entrypoints/openai/protocol.py | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index fc585ee9e54b..affe7bcc55b1 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -96,6 +96,7 @@ class LLM: values will increase the KV cache size and thus improve the model's throughput. However, if the value is too high, it may cause out-of- memory (OOM) errors. + # TODO: How to rewrite this swap_space: The size (GiB) of CPU memory per GPU to use as swap space. This can be used for temporarily storing the states of the requests when their `best_of` sampling parameters are larger than 1. If all diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 2c740caf20fb..4c4d86fddb59 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -242,7 +242,6 @@ class ChatCompletionRequest(OpenAIBaseModel): user: Optional[str] = None # doc: begin-chat-completion-sampling-params - best_of: Optional[int] = None use_beam_search: bool = False top_k: Optional[int] = None min_p: Optional[float] = None @@ -479,7 +478,6 @@ def to_sampling_params( return SamplingParams.from_optional( n=self.n, - best_of=self.best_of, presence_penalty=self.presence_penalty, frequency_penalty=self.frequency_penalty, repetition_penalty=repetition_penalty, @@ -650,7 +648,6 @@ class CompletionRequest(OpenAIBaseModel): # https://platform.openai.com/docs/api-reference/completions/create model: Optional[str] = None prompt: Union[list[int], list[list[int]], str, list[str]] - best_of: Optional[int] = None echo: Optional[bool] = False frequency_penalty: Optional[float] = 0.0 logit_bias: Optional[dict[str, float]] = None @@ -848,7 +845,6 @@ def to_sampling_params( return SamplingParams.from_optional( n=self.n, - best_of=self.best_of, presence_penalty=self.presence_penalty, frequency_penalty=self.frequency_penalty, repetition_penalty=repetition_penalty, From 9344a840dbd648835246020ec4cb681c351cd994 Mon Sep 17 00:00:00 2001 From: vincent-4 Date: Thu, 27 Feb 2025 19:07:09 -0500 Subject: [PATCH 02/14] remove some more Signed-off-by: vincent-4 --- vllm/entrypoints/openai/serving_completion.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index edcf1b086bad..0b4ed781c512 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -171,9 +171,7 @@ async def create_completion( # Similar to the OpenAI API, when n != best_of, we do not stream the # results. In addition, we do not stream the results when use # beam search. 
- stream = (request.stream - and (request.best_of is None or request.n == request.best_of) - and not request.use_beam_search) + stream = (request.stream and not request.use_beam_search) # Streaming response if stream: From cf66e53ed8c67c2b562585b5dec7b3b254072b54 Mon Sep 17 00:00:00 2001 From: vincent-4 Date: Thu, 27 Feb 2025 19:08:42 -0500 Subject: [PATCH 03/14] remove some safe ones Signed-off-by: vincent-4 --- examples/offline_inference/llm_engine_example.py | 1 - examples/online_serving/opentelemetry/dummy_client.py | 1 - tests/core/test_scheduler.py | 4 ---- 3 files changed, 6 deletions(-) diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py index f7741a372243..e94f47b72b2e 100644 --- a/examples/offline_inference/llm_engine_example.py +++ b/examples/offline_inference/llm_engine_example.py @@ -15,7 +15,6 @@ def create_test_prompts() -> list[tuple[str, SamplingParams]]: SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)), ("What is the meaning of life?", SamplingParams(n=2, - best_of=5, temperature=0.8, top_p=0.95, frequency_penalty=0.1)), diff --git a/examples/online_serving/opentelemetry/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py index 7a605f85b97f..a8b353090d79 100644 --- a/examples/online_serving/opentelemetry/dummy_client.py +++ b/examples/online_serving/opentelemetry/dummy_client.py @@ -28,7 +28,6 @@ "model": "facebook/opt-125m", "prompt": prompt, "max_tokens": 10, - "best_of": 20, "n": 3, "use_beam_search": "true", "temperature": 0.0, diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 9e461d4e0b40..8bd64923fe22 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -617,7 +617,6 @@ def test_schedule_decode_blocks_to_copy_update(): num_gpu_blocks=16) _, seq_group = create_dummy_prompt("1", prompt_length=60, - best_of=2, block_size=block_size) curr_loras = None scheduler._allocate_and_set_running(seq_group) @@ -686,7 +685,6 @@ def test_schedule_swapped_cannot_swap_in(): for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, - best_of=2, block_size=block_size) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) @@ -717,7 +715,6 @@ def test_infeasible_swap(): for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, - best_of=2, block_size=block_size) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) @@ -747,7 +744,6 @@ def test_schedule_swapped_blocks_to_copy(): curr_loras = None _, seq_group = create_dummy_prompt("1", prompt_length=60, - best_of=2, block_size=block_size) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) From 5d0e045b8613acab33ef2e0316d1dbc9cb1744b6 Mon Sep 17 00:00:00 2001 From: vincent-4 Date: Thu, 27 Feb 2025 19:10:15 -0500 Subject: [PATCH 04/14] utils remove Signed-off-by: vincent-4 --- tests/core/utils.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/core/utils.py b/tests/core/utils.py index ba4265e3c20a..a726b9f1fc09 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -18,7 +18,6 @@ def create_dummy_prompt( prompt_length: int = -1, block_size: Optional[int] = None, lora_request: Optional[LoRARequest] = None, - best_of: int = 1, prompt_tokens: Optional[list[int]] = None, min_tokens: int = 0, max_tokens: int = 16, @@ -39,7 +38,6 @@ def create_dummy_prompt( seqs=[prompt], arrival_time=time.time(), 
sampling_params=SamplingParams( - best_of=best_of, max_tokens=max_tokens, min_tokens=min_tokens), lora_request=lora_request) @@ -72,7 +70,6 @@ def create_dummy_prompt_encoder_decoder( encoder_prompt_length: int, block_size: Optional[int] = None, lora_request: Optional[LoRARequest] = None, - best_of: int = 1, ) -> tuple[Sequence, Sequence, SequenceGroup]: if not block_size: block_size = decoder_prompt_length @@ -102,7 +99,6 @@ def create_dummy_prompt_encoder_decoder( seq_group = SequenceGroup(request_id=request_id, seqs=[decoder_prompt], - sampling_params=SamplingParams(best_of=best_of), arrival_time=time.time(), lora_request=lora_request, encoder_seq=encoder_prompt) From 52e368f3e3ced21447aef1234152a3d387a0ad59 Mon Sep 17 00:00:00 2001 From: vincent-4 Date: Thu, 27 Feb 2025 19:20:06 -0500 Subject: [PATCH 05/14] remove some more safe ones Signed-off-by: vincent-4 --- vllm/entrypoints/llm.py | 3 --- vllm/entrypoints/openai/serving_completion.py | 4 +--- vllm/sampling_params.py | 17 ----------------- 3 files changed, 1 insertion(+), 23 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index affe7bcc55b1..d27e898eb0bd 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -98,9 +98,6 @@ class LLM: memory (OOM) errors. # TODO: How to rewrite this swap_space: The size (GiB) of CPU memory per GPU to use as swap space. - This can be used for temporarily storing the states of the requests - when their `best_of` sampling parameters are larger than 1. If all - requests will have `best_of=1`, you can safely set this to 0. Otherwise, too small values may cause out-of-memory (OOM) errors. cpu_offload_gb: The size (GiB) of CPU memory to use for offloading the model weights. This virtually increases the GPU memory space diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 0b4ed781c512..592f213b6f5e 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -168,9 +168,7 @@ async def create_completion( model_name = self._get_model_name(request.model, lora_request) num_prompts = len(engine_prompts) - # Similar to the OpenAI API, when n != best_of, we do not stream the - # results. In addition, we do not stream the results when use - # beam search. + # We do not stream the results when use beam search. stream = (request.stream and not request.use_beam_search) # Streaming response diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 17e4e43387dd..4bbc40caadf3 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -303,20 +303,6 @@ def from_optional( ) def __post_init__(self) -> None: - # how we deal with `best_of``: - # if `best_of`` is not set, we default to `n`; - # if `best_of`` is set, we set `n`` to `best_of`, - # and set `_real_n`` to the original `n`. - # when we return the result, we will check - # if we need to return `n` or `_real_n` results - if self.best_of: - if self.best_of < self.n: - raise ValueError( - f"best_of must be greater than or equal to n, " - f"got n={self.n} and best_of={self.best_of}.") - if not self._real_n: - self._real_n = self.n - self.n = self.best_of if 0 < self.temperature < _MAX_TEMP: logger.warning( @@ -423,9 +409,6 @@ def _verify_args(self) -> None: raise ValueError( "stop strings are only supported when detokenize is True. 
" "Set detokenize=True to use stop.") - if self.best_of != self._real_n and self.output_kind == ( - RequestOutputKind.DELTA): - raise ValueError("best_of must equal n to use output_kind=DELTA") def _verify_greedy_sampling(self) -> None: if self.n > 1: From 26b48642e8b44839673eb2e63f36c25263adb2d6 Mon Sep 17 00:00:00 2001 From: Brayden Zhong Date: Thu, 27 Feb 2025 19:21:14 -0500 Subject: [PATCH 06/14] remove args in benchmarks (#1) Signed-off-by: Brayden Zhong Signed-off-by: vincent-4 --- benchmarks/backend_request_func.py | 5 ----- benchmarks/benchmark_serving.py | 1 - 2 files changed, 6 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 158705769b5e..d53428d219e7 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -27,7 +27,6 @@ class RequestFuncInput: output_len: int model: str model_name: Optional[str] = None - best_of: int = 1 logprobs: Optional[int] = None extra_body: Optional[dict] = None multi_modal_content: Optional[dict] = None @@ -58,7 +57,6 @@ async def async_request_tgi( async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: params = { - "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, "do_sample": True, "temperature": 0.01, # TGI does not accept 0.0 temperature. @@ -130,7 +128,6 @@ async def async_request_trt_llm( async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: - assert request_func_input.best_of == 1 payload = { "accumulate_tokens": True, "text_input": request_func_input.prompt, @@ -195,7 +192,6 @@ async def async_request_deepspeed_mii( ) -> RequestFuncOutput: async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: - assert request_func_input.best_of == 1 payload = { "prompt": request_func_input.prompt, @@ -249,7 +245,6 @@ async def async_request_openai_completions( if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, - "best_of": request_func_input.best_of, "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 16ec0a4817a2..92e939b4fd44 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -592,7 +592,6 @@ async def benchmark( prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=test_mm_content, ignore_eos=ignore_eos, ) From 93ac7b7ea4c458bda764db6fe866a981af0d9396 Mon Sep 17 00:00:00 2001 From: vincent-4 Date: Thu, 27 Feb 2025 19:23:58 -0500 Subject: [PATCH 07/14] remove the rest Signed-off-by: vincent-4 --- benchmarks/benchmark_serving.py | 8 -------- vllm/sampling_params.py | 7 ------- 2 files changed, 15 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 92e939b4fd44..89c1d0169880 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -982,7 +982,6 @@ def main(args: argparse.Namespace): result_json["backend"] = backend result_json["model_id"] = model_id result_json["tokenizer_id"] = tokenizer_id - result_json["best_of"] = args.best_of result_json["num_prompts"] = args.num_prompts # Metadata @@ -1080,13 +1079,6 @@ def main(args: argparse.Namespace): help= "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ) - parser.add_argument( - "--best-of", - 
type=int, - default=1, - help="Generates `best_of` sequences per prompt and " - "returns the best one.", - ) parser.add_argument("--use-beam-search", action="store_true") parser.add_argument( "--num-prompts", diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 4bbc40caadf3..599d52ee670b 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -116,10 +116,6 @@ class SamplingParams( Args: n: Number of output sequences to return for the given prompt. - best_of: Number of output sequences that are generated from the prompt. - From these `best_of` sequences, the top `n` sequences are returned. - `best_of` must be greater than or equal to `n`. By default, - `best_of` is set to `n`. presence_penalty: Float that penalizes new tokens based on whether they appear in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat @@ -187,7 +183,6 @@ class SamplingParams( """ n: int = 1 - best_of: Optional[int] = None _real_n: Optional[int] = None presence_penalty: float = 0.0 frequency_penalty: float = 0.0 @@ -231,7 +226,6 @@ class SamplingParams( @staticmethod def from_optional( n: Optional[int] = 1, - best_of: Optional[int] = None, presence_penalty: Optional[float] = 0.0, frequency_penalty: Optional[float] = 0.0, repetition_penalty: Optional[float] = 1.0, @@ -270,7 +264,6 @@ def from_optional( return SamplingParams( n=1 if n is None else n, - best_of=best_of, presence_penalty=0.0 if presence_penalty is None else presence_penalty, frequency_penalty=0.0 From 9a41a7599649cd7269fe19ea4d8b23223a407895 Mon Sep 17 00:00:00 2001 From: Brayden Zhong Date: Thu, 27 Feb 2025 19:27:24 -0500 Subject: [PATCH 08/14] Remove more Signed-off-by: Brayden Zhong Signed-off-by: vincent-4 --- benchmarks/benchmark_serving.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 89c1d0169880..68ca2dc8f7d2 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -560,7 +560,6 @@ async def benchmark( tokenizer: PreTrainedTokenizerBase, input_requests: list[tuple[str, int, int]], logprobs: Optional[int], - best_of: int, request_rate: float, burstiness: float, disable_tqdm: bool, @@ -618,7 +617,6 @@ async def benchmark( prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=test_mm_content, ignore_eos=ignore_eos) profile_output = await request_func(request_func_input=profile_input) @@ -667,7 +665,6 @@ async def limited_request_func(request_func_input, pbar): prompt_len=prompt_len, output_len=output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=mm_content, ignore_eos=ignore_eos) tasks.append( @@ -685,7 +682,6 @@ async def limited_request_func(request_func_input, pbar): prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, ) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: @@ -957,7 +953,6 @@ def main(args: argparse.Namespace): tokenizer=tokenizer, input_requests=input_requests, logprobs=args.logprobs, - best_of=args.best_of, request_rate=args.request_rate, burstiness=args.burstiness, disable_tqdm=args.disable_tqdm, From 330324a0e0a97720d3429f6bd3f0626379bfc824 Mon Sep 17 00:00:00 2001 From: vincent-4 Date: Thu, 27 Feb 2025 19:31:52 -0500 Subject: [PATCH 09/14] finally Signed-off-by: vincent-4 --- vllm/entrypoints/llm.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index d27e898eb0bd..fe2b68257fc6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -96,7 +96,6 @@ class LLM: values will increase the KV cache size and thus improve the model's throughput. However, if the value is too high, it may cause out-of- memory (OOM) errors. - # TODO: How to rewrite this swap_space: The size (GiB) of CPU memory per GPU to use as swap space. Otherwise, too small values may cause out-of-memory (OOM) errors. cpu_offload_gb: The size (GiB) of CPU memory to use for offloading From 2319cce4ac8ee9079371201ab6ce8694536d6600 Mon Sep 17 00:00:00 2001 From: vincent-4 Date: Thu, 27 Feb 2025 19:40:15 -0500 Subject: [PATCH 10/14] pre-commit fix Signed-off-by: vincent-4 --- tests/core/utils.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tests/core/utils.py b/tests/core/utils.py index a726b9f1fc09..ea18b879a317 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -31,16 +31,19 @@ def create_dummy_prompt( prompt_tokens = list(range(prompt_length)) prompt_str = " ".join([str(t) for t in prompt_tokens]) - prompt = Sequence(int(request_id), - inputs=token_inputs(prompt_tokens, prompt=prompt_str), - block_size=block_size) - seq_group = SequenceGroup(request_id=request_id, - seqs=[prompt], - arrival_time=time.time(), - sampling_params=SamplingParams( - max_tokens=max_tokens, - min_tokens=min_tokens), - lora_request=lora_request) + prompt = Sequence( + int(request_id), + inputs=token_inputs(prompt_tokens, prompt=prompt_str), + block_size=block_size, + ) + seq_group = SequenceGroup( + request_id=request_id, + seqs=[prompt], + arrival_time=time.time(), + sampling_params=SamplingParams(max_tokens=max_tokens, + min_tokens=min_tokens), + lora_request=lora_request, + ) return prompt, seq_group From 22a7c356c98f2d65b6769a5eab7fae9e85bd7e10 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 5 Mar 2025 18:43:59 +0100 Subject: [PATCH 11/14] Update vllm/entrypoints/llm.py Signed-off-by: vincent-4 --- vllm/entrypoints/llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index fe2b68257fc6..dd46a1376adf 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -97,7 +97,7 @@ class LLM: throughput. However, if the value is too high, it may cause out-of- memory (OOM) errors. swap_space: The size (GiB) of CPU memory per GPU to use as swap space. - Otherwise, too small values may cause out-of-memory (OOM) errors. + Too small values may cause out-of-memory (OOM) errors. cpu_offload_gb: The size (GiB) of CPU memory to use for offloading the model weights. This virtually increases the GPU memory space you can use to hold the model weights, at the cost of CPU-GPU data From 333083e8a60970363ced2139b55ca90c57e4061f Mon Sep 17 00:00:00 2001 From: vincent-4 Date: Wed, 5 Mar 2025 12:54:01 -0500 Subject: [PATCH 12/14] remove the last instance Signed-off-by: vincent-4 --- vllm/v1/engine/processor.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 713a5d38dfdd..6a2c1c545f1b 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -93,9 +93,6 @@ def _validate_supported_sampling_params( self, params: SamplingParams, ) -> None: - # Best of not yet supported. - if params.best_of: - raise ValueError("VLLM V1 does not yet support best_of.") # Bad words not yet supported. 
if params.bad_words: raise ValueError("VLLM V1 does not yet support bad_words.") From 4906f4a21cae67f3c824c8212799892b84213b7c Mon Sep 17 00:00:00 2001 From: Vincent Date: Wed, 5 Mar 2025 13:03:29 -0500 Subject: [PATCH 13/14] Update test_sampling_params_e2e.py Signed-off-by: vincent-4 --- tests/v1/sample/test_sampling_params_e2e.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py index e47f13f05316..fae10477c974 100644 --- a/tests/v1/sample/test_sampling_params_e2e.py +++ b/tests/v1/sample/test_sampling_params_e2e.py @@ -24,15 +24,6 @@ def test_n_gt_1(model): outputs = model.generate(PROMPT, params) assert len(outputs[0].outputs) == 3 - -def test_best_of(model): - """Raise a ValueError since best_of is deprecated.""" - - params = SamplingParams(n=2, best_of=3) - with pytest.raises(ValueError): - _ = model.generate(PROMPT, params) - - def test_penalties(model): """Check that we do not get errors if applied.""" From 7be479c008d086217ca635ed38414a1baf3eb785 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 5 Mar 2025 19:20:37 +0100 Subject: [PATCH 14/14] Update tests/v1/sample/test_sampling_params_e2e.py Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/v1/sample/test_sampling_params_e2e.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py index fae10477c974..f17d4b77afc7 100644 --- a/tests/v1/sample/test_sampling_params_e2e.py +++ b/tests/v1/sample/test_sampling_params_e2e.py @@ -24,6 +24,7 @@ def test_n_gt_1(model): outputs = model.generate(PROMPT, params) assert len(outputs[0].outputs) == 3 + def test_penalties(model): """Check that we do not get errors if applied."""
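
For reference, a minimal sketch of offline usage once `best_of` is gone: candidate sequences are requested purely through `n`. This is illustrative only, not part of the patch series — it assumes a local vLLM install, reuses the `facebook/opt-125m` model and the prompt from the examples touched above, and the sampling values are arbitrary.

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")

    # Previously a caller might have generated best_of=5 candidates and kept
    # the top n=2; after this series, simply request the n sequences you want.
    params = SamplingParams(n=2, temperature=0.8, top_p=0.95)

    outputs = llm.generate("What is the meaning of life?", params)
    for completion in outputs[0].outputs:
        print(completion.text)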