From 8f233dbbe0f8fae13fa22ef86b79543207b22888 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 31 Jan 2025 16:35:31 +0000 Subject: [PATCH 01/10] Change default value of `generation_config` from `None` to `"auto"` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config.py | 16 +++++++--------- vllm/engine/arg_utils.py | 12 ++++++------ 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index f6bd8b1ad8f1..7f30b0b051db 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -225,7 +225,7 @@ def __init__( override_neuron_config: Optional[Dict[str, Any]] = None, override_pooler_config: Optional["PoolerConfig"] = None, logits_processor_pattern: Optional[str] = None, - generation_config: Optional[str] = None, + generation_config: Optional[str] = "auto", enable_sleep_mode: bool = False, override_generation_config: Optional[Dict[str, Any]] = None, ) -> None: @@ -889,7 +889,7 @@ def get_multimodal_config(self) -> "MultiModalConfig": return self.multimodal_config def try_get_generation_config(self) -> Dict[str, Any]: - if self.generation_config is None or self.generation_config == "auto": + if self.generation_config in ("auto", "ignore"): config = try_get_generation_config( self.model, trust_remote_code=self.trust_remote_code, @@ -909,17 +909,15 @@ def try_get_generation_config(self) -> Dict[str, Any]: def get_diff_sampling_param(self) -> Dict[str, Any]: """ This method returns a dictionary containing the parameters - that differ from the default sampling parameters, but only - if `generation_config` is set. If `generation_config` is not - set, an empty dictionary is returned. + that differ from the default sampling parameters. If + `generation_config` is `"ignore"`, an empty dictionary is returned. Returns: Dict[str, Any]: A dictionary with the differing sampling - parameters if `generation_config` is set, otherwise an - empty dictionary. + parameters, if `generation_config` is `"ignore"` an empty + dictionary. """ - if self.generation_config is None: - # When generation_config is not set + if self.generation_config == "ignore": config = {} else: config = self.try_get_generation_config() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cc7c99e50ac4..90cef1b9aff7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -934,13 +934,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( "--generation-config", type=nullable_str, - default=None, + default="auto", help="The folder path to the generation config. " - "Defaults to None, no generation config is loaded, vLLM defaults " - "will be used. If set to 'auto', the generation config will be " - "loaded from model path. If set to a folder path, the generation " - "config will be loaded from the specified folder path. If " - "`max_new_tokens` is specified in generation config, then " + "Defaults to 'auto', the generation config will be loaded from " + "model path. If set to 'ignore', no generation config is loaded, " + "vLLM defaults will be used. If set to a folder path, the " + "generation config will be loaded from the specified folder path. 
" + "If `max_new_tokens` is specified in generation config, then " "it sets a server-wide limit on the number of output tokens " "for all requests.") From 4f37698a2cac4f983b02f50487589f9251556e52 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 31 Jan 2025 16:35:56 +0000 Subject: [PATCH 02/10] Delete now obsolete example Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../basic_with_model_default_sampling.py | 30 ------------------- 1 file changed, 30 deletions(-) delete mode 100644 examples/offline_inference/basic_with_model_default_sampling.py diff --git a/examples/offline_inference/basic_with_model_default_sampling.py b/examples/offline_inference/basic_with_model_default_sampling.py deleted file mode 100644 index 346bb80b1e23..000000000000 --- a/examples/offline_inference/basic_with_model_default_sampling.py +++ /dev/null @@ -1,30 +0,0 @@ -from vllm import LLM - -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] - -# Create an LLM with built-in default generation config. -# The generation config is set to None by default to keep -# the behavior consistent with the previous version. -# If you want to use the default generation config from the model, -# you should set the generation_config to "auto". -llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", generation_config="auto") - -# Load the default sampling parameters from the model. -sampling_params = llm.get_default_sampling_params() -# Modify the sampling parameters if needed. -sampling_params.temperature = 0.5 - -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") From 962e245c96a756c06d4f256e9d0f512dcb4bf3d0 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 31 Jan 2025 18:10:42 +0100 Subject: [PATCH 03/10] `ignore` -> `vllm` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config.py | 9 ++++----- vllm/engine/arg_utils.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 7f30b0b051db..07e854cdb08e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -889,7 +889,7 @@ def get_multimodal_config(self) -> "MultiModalConfig": return self.multimodal_config def try_get_generation_config(self) -> Dict[str, Any]: - if self.generation_config in ("auto", "ignore"): + if self.generation_config in ("auto", "vllm"): config = try_get_generation_config( self.model, trust_remote_code=self.trust_remote_code, @@ -910,14 +910,13 @@ def get_diff_sampling_param(self) -> Dict[str, Any]: """ This method returns a dictionary containing the parameters that differ from the default sampling parameters. If - `generation_config` is `"ignore"`, an empty dictionary is returned. + `generation_config` is `"vllm"`, an empty dictionary is returned. Returns: Dict[str, Any]: A dictionary with the differing sampling - parameters, if `generation_config` is `"ignore"` an empty - dictionary. + parameters, if `generation_config` is `"vllm"` an empty dictionary. 
""" - if self.generation_config == "ignore": + if self.generation_config == "vllm": config = {} else: config = self.try_get_generation_config() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 90cef1b9aff7..b98fce66b269 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -937,7 +937,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default="auto", help="The folder path to the generation config. " "Defaults to 'auto', the generation config will be loaded from " - "model path. If set to 'ignore', no generation config is loaded, " + "model path. If set to 'vllm', no generation config is loaded, " "vLLM defaults will be used. If set to a folder path, the " "generation config will be loaded from the specified folder path. " "If `max_new_tokens` is specified in generation config, then " From 04f088fee10c80fe67a046310203ab9898d6fa09 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 31 Jan 2025 18:34:36 +0100 Subject: [PATCH 04/10] Improve info logs Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/llm.py | 4 ++++ vllm/entrypoints/openai/serving_chat.py | 6 ++++-- vllm/entrypoints/openai/serving_completion.py | 7 ++++--- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 46b595b0da73..1e17c82337f5 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -268,6 +268,10 @@ def get_default_sampling_params(self) -> SamplingParams: diff_sampling_param = ( self.llm_engine.model_config.get_diff_sampling_param()) if diff_sampling_param: + source = self.llm_engine.model_config.generation_config + source = "model" if source == "auto" else source + logger.info("Using default sampling params from %s: %s", source, + diff_sampling_param) return SamplingParams.from_optional(**diff_sampling_param) return SamplingParams() diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index dc97f0eb059d..ffb33c157a69 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -103,8 +103,10 @@ def __init__( self.enable_prompt_tokens_details = enable_prompt_tokens_details diff_sampling_param = self.model_config.get_diff_sampling_param() if diff_sampling_param: - logger.info("Overwriting default chat sampling param with: %s", - diff_sampling_param) + source = self.model_config.generation_config + source = "model" if source == "auto" else source + logger.info("Using default chat sampling params from %s: %s", + source, diff_sampling_param) async def create_chat_completion( self, diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 13c392636889..b00d00b4c62e 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -51,9 +51,10 @@ def __init__( return_tokens_as_token_ids=return_tokens_as_token_ids) diff_sampling_param = self.model_config.get_diff_sampling_param() if diff_sampling_param: - logger.info( - "Overwriting default completion sampling param with: %s", - diff_sampling_param) + source = self.model_config.generation_config + source = "model" if source == "auto" else source + logger.info("Using default completion sampling params from %s: %s", + source, diff_sampling_param) async def create_completion( self, From bdfe06644a57ba86ff4c236fdc59e43de52c2481 Mon Sep 17 00:00:00 2001 From: 
Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 28 Feb 2025 13:17:02 +0100 Subject: [PATCH 05/10] Change default in `EngineArgs` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9c888b93968d..a9b30aef6da2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -205,7 +205,7 @@ class EngineArgs: kv_transfer_config: Optional[KVTransferConfig] = None - generation_config: Optional[str] = None + generation_config: Optional[str] = "auto" override_generation_config: Optional[Dict[str, Any]] = None enable_sleep_mode: bool = False model_impl: str = "auto" From d9d3ed0aa9c8b368355265502bc36ae6d488533c Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 28 Feb 2025 13:31:15 +0100 Subject: [PATCH 06/10] Remove `Optional` so mypy doesn't think it can be `None` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 1f9734e5e147..91a6b9011491 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -254,7 +254,7 @@ def __init__( override_neuron_config: Optional[Dict[str, Any]] = None, override_pooler_config: Optional["PoolerConfig"] = None, logits_processor_pattern: Optional[str] = None, - generation_config: Optional[str] = "auto", + generation_config: str = "auto", enable_sleep_mode: bool = False, override_generation_config: Optional[Dict[str, Any]] = None, model_impl: Union[str, ModelImpl] = ModelImpl.AUTO, From 44470235bec3b0747b92d6c68e2f036adb1a8faa Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 3 Mar 2025 17:45:16 +0100 Subject: [PATCH 07/10] Fix `Engine Test` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/test_config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_config.py b/tests/test_config.py index 709d60b83670..06264c5b99b9 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -289,7 +289,7 @@ def test_uses_mrope(model_id, uses_mrope): def test_generation_config_loading(): model_id = "Qwen/Qwen2.5-1.5B-Instruct" - # When set generation_config to None, the default generation config + # When set generation_config to "vllm", the default generation config # will not be loaded. model_config = ModelConfig(model_id, task="auto", @@ -298,7 +298,7 @@ def test_generation_config_loading(): trust_remote_code=False, seed=0, dtype="float16", - generation_config=None) + generation_config="vllm") assert model_config.get_diff_sampling_param() == {} # When set generation_config to "auto", the default generation config @@ -340,7 +340,7 @@ def test_generation_config_loading(): assert model_config.get_diff_sampling_param() == override_result - # When generation_config is set to None and override_generation_config + # When generation_config is set to "vllm" and override_generation_config # is set, the override_generation_config should be used directly. 
model_config = ModelConfig( model_id, @@ -350,7 +350,7 @@ def test_generation_config_loading(): trust_remote_code=False, seed=0, dtype="float16", - generation_config=None, + generation_config="vllm", override_generation_config=override_generation_config) assert model_config.get_diff_sampling_param() == override_generation_config From 7ce0efaff85d363ac0d91d48bc217b66f45ca5cf Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 3 Mar 2025 19:47:58 +0100 Subject: [PATCH 08/10] Do not read sampling params from file at runtime Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/openai/serving_chat.py | 14 ++++++-------- vllm/entrypoints/openai/serving_completion.py | 14 ++++++-------- vllm/entrypoints/openai/serving_transcription.py | 10 +++++----- 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 931a644a0cfd..53caa1791ca3 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -105,12 +105,13 @@ def __init__( "been registered") from e self.enable_prompt_tokens_details = enable_prompt_tokens_details - diff_sampling_param = self.model_config.get_diff_sampling_param() - if diff_sampling_param: + self.default_sampling_params = ( + self.model_config.get_diff_sampling_param()) + if self.default_sampling_params: source = self.model_config.generation_config source = "model" if source == "auto" else source logger.info("Using default chat sampling params from %s: %s", - source, diff_sampling_param) + source, self.default_sampling_params) async def create_chat_completion( self, @@ -212,17 +213,14 @@ async def create_chat_completion( sampling_params: Union[SamplingParams, BeamSearchParams] default_max_tokens = self.max_model_len - len( engine_prompt["prompt_token_ids"]) - # Build default sampling params - default_sampling_params = ( - self.model_config.get_diff_sampling_param()) if request.use_beam_search: sampling_params = request.to_beam_search_params( - default_max_tokens, default_sampling_params) + default_max_tokens, self.default_sampling_params) else: sampling_params = request.to_sampling_params( default_max_tokens, self.model_config.logits_processor_pattern, - default_sampling_params) + self.default_sampling_params) self._log_inputs(request_id, request_prompts[i], diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 549238b1aa2d..506a1febf272 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -51,12 +51,13 @@ def __init__( models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids) - diff_sampling_param = self.model_config.get_diff_sampling_param() - if diff_sampling_param: + self.default_sampling_params = ( + self.model_config.get_diff_sampling_param()) + if self.default_sampling_params: source = self.model_config.generation_config source = "model" if source == "auto" else source logger.info("Using default completion sampling params from %s: %s", - source, diff_sampling_param) + source, self.default_sampling_params) async def create_completion( self, @@ -120,17 +121,14 @@ async def create_completion( sampling_params: Union[SamplingParams, BeamSearchParams] default_max_tokens = self.max_model_len - len( engine_prompt["prompt_token_ids"]) - # Build default sampling params - default_sampling_params = ( - 
self.model_config.get_diff_sampling_param()) if request.use_beam_search: sampling_params = request.to_beam_search_params( - default_max_tokens, default_sampling_params) + default_max_tokens, self.default_sampling_params) else: sampling_params = request.to_sampling_params( default_max_tokens, self.model_config.logits_processor_pattern, - default_sampling_params) + self.default_sampling_params) request_id_item = f"{request_id}-{i}" diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index 77f016a5e0a4..402a0bb7a6b0 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -161,11 +161,12 @@ def __init__( request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids) - diff_sampling_param = self.model_config.get_diff_sampling_param() - if diff_sampling_param: + self.default_sampling_params = ( + self.model_config.get_diff_sampling_param()) + if self.default_sampling_params: logger.info( "Overwriting default completion sampling param with: %s", - diff_sampling_param) + self.default_sampling_params) async def _preprocess_transcription( self, @@ -273,9 +274,8 @@ async def create_transcription( try: # TODO(rob): subtract len of tokenized prompt. default_max_tokens = self.model_config.max_model_len - default_params = self.model_config.get_diff_sampling_param() sampling_params = request.to_sampling_params( - default_max_tokens, default_params) + default_max_tokens, self.default_sampling_params) self._log_inputs( request_id, From 3bbb647183f02bc6750245b8590709023921233b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 4 Mar 2025 18:16:37 +0100 Subject: [PATCH 09/10] Fix some entrypoint tests Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/entrypoints/openai/test_serving_chat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 1e7dbaf60dc0..19d16713b209 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -38,6 +38,7 @@ class MockModelConfig: diff_sampling_param: Optional[dict] = None allowed_local_media_path: str = "" encoder_config = None + generation_config: str = "auto" def get_diff_sampling_param(self): return self.diff_sampling_param or {} From 0a981975978c2f664ba5f183f1370a7a9b4b5d21 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 7 Mar 2025 15:22:03 +0100 Subject: [PATCH 10/10] Lower threshold for `lm_eval` tests Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/entrypoints/openai/correctness/test_lmeval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index 902df929e782..e4c087db3d4f 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -20,7 +20,7 @@ TASK = "gsm8k" FILTER = "exact_match,strict-match" RTOL = 0.03 -EXPECTED_VALUE = 0.58 +EXPECTED_VALUE = 0.54 DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"] MORE_ARGS_LIST = [ [], # Default
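
Taken together, the series flips the default so that a model's own generation_config is honoured out of the box ("auto"), while passing "vllm" opts back into vLLM's built-in defaults. Below is a minimal offline-inference sketch of the new default behaviour, adapted from the example file removed in patch 02; the model name is illustrative rather than prescriptive.

from vllm import LLM

prompts = [
    "Hello, my name is",
    "The capital of France is",
]

# With the new default (generation_config="auto"), the model's
# generation_config.json is loaded automatically and its values become
# the default sampling parameters, so no extra constructor argument is needed.
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")

# Inspect the defaults picked up from the model and adjust them if needed.
sampling_params = llm.get_default_sampling_params()
sampling_params.temperature = 0.5

# Keeping the pre-series behaviour (ignore the model's generation config)
# now requires an explicit opt-out:
#   llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", generation_config="vllm")

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")

The online-serving equivalent is the --generation-config CLI flag, which likewise defaults to 'auto' after this series and accepts 'vllm' or a folder path as alternatives.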