From 0ecfe561270c9deb1e932ec527f1855429f94865 Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Fri, 17 Jan 2025 16:07:17 +0200
Subject: [PATCH 1/5] Fix HPU tensor parallelism

Signed-off-by: Konrad Zawora
---
 vllm/config.py                          |  2 +-
 vllm/executor/multiproc_worker_utils.py | 17 +++++++++++++++++
 vllm/worker/hpu_worker.py               |  4 ++--
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index ac5a4c91b173..bf709d9f2861 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1285,7 +1285,7 @@ def __post_init__(self) -> None:
                 raise ValueError(f"worker-use-ray can't be used with "
                                  f"distributed executor backend "
                                  f"'{self.distributed_executor_backend}'.")
-        ray_only_devices = ["tpu", "hpu"]
+        ray_only_devices = ["tpu"]
         from vllm.platforms import current_platform
         if (current_platform.device_type in ray_only_devices
                 and self.world_size > 1):
diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index 539b6ae2d357..27472f46dda7 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -12,8 +12,10 @@

 import torch

+from vllm import envs
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.triton_utils.importing import HAS_TRITON
 from vllm.utils import _check_multiproc_method, get_mp_context, run_method

@@ -284,6 +286,21 @@ def set_multiprocessing_worker_envs(parallel_config):
     process before worker processes are created"""
     _check_multiproc_method()

+    if (current_platform.is_hpu()
+            and parallel_config.distributed_executor_backend == 'mp'
+            and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
+        if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) is not None:
+            logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork might "
+                           "cause application hangs on exit. Using "
+                           "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
+                           "as it was explicitly requested.")
+        else:
+            logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork might "
+                           "cause application hangs on exit. Setting "
+                           "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
+                           "To override that behavior, please set "
+                           "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
+            os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

     # Configure thread parallelism if OMP_NUM_THREADS isn't set
     #
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 9401241073c7..8bf1467496a8 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -130,7 +130,6 @@ def execute_model(
         self,
         execute_model_req: Optional[ExecuteModelRequest] = None,
     ) -> Optional[List[SamplerOutput]]:
-        assert execute_model_req is not None
         # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501
         # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501
         # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501
@@ -144,7 +143,8 @@ def execute_model(
             'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
         log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
                                            '0') != '0' or log_cpu_fallbacks_all
-        if log_graph_compilation or log_cpu_fallbacks:
+        if log_graph_compilation or log_cpu_fallbacks and \
+            execute_model_req is not None:
             from habana_frameworks.torch.hpu.metrics import metric_localcontext
             seq_group_metadata_list = execute_model_req.seq_group_metadata_list
             is_prompt = any([

From 68f1a43ef36e2c07ead2fbdef26a81e671640a4d Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Fri, 17 Jan 2025 16:32:59 +0200
Subject: [PATCH 2/5] oh shoot! i missed some brackets!

Signed-off-by: Konrad Zawora
---
 vllm/worker/hpu_worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 8bf1467496a8..3c570212625c 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -143,7 +143,7 @@ def execute_model(
             'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
         log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
                                            '0') != '0' or log_cpu_fallbacks_all
-        if log_graph_compilation or log_cpu_fallbacks and \
+        if (log_graph_compilation or log_cpu_fallbacks) and \
             execute_model_req is not None:
             from habana_frameworks.torch.hpu.metrics import metric_localcontext
             seq_group_metadata_list = execute_model_req.seq_group_metadata_list

From d8e9efc0a03795dde8cd08cf42795eff7d084329 Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Wed, 22 Jan 2025 16:13:20 +0200
Subject: [PATCH 3/5] move mp method checks to hpu.py

Signed-off-by: Konrad Zawora
---
 vllm/executor/multiproc_worker_utils.py | 15 ---------------
 vllm/platforms/hpu.py                   | 18 ++++++++++++++++++
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index 27472f46dda7..c6f0f9721b03 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -286,21 +286,6 @@ def set_multiprocessing_worker_envs(parallel_config):
     process before worker processes are created"""
     _check_multiproc_method()

-    if (current_platform.is_hpu()
-            and parallel_config.distributed_executor_backend == 'mp'
-            and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
-        if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) is not None:
-            logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork might "
-                           "cause application hangs on exit. Using "
-                           "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
-                           "as it was explicitly requested.")
-        else:
-            logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork might "
-                           "cause application hangs on exit. Setting "
-                           "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
-                           "To override that behavior, please set "
-                           "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
-            os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

     # Configure thread parallelism if OMP_NUM_THREADS isn't set
     #
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index 242c2c127979..a32c262c84ef 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -1,7 +1,9 @@
+import os
 from typing import TYPE_CHECKING, Optional

 import torch

+from vllm import envs
 from vllm.logger import init_logger

 from .interface import Platform, PlatformEnum, _Backend
@@ -58,6 +60,22 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         cache_config = vllm_config.cache_config
         if cache_config and cache_config.block_size is None:
             cache_config.block_size = 128
+        if (parallel_config.distributed_executor_backend == 'mp'
+                and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
+            if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD",
+                              None) is not None:
+                logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
+                               "might cause application hangs on exit. Using "
+                               "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
+                               "as it was explicitly requested.")
+            else:
+                logger.warning(
+                    "On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
+                    "might cause application hangs on exit. Setting "
+                    "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
+                    "To override that behavior, please set "
+                    "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
+                os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

     @classmethod
     def is_pin_memory_available(cls):

From db124cb7a7d0581f44312f2d64123efdf256e6de Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Wed, 22 Jan 2025 16:15:29 +0200
Subject: [PATCH 4/5] update --distributed-executor-backend help

Signed-off-by: Konrad Zawora
---
 vllm/engine/arg_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index a4f4c9558d05..84c320f51ea6 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -396,7 +396,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             'or equal to the number of GPUs available, "mp" will be used to '
             'keep processing on a single host. Otherwise, this will default '
             'to "ray" if Ray is installed and fail otherwise. Note that tpu '
-            'and hpu only support Ray for distributed inference.')
+            'only supports Ray for distributed inference.')

         parser.add_argument(
             '--worker-use-ray',

From a081b74db14c55164a1056e414f9833f3891296e Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Wed, 22 Jan 2025 16:18:55 +0200
Subject: [PATCH 5/5] remove unnecessary imports

Signed-off-by: Konrad Zawora
---
 vllm/executor/multiproc_worker_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index c6f0f9721b03..539b6ae2d357 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -12,10 +12,8 @@

 import torch

-from vllm import envs
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.triton_utils.importing import HAS_TRITON
 from vllm.utils import _check_multiproc_method, get_mp_context, run_method