
Commit 0ecfe56

Fix HPU tensor parallelism

Signed-off-by: Konrad Zawora <[email protected]>

1 parent 87a0c07 commit 0ecfe56

3 files changed (+20 -3 lines)

vllm/config.py

Lines changed: 1 addition & 1 deletion

@@ -1285,7 +1285,7 @@ def __post_init__(self) -> None:
             raise ValueError(f"worker-use-ray can't be used with "
                              f"distributed executor backend "
                              f"'{self.distributed_executor_backend}'.")
-        ray_only_devices = ["tpu", "hpu"]
+        ray_only_devices = ["tpu"]
         from vllm.platforms import current_platform
         if (current_platform.device_type in ray_only_devices
                 and self.world_size > 1):
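With "hpu" removed from ray_only_devices, a multi-device HPU run is no longer forced onto the Ray distributed executor. As an illustrative sketch (model name and sizes are placeholders, not from the commit), a tensor-parallel HPU deployment can now request the multiprocessing backend through the standard vllm.LLM entry point:

    # Illustrative usage only; "facebook/opt-125m" is a placeholder model.
    from vllm import LLM

    llm = LLM(
        model="facebook/opt-125m",          # any HPU-supported model
        tensor_parallel_size=2,             # world_size > 1 used to force Ray on HPU
        distributed_executor_backend="mp",  # accepted on HPU after this change
    )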

vllm/executor/multiproc_worker_utils.py

Lines changed: 17 additions & 0 deletions

@@ -12,8 +12,10 @@
 
 import torch
 
+from vllm import envs
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.triton_utils.importing import HAS_TRITON
 from vllm.utils import _check_multiproc_method, get_mp_context, run_method
 
@@ -284,6 +286,21 @@ def set_multiprocessing_worker_envs(parallel_config):
     process before worker processes are created"""
 
     _check_multiproc_method()
+    if (current_platform.is_hpu()
+            and parallel_config.distributed_executor_backend == 'mp'
+            and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
+        if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) is not None:
+            logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork might "
+                           "cause application hangs on exit. Using "
+                           "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
+                           "as it was explicitly requested.")
+        else:
+            logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork might "
+                           "cause application hangs on exit. Setting "
+                           "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
+                           "To override that behavior, please set "
+                           "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
+            os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
     # Configure thread parallelism if OMP_NUM_THREADS isn't set
     #
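The guard distinguishes the implicit default from an explicit user choice: envs.VLLM_WORKER_MULTIPROC_METHOD falls back to a default (understood here to be "fork") when the variable is unset, so only the raw os.environ lookup reveals whether the user set it themselves. A minimal sketch of that default-vs-explicit pattern, with the vLLM-specific pieces stubbed out as assumptions:

    import os

    DEFAULT = "fork"  # assumed default, mirroring vllm.envs behavior

    def effective_method() -> str:
        # What envs.VLLM_WORKER_MULTIPROC_METHOD evaluates to: the
        # environment value if set, otherwise the built-in default.
        return os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", DEFAULT)

    def explicitly_set() -> bool:
        # Only a raw lookup can tell "user chose fork" apart from
        # "nothing was set and the default happens to be fork".
        return os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") is not None

    if effective_method() == "fork" and not explicitly_set():
        # Safe to override: the user never asked for fork.
        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"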

vllm/worker/hpu_worker.py

Lines changed: 2 additions & 2 deletions

@@ -130,7 +130,6 @@ def execute_model(
         self,
         execute_model_req: Optional[ExecuteModelRequest] = None,
     ) -> Optional[List[SamplerOutput]]:
-        assert execute_model_req is not None
         # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501
         # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501
         # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501
@@ -144,7 +143,8 @@ def execute_model(
             'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
         log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
                                            '0') != '0' or log_cpu_fallbacks_all
-        if log_graph_compilation or log_cpu_fallbacks:
+        if log_graph_compilation or log_cpu_fallbacks and \
+                execute_model_req is not None:
             from habana_frameworks.torch.hpu.metrics import metric_localcontext
             seq_group_metadata_list = execute_model_req.seq_group_metadata_list
             is_prompt = any([
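Since Python gives `and` higher precedence than `or`, the new guard groups as `log_graph_compilation or (log_cpu_fallbacks and execute_model_req is not None)`. A quick standalone check of that grouping, with arbitrary stand-in values:

    # Demonstrates operator precedence in the new condition; the values
    # below are hypothetical, purely to show how the expression parses.
    log_graph_compilation = True
    log_cpu_fallbacks = False
    execute_model_req = None

    implicit = (log_graph_compilation
                or log_cpu_fallbacks and execute_model_req is not None)
    explicit = (log_graph_compilation
                or (log_cpu_fallbacks and execute_model_req is not None))
    assert implicit == explicit  # `and` binds tighter than `or`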
