Description
What happened + What you expected to happen
Environment
vLLM 0.7.2, DeepSeek-R1 671B, CUDA, on 8x NVIDIA H20.
nvidia-smi
(base) [root@adbpg-h20-test ~]# nvidia-smi
Thu Feb 27 19:11:35 2025
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA H20 Off | 00000000:00:01.0 Off | 0 |
| N/A 35C P0 116W / 500W | 320MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 1 NVIDIA H20 Off | 00000000:00:02.0 Off | 0 |
| N/A 32C P0 113W / 500W | 320MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 2 NVIDIA H20 Off | 00000000:00:03.0 Off | 0 |
| N/A 35C P0 117W / 500W | 320MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 3 NVIDIA H20 Off | 00000000:00:04.0 Off | 0 |
| N/A 32C P0 113W / 500W | 320MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 4 NVIDIA H20 Off | 00000000:00:05.0 Off | 0 |
| N/A 31C P0 114W / 500W | 320MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 5 NVIDIA H20 Off | 00000000:00:06.0 Off | 0 |
| N/A 35C P0 120W / 500W | 320MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 6 NVIDIA H20 Off | 00000000:00:07.0 Off | 0 |
| N/A 32C P0 113W / 500W | 320MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 7 NVIDIA H20 Off | 00000000:00:08.0 Off | 0 |
| N/A 35C P0 114W / 500W | 320MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
+---------------------------------------------------------------------------------------+
Start command
serve run serve-r1:build_app model="/data/DeepSeek-R1" pipeline-parallel-size=1 tensor-parallel-size=8 accelerator="GPU" max-model-len=4096
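For reference, a rough programmatic equivalent of this command (my sketch, not part of the deployment; importlib is used only because the hyphen in serve-r1.py rules out a plain import statement):

import importlib.util
from ray import serve

# Load serve-r1.py as a module; a normal `import serve_r1` cannot resolve the hyphenated file name.
spec = importlib.util.spec_from_file_location("serve_r1", "serve-r1.py")
serve_r1 = importlib.util.module_from_spec(spec)
spec.loader.exec_module(serve_r1)

# `serve run` roughly collects the key=value pairs into a dict and hands them to build_app.
app = serve_r1.build_app({
    "model": "/data/DeepSeek-R1",
    "pipeline-parallel-size": "1",
    "tensor-parallel-size": "8",
    "accelerator": "GPU",
    "max-model-len": "4096",
})
serve.run(app)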
Problem
The Serve log reports "load model using triton MLA" and then hangs. That is the problem: there was no such issue with vLLM 0.6.5, but only 0.7.2 supports DeepSeek-R1, so I switched to 0.7.2.
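To narrow down whether the hang comes from the Ray Serve integration or from vLLM 0.7.2 itself, a minimal check would be to build the engine directly with the same arguments (a sketch only; the argument values below mirror the serve run command):

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

# Same arguments as the `serve run` command, but constructing the engine directly,
# with no Ray Serve deployment involved. If this also hangs after the Triton MLA
# log line, the problem is in vLLM 0.7.2 itself rather than in the Serve wrapper.
engine_args = AsyncEngineArgs(
    model="/data/DeepSeek-R1",
    tensor_parallel_size=8,
    pipeline_parallel_size=1,
    max_model_len=4096,
)
engine = AsyncLLMEngine.from_engine_args(engine_args)
print("engine initialized")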
Versions / Dependencies
ray --version
2025-02-27 19:19:42,359 - INFO - Note: detected 128 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
2025-02-27 19:19:42,359 - INFO - Note: NumExpr detected 128 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2025-02-27 19:19:42,359 - INFO - NumExpr defaulting to 8 threads.
ray, version 2.40.0

vllm 0.7.2
nvcc:
NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Mon_Apr__3_17:16:06_PDT_2023
Cuda compilation tools, release 12.1, V12.1.105
Build cuda_12.1.r12.1/compiler.32688072_0
Reproduction script
Code: serve-r1.py
import os
from typing import Dict, Optional, List
import logging
from fastapi import FastAPI
from starlette.requests import Request
from starlette.responses import StreamingResponse, JSONResponse
from ray import serve
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    ChatCompletionResponse,
    ErrorResponse,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
#from vllm.entrypoints.openai.serving_model import LoRAModulePath, PromptAdapterPath
from vllm.utils import FlexibleArgumentParser
from vllm.entrypoints.logger import RequestLogger
from dataclasses import dataclass
logger = logging.getLogger("ray.serve")
@dataclass
class BaseModelPath:
    name: str
    model_path: str


local_models = [BaseModelPath(name="/data/DeepSeek-R1/", model_path="DeepSeek-R1")]

app = FastAPI()


@serve.deployment(
    autoscaling_config={
        "min_replicas": 1,
        "max_replicas": 10,
        "target_ongoing_requests": 5,
    },
    max_ongoing_requests=10,
)
@serve.ingress(app)
class VLLMDeployment:
    def __init__(
        self,
        engine_args: AsyncEngineArgs,
        response_role: str,
        request_logger: Optional[RequestLogger] = None,
        chat_template: Optional[str] = None,
        chat_template_content_format: Optional[str] = None,
    ):
        logger.info(f"Starting with engine args: {engine_args}")
        # os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        self.openai_serving_chat = None
        self.engine_args = engine_args
        self.response_role = response_role
        self.request_logger = request_logger
        self.chat_template = chat_template
        self.chat_template_content_format = chat_template_content_format
        self.engine = AsyncLLMEngine.from_engine_args(engine_args)

    @app.post("/v1/chat/completions")
    async def create_chat_completion(
        self, request: ChatCompletionRequest, raw_request: Request
    ):
        if not self.openai_serving_chat:
            model_config1 = await self.engine.get_model_config()
            # Determine the name of the served model for the OpenAI client.
            if self.engine_args.served_model_name is not None:
                served_model_names = self.engine_args.served_model_name
            else:
                served_model_names = [self.engine_args.model]
            serving_models = OpenAIServingModels(
                engine_client=self.engine,
                model_config=model_config1,
                base_model_paths=local_models,
                lora_modules=None,
                prompt_adapters=None,
            )
            self.openai_serving_chat = OpenAIServingChat(
                self.engine,
                model_config1,
                serving_models,
                self.response_role,
                request_logger=self.request_logger,
                chat_template=self.chat_template,
                chat_template_content_format=self.chat_template_content_format,
            )
        logger.info(f"Request: {request}")
        generator = await self.openai_serving_chat.create_chat_completion(
            request, raw_request
        )
        if isinstance(generator, ErrorResponse):
            return JSONResponse(
                content=generator.model_dump(), status_code=generator.code
            )
        if request.stream:
            return StreamingResponse(content=generator, media_type="text/event-stream")
        else:
            assert isinstance(generator, ChatCompletionResponse)
            return JSONResponse(content=generator.model_dump())

def parse_vllm_args(cli_args: Dict[str, str]):
    arg_parser = FlexibleArgumentParser(
        description="vLLM OpenAI-Compatible RESTful API server."
    )
    parser = make_arg_parser(arg_parser)
    arg_strings = []
    for key, value in cli_args.items():
        arg_strings.extend([f"--{key}", str(value)])
    logger.info(arg_strings)
    parsed_args = parser.parse_args(args=arg_strings)
    return parsed_args

def build_app(cli_args: Dict[str, str]) -> serve.Application:
    if "accelerator" in cli_args.keys():
        accelerator = cli_args.pop("accelerator")
    else:
        accelerator = "GPU"
    parsed_args = parse_vllm_args(cli_args)
    engine_args = AsyncEngineArgs.from_cli_args(parsed_args)
    engine_args.worker_use_ray = True
    tp = engine_args.tensor_parallel_size
    pp = engine_args.pipeline_parallel_size
    logger.info(f"Tensor parallelism = {tp}")
    pg_resources = []
    pg_resources.append({"CPU": 1})  # for the deployment replica
    for i in range(tp * pp):
        pg_resources.append({"GPU": 1, accelerator: 1})  # for the vLLM actors
    # The reference example uses the "STRICT_PACK" strategy to keep all vLLM actors
    # on the same Ray node; here "SPREAD" is used because the actors may span
    # multiple nodes.
    return VLLMDeployment.options(
        placement_group_bundles=pg_resources, placement_group_strategy="SPREAD"
    ).bind(
        engine_args,
        parsed_args.response_role,
        cli_args.get("request_logger"),
        parsed_args.chat_template,
    )
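Once the deployment is up, requests go to the OpenAI-style chat route defined above on Ray Serve's default HTTP port 8000. A minimal client sketch (the model name is an assumption; it should match the entry registered in local_models or a served_model_name):

import requests

# POST against the route registered by @app.post("/v1/chat/completions") above.
# Ray Serve's HTTP proxy listens on port 8000 by default.
resp = requests.post(
    "http://127.0.0.1:8000/v1/chat/completions",
    json={
        "model": "/data/DeepSeek-R1/",  # assumed to match the name in local_models
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 64,
        "stream": False,
    },
    timeout=600,
)
print(resp.status_code)
print(resp.json())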
Issue Severity
High: It blocks me from completing my task.