 """

 from ray import serve
-from ray.serve.llm import LLMConfig, LLMRouter, LLMServer
+from ray.serve.llm import LLMConfig, build_openai_app

 llm_config = LLMConfig(
-    model_loading_config=dict(
-        model_id="deepseek",
-        # Change to model download path
-        model_source="/path/to/the/model",
-    ),
-    deployment_config=dict(autoscaling_config=dict(
-        min_replicas=1,
-        max_replicas=1,
-    )),
+    model_loading_config={
+        "model_id": "deepseek",
+        # Since the DeepSeek model is huge, it is recommended to pre-download
+        # the model to local disk, say /path/to/the/model, and specify:
+        # model_source="/path/to/the/model"
+        "model_source": "deepseek-ai/DeepSeek-R1",
+    },
+    deployment_config={
+        "autoscaling_config": {
+            "min_replicas": 1,
+            "max_replicas": 1,
+        }
+    },
     # Change to the accelerator type of the node
     accelerator_type="H100",
-    runtime_env=dict(env_vars=dict(VLLM_USE_V1="1")),
+    runtime_env={"env_vars": {
+        "VLLM_USE_V1": "1"
+    }},
     # Customize engine arguments as needed (e.g. vLLM engine kwargs)
-    engine_kwargs=dict(
-        tensor_parallel_size=8,
-        pipeline_parallel_size=2,
-        gpu_memory_utilization=0.92,
-        dtype="auto",
-        max_num_seqs=40,
-        max_model_len=16384,
-        enable_chunked_prefill=True,
-        enable_prefix_caching=True,
-        trust_remote_code=True,
-    ),
+    engine_kwargs={
+        "tensor_parallel_size": 8,
+        "pipeline_parallel_size": 2,
+        "gpu_memory_utilization": 0.92,
+        "dtype": "auto",
+        "max_num_seqs": 40,
+        "max_model_len": 16384,
+        "enable_chunked_prefill": True,
+        "enable_prefix_caching": True,
+        "trust_remote_code": True,
+    },
 )

 # Deploy the application
-deployment = LLMServer.as_deployment(
-    llm_config.get_serve_options(name_prefix="vLLM:")).bind(llm_config)
-llm_app = LLMRouter.as_deployment().bind([deployment])
+llm_app = build_openai_app({"llm_configs": [llm_config]})
 serve.run(llm_app)
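
Once `serve.run(llm_app)` is up, the application exposes an OpenAI-compatible HTTP API. A minimal sketch of a client request, assuming Ray Serve's default HTTP address of `http://localhost:8000` and the `model_id` of `"deepseek"` configured above:

```python
from openai import OpenAI

# The client requires an API key argument, but the local Serve
# endpoint does not validate it, so any placeholder works.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

response = client.chat.completions.create(
    model="deepseek",  # must match model_id in model_loading_config
    messages=[{"role": "user", "content": "Hello, who are you?"}],
)
print(response.choices[0].message.content)
```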