  token: "REPLACE_WITH_TOKEN"
```
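
If you prefer not to keep the token in a manifest file, a minimal alternative sketch: the same Secret can be created imperatively with `kubectl`. The Secret name `hf-token-secret` and key `token` match what the Deployment manifests below reference:

```bash
# Create the Hugging Face token Secret without writing it to a YAML file
kubectl create secret generic hf-token-secret \
  --namespace default \
  --from-literal=token="REPLACE_WITH_TOKEN"
```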

Next, create a deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.

Two example manifests follow, one for an NVIDIA GPU and one for an AMD GPU; either can be applied with the `kubectl` sketch shown after the list.

- NVIDIA GPU

  ```yaml
  apiVersion: apps/v1
  # ...
            periodSeconds: 5
  ```

- AMD GPU

  You can refer to the `deployment.yaml` below if you are using an AMD ROCm GPU such as the MI300X.

  ```yaml
  apiVersion: apps/v1
  kind: Deployment
  metadata:
    name: mistral-7b
    namespace: default
    labels:
      app: mistral-7b
  spec:
    replicas: 1
    selector:
      matchLabels:
        app: mistral-7b
    template:
      metadata:
        labels:
          app: mistral-7b
      spec:
        volumes:
        # PVC
        - name: cache-volume
          persistentVolumeClaim:
            claimName: mistral-7b
        # vLLM needs to access the host's shared memory for tensor parallel inference.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: "8Gi"
        hostNetwork: true
        hostIPC: true
        containers:
        - name: mistral-7b
          image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
          securityContext:
            seccompProfile:
              type: Unconfined
            runAsGroup: 44
            capabilities:
              add:
              - SYS_PTRACE
          command: ["/bin/sh", "-c"]
          args: [
            "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024"
          ]
          env:
          - name: HUGGING_FACE_HUB_TOKEN
            valueFrom:
              secretKeyRef:
                name: hf-token-secret
                key: token
          ports:
          - containerPort: 8000
          resources:
            limits:
              cpu: "10"
              memory: 20G
              amd.com/gpu: "1"
            requests:
              cpu: "6"
              memory: 6G
              amd.com/gpu: "1"
          volumeMounts:
          - name: cache-volume
            mountPath: /root/.cache/huggingface
          - name: shm
            mountPath: /dev/shm
  ```

  You can find the full example, with step-by-step instructions and sample YAML files, at <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.

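Whichever manifest matches your GPU vendor, the deployment is applied the same way. A minimal sketch, assuming you saved the chosen manifest as `deployment.yaml`:

```bash
# Apply the chosen Deployment manifest
kubectl apply -f deployment.yaml

# Wait for the rollout to finish, then confirm the pod is Running
kubectl rollout status deployment/mistral-7b
kubectl get pods -l app=mistral-7b
```
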
2. **Create a Kubernetes Service for vLLM**

   Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
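
   Alternatively, for quick testing, `kubectl expose` can generate an equivalent Service without a manifest file. This is a sketch; the service port 80 and target port 8000 are assumptions matching the container's `containerPort` above:

   ```bash
   # Create a ClusterIP Service that forwards port 80 to the pod's port 8000
   kubectl expose deployment mistral-7b --port=80 --target-port=8000 --type=ClusterIP

   # Forward a local port and probe the OpenAI-compatible API
   kubectl port-forward svc/mistral-7b 8080:80 &
   curl http://localhost:8080/v1/models
   ```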