Skip to content

Commit 8004347

Browse files
authored
Merge pull request kubernetes-sigs#24 from mayabar/dev
Add inference model and pool yamls
2 parents be9c800 + e41f154 commit 8004347

File tree

3 files changed

+161
-68
lines changed

3 files changed

+161
-68
lines changed
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
---
# InferenceModel "food-review": registers the model name "food-review",
# marks it Critical, and references the pool "vllm-llama3-8b-instruct".
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: food-review
spec:
  # Pool this model is attached to, referenced by name.
  poolRef:
    name: vllm-llama3-8b-instruct
  modelName: food-review
  criticality: Critical
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
---
# InferencePool "vllm-llama3-8b-instruct": selects pods labelled
# app=vllm-llama3-8b-instruct, targets port 8000 on them, and names
# "vllm-llama3-8b-instruct-epp" as its extension.
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
  # No labels are set on the pool itself; `name` is the resource name.
  name: vllm-llama3-8b-instruct
spec:
  targetPortNumber: 8000
  selector:
    app: vllm-llama3-8b-instruct
  extensionRef:
    name: vllm-llama3-8b-instruct-epp
---
# ClusterIP Service "vllm-llama3-8b-instruct-epp": forwards TCP port 9002 to
# port 9002 on pods labelled app=vllm-llama3-8b-instruct-epp, advertised
# with appProtocol http2.
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama3-8b-instruct-epp
spec:
  type: ClusterIP
  selector:
    app: vllm-llama3-8b-instruct-epp
  ports:
    - appProtocol: http2
      protocol: TCP
      port: 9002
      targetPort: 9002
---
# Deployment "vllm-llama3-8b-instruct-epp": one replica of the `epp`
# container. The container is started with gRPC port 9002 and gRPC health
# port 9003, and additionally declares a `metrics` port on 9090.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: vllm-llama3-8b-instruct-epp
  name: vllm-llama3-8b-instruct-epp
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-llama3-8b-instruct-epp
  template:
    metadata:
      labels:
        app: vllm-llama3-8b-instruct-epp
    spec:
      # Conservatively, this timeout should mirror the longest grace period of the pods within the pool
      terminationGracePeriodSeconds: 130
      containers:
        - name: epp
          # image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
          image: gateway-api-inference-extension/epp:latest
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 9002
            - containerPort: 9003
            - containerPort: 9090
              name: metrics
          # Each flag and its value are separate list items, in order.
          args:
            - '-refreshMetricsInterval'
            - '500ms'
            - '-poolName'
            - 'vllm-llama3-8b-instruct'
            - '-v'
            - '4'
            - '--zap-encoder'
            - 'json'
            - '-grpcPort'
            - '9002'
            - '-grpcHealthPort'
            - '9003'
          # Both probes run the same gRPC health check against port 9003.
          readinessProbe:
            initialDelaySeconds: 5
            periodSeconds: 10
            grpc:
              service: inference-extension
              port: 9003
          livenessProbe:
            initialDelaySeconds: 5
            periodSeconds: 10
            grpc:
              service: inference-extension
              port: 9003
---
# Role "pod-read": read access (get/watch/list) to inferencemodels,
# inferencepools, pods and endpointslices, plus create on tokenreviews and
# subjectaccessreviews.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: pod-read
rules:
  - apiGroups: ['inference.networking.x-k8s.io']
    resources: ['inferencemodels']
    verbs: ['get', 'watch', 'list']
  - apiGroups: ['']
    resources: ['pods']
    verbs: ['get', 'watch', 'list']
  - apiGroups: ['inference.networking.x-k8s.io']
    resources: ['inferencepools']
    verbs: ['get', 'watch', 'list']
  - apiGroups: ['discovery.k8s.io']
    resources: ['endpointslices']
    verbs: ['get', 'watch', 'list']
  # NOTE(review): tokenreviews and subjectaccessreviews are cluster-scoped
  # APIs; a namespaced Role may not be able to grant them. Confirm whether
  # these two rules need to live in a ClusterRole instead.
  - apiGroups: ['authentication.k8s.io']
    resources: ['tokenreviews']
    verbs: ['create']
  - apiGroups: ['authorization.k8s.io']
    resources: ['subjectaccessreviews']
    verbs: ['create']
---
# RoleBinding "pod-read-binding": grants the Role "pod-read" to the
# ServiceAccount named "default". Neither the binding nor the subject sets
# a namespace here.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: pod-read-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: pod-read
subjects:
  - kind: ServiceAccount
    name: default
Lines changed: 11 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,86 +1,29 @@
# Removed by this commit (every gutter marker below ends in "-"):
# Deployment "vllm-30801", one replica of vllm-sim/vllm-sim:latest started
# with --port=30801, --model=model1 and --lora=lora1,lora2, exposing
# container port 30801.
1-
apiVersion: apps/v1
2-
kind: Deployment
3-
metadata:
4-
name: vllm-30801
5-
labels:
6-
app: vllm-30801
7-
spec:
8-
replicas: 1
9-
selector:
10-
matchLabels:
11-
app: vllm-30801
12-
template:
13-
metadata:
14-
labels:
15-
app: vllm-30801
16-
ai-aware-router-pod: "true"
17-
annotations:
# The annotation's port matches the --port argument and containerPort below.
18-
ai-aware-router-address: 127.0.0.1:30801
19-
spec:
20-
containers:
21-
- name: vllm
22-
image: vllm-sim/vllm-sim:latest
23-
args:
24-
- "--port=30801"
25-
- "--model=model1"
26-
- "--lora=lora1,lora2"
27-
ports:
28-
- containerPort: 30801
# Removed by this commit (every gutter marker below ends in "-"):
# Deployment "vllm-30802", identical in shape to "vllm-30801" above except
# that the name, app label, annotation address, --port argument and
# containerPort all use 30802.
29-
---
30-
apiVersion: apps/v1
31-
kind: Deployment
32-
metadata:
33-
name: vllm-30802
34-
labels:
35-
app: vllm-30802
36-
spec:
37-
replicas: 1
38-
selector:
39-
matchLabels:
40-
app: vllm-30802
41-
template:
42-
metadata:
43-
labels:
44-
app: vllm-30802
45-
ai-aware-router-pod: "true"
46-
annotations:
47-
ai-aware-router-address: 127.0.0.1:30802
48-
spec:
49-
containers:
50-
- name: vllm
51-
image: vllm-sim/vllm-sim:latest
52-
args:
53-
- "--port=30802"
54-
- "--model=model1"
55-
- "--lora=lora1,lora2"
56-
ports:
57-
- containerPort: 30802
# Modified by this commit: Deployment "vllm-30803" becomes "vllm-sim".
# Gutter markers: "N-" is a removed line, "N+" an added line; markers with
# neither sign appear to be the old and new line numbers run together
# (e.g. "581" = old line 58 / new line 1) and mark unchanged context.
581
---
592
apiVersion: apps/v1
603
kind: Deployment
614
metadata:
62-
name: vllm-30803
5+
name: vllm-sim
636
labels:
64-
app: vllm-30803
# The new app label equals the InferencePool selector value
# (app: vllm-llama3-8b-instruct) defined earlier in this commit.
7+
app: vllm-llama3-8b-instruct
658
spec:
669
replicas: 1
6710
selector:
6811
matchLabels:
69-
app: vllm-30803
12+
app: vllm-llama3-8b-instruct
7013
template:
7114
metadata:
7215
labels:
73-
app: vllm-30803
16+
app: vllm-llama3-8b-instruct
7417
ai-aware-router-pod: "true"
# The ai-aware-router-address annotation is removed and not replaced.
75-
annotations:
76-
ai-aware-router-address: 127.0.0.1:30803
7718
spec:
7819
containers:
7920
- name: vllm
80-
image: vllm-sim/vllm-sim:latest
# Image moves from a floating :latest tag to a pinned 0.0.1 tag.
21+
image: quay.io/vllm-d/vllm-sim:0.0.1
22+
imagePullPolicy: IfNotPresent
8123
args:
82-
- "--port=30803"
83-
- "--model=model2"
84-
- "--lora=lora3"
# New port 8000 equals the InferencePool's targetPortNumber, and the new
# model name equals the InferenceModel's modelName ("food-review").
24+
- "--port=8000"
25+
- "--model=food-review"
26+
# - "--lora=lora10,lora20,lora30"
27+
# - "--time-to-first-token=500"
8528
ports:
86-
- containerPort: 30803
29+
- containerPort: 8000

0 commit comments

Comments
 (0)