Skip to content
This repository was archived by the owner on May 15, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions deploy/components/inference-gateway/inferencemodel.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# InferenceModel: routes requests for model "food-review" to the
# vllm-llama3-8b-instruct InferencePool at Critical priority.
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: food-review
spec:
  modelName: food-review
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
141 changes: 141 additions & 0 deletions deploy/components/inference-gateway/inferencepool.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# InferencePool: selects vLLM pods labeled app=vllm-llama3-8b-instruct on
# port 8000 and delegates endpoint picking to the epp extension Service.
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
  # NOTE(review): the flattened source showed `name:` directly under
  # `labels:`, which would leave this resource without a metadata.name.
  # The InferenceModel poolRef and the epp extensionRef both resolve
  # "vllm-llama3-8b-instruct", so the name belongs here — confirm against
  # the original manifest.
  name: vllm-llama3-8b-instruct
spec:
  targetPortNumber: 8000
  selector:
    app: vllm-llama3-8b-instruct
  extensionRef:
    name: vllm-llama3-8b-instruct-epp
---
# Service fronting the endpoint-picker (epp) Deployment; the gateway calls
# it over gRPC (hence appProtocol: http2) on port 9002.
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama3-8b-instruct-epp
spec:
  selector:
    app: vllm-llama3-8b-instruct-epp
  ports:
    - protocol: TCP
      port: 9002
      targetPort: 9002
      appProtocol: http2
  type: ClusterIP
---
# Endpoint-picker (epp) Deployment: serves the ext-proc gRPC API on 9002 and
# gRPC health checks on 9003; watches the vllm-llama3-8b-instruct pool.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama3-8b-instruct-epp
  labels:
    app: vllm-llama3-8b-instruct-epp
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-llama3-8b-instruct-epp
  template:
    metadata:
      labels:
        app: vllm-llama3-8b-instruct-epp
    spec:
      # Conservatively, this timeout should mirror the longest grace period
      # of the pods within the pool.
      terminationGracePeriodSeconds: 130
      containers:
        - name: epp
          # image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
          image: gateway-api-inference-extension/epp:latest
          imagePullPolicy: IfNotPresent
          args:
            - -refreshMetricsInterval
            - "500ms"
            - -poolName
            - "vllm-llama3-8b-instruct"
            - -v
            - "4"
            - --zap-encoder
            - "json"
            - -grpcPort
            - "9002"
            - -grpcHealthPort
            - "9003"
          ports:
            - containerPort: 9002
            - containerPort: 9003
            - name: metrics
              containerPort: 9090
          # Both probes use the gRPC health service exposed on -grpcHealthPort.
          livenessProbe:
            grpc:
              port: 9003
              service: inference-extension
            initialDelaySeconds: 5
            periodSeconds: 10
          readinessProbe:
            grpc:
              port: 9003
              service: inference-extension
            initialDelaySeconds: 5
            periodSeconds: 10
---
# RBAC Role for the epp: read access to InferenceModels/InferencePools and
# Pods/EndpointSlices, plus token/subject-access reviews for auth.
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: pod-read
rules:
  - apiGroups:
      - "inference.networking.x-k8s.io"
    resources:
      - "inferencemodels"
    verbs:
      - "get"
      - "watch"
      - "list"
  - apiGroups:
      - ""
    resources:
      - "pods"
    verbs:
      - "get"
      - "watch"
      - "list"
  - apiGroups:
      - "inference.networking.x-k8s.io"
    resources:
      - "inferencepools"
    verbs:
      - "get"
      - "watch"
      - "list"
  - apiGroups:
      - "discovery.k8s.io"
    resources:
      - "endpointslices"
    verbs:
      - "get"
      - "watch"
      - "list"
  - apiGroups:
      - "authentication.k8s.io"
    resources:
      - "tokenreviews"
    verbs:
      - "create"
  - apiGroups:
      - "authorization.k8s.io"
    resources:
      - "subjectaccessreviews"
    verbs:
      - "create"
---
# Binds the pod-read Role to the namespace's default ServiceAccount
# (which the epp Deployment runs as — no serviceAccountName is set there).
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: pod-read-binding
subjects:
  - kind: ServiceAccount
    name: default
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: pod-read
79 changes: 11 additions & 68 deletions deploy/components/vllm-sim/deployments.yaml
Original file line number Diff line number Diff line change
@@ -1,86 +1,29 @@
# vLLM simulator instance on port 30801 (removed by this PR; reconstructed
# here from the flattened diff for readability of the pre-change state).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-30801
  labels:
    app: vllm-30801
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-30801
  template:
    metadata:
      labels:
        app: vllm-30801
        ai-aware-router-pod: "true"
      annotations:
        # Quoted: host:port strings should never be left to implicit typing.
        ai-aware-router-address: "127.0.0.1:30801"
    spec:
      containers:
        - name: vllm
          image: vllm-sim/vllm-sim:latest
          args:
            - "--port=30801"
            - "--model=model1"
            - "--lora=lora1,lora2"
          ports:
            - containerPort: 30801
---
# vLLM simulator instance on port 30802 (removed by this PR; reconstructed
# here from the flattened diff for readability of the pre-change state).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-30802
  labels:
    app: vllm-30802
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-30802
  template:
    metadata:
      labels:
        app: vllm-30802
        ai-aware-router-pod: "true"
      annotations:
        # Quoted: host:port strings should never be left to implicit typing.
        ai-aware-router-address: "127.0.0.1:30802"
    spec:
      containers:
        - name: vllm
          image: vllm-sim/vllm-sim:latest
          args:
            - "--port=30802"
            - "--model=model1"
            - "--lora=lora1,lora2"
          ports:
            - containerPort: 30802
---
# vLLM simulator Deployment (post-merge state). The scraped span interleaved
# old and new diff lines (duplicate name:/app:/image: keys — invalid YAML);
# this is the reconstruction using the 11 added lines the diff header reports.
# Pod labels carry app=vllm-llama3-8b-instruct so the InferencePool selector
# matches these pods.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-sim
  labels:
    app: vllm-llama3-8b-instruct
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-llama3-8b-instruct
  template:
    metadata:
      labels:
        app: vllm-llama3-8b-instruct
        ai-aware-router-pod: "true"
      annotations:
        # NOTE(review): this address still names the old port 30803 while the
        # container now listens on 8000 — confirm whether the PR dropped this
        # annotation or it should read "127.0.0.1:8000".
        ai-aware-router-address: "127.0.0.1:30803"
    spec:
      containers:
        - name: vllm
          image: quay.io/vllm-d/vllm-sim:0.0.1
          imagePullPolicy: IfNotPresent
          args:
            - "--port=8000"
            - "--model=food-review"
            # - "--lora=lora10,lora20,lora30"
            # - "--time-to-first-token=500"
          ports:
            - containerPort: 8000