Skip to content
This repository was archived by the owner on May 15, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions deploy/components/inference-gateway/inferencemodel.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# InferenceModel: routes requests for model "food-review" to the
# vllm-llama3-8b-instruct InferencePool at Critical priority.
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: food-review
spec:
  modelName: food-review
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
141 changes: 141 additions & 0 deletions deploy/components/inference-gateway/inferencepool.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# InferencePool: selects vLLM pods labeled app=vllm-llama3-8b-instruct on
# port 8000 and delegates endpoint picking to the epp extension Service.
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
  # NOTE(review): the flattened source showed `name:` directly under
  # `labels:`, which would leave this resource without a metadata.name.
  # The InferenceModel poolRef and the epp extensionRef both resolve
  # "vllm-llama3-8b-instruct", so the name belongs here — confirm against
  # the original manifest.
  name: vllm-llama3-8b-instruct
spec:
  targetPortNumber: 8000
  selector:
    app: vllm-llama3-8b-instruct
  extensionRef:
    name: vllm-llama3-8b-instruct-epp
---
# Service fronting the endpoint-picker (epp) Deployment; the gateway calls
# it over gRPC (hence appProtocol: http2) on port 9002.
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama3-8b-instruct-epp
spec:
  selector:
    app: vllm-llama3-8b-instruct-epp
  ports:
    - protocol: TCP
      port: 9002
      targetPort: 9002
      appProtocol: http2
  type: ClusterIP
---
# Endpoint-picker (epp) Deployment: serves the ext-proc gRPC API on 9002 and
# gRPC health checks on 9003; watches the vllm-llama3-8b-instruct pool.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama3-8b-instruct-epp
  labels:
    app: vllm-llama3-8b-instruct-epp
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-llama3-8b-instruct-epp
  template:
    metadata:
      labels:
        app: vllm-llama3-8b-instruct-epp
    spec:
      # Conservatively, this timeout should mirror the longest grace period
      # of the pods within the pool.
      terminationGracePeriodSeconds: 130
      containers:
        - name: epp
          # image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
          image: gateway-api-inference-extension/epp:latest
          imagePullPolicy: IfNotPresent
          args:
            - -refreshMetricsInterval
            - "500ms"
            - -poolName
            - "vllm-llama3-8b-instruct"
            - -v
            - "4"
            - --zap-encoder
            - "json"
            - -grpcPort
            - "9002"
            - -grpcHealthPort
            - "9003"
          ports:
            - containerPort: 9002
            - containerPort: 9003
            - name: metrics
              containerPort: 9090
          # Both probes use the gRPC health service exposed on -grpcHealthPort.
          livenessProbe:
            grpc:
              port: 9003
              service: inference-extension
            initialDelaySeconds: 5
            periodSeconds: 10
          readinessProbe:
            grpc:
              port: 9003
              service: inference-extension
            initialDelaySeconds: 5
            periodSeconds: 10
---
# RBAC Role for the epp: read access to InferenceModels/InferencePools and
# Pods/EndpointSlices, plus token/subject-access reviews for auth.
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: pod-read
rules:
  - apiGroups:
      - "inference.networking.x-k8s.io"
    resources:
      - "inferencemodels"
    verbs:
      - "get"
      - "watch"
      - "list"
  - apiGroups:
      - ""
    resources:
      - "pods"
    verbs:
      - "get"
      - "watch"
      - "list"
  - apiGroups:
      - "inference.networking.x-k8s.io"
    resources:
      - "inferencepools"
    verbs:
      - "get"
      - "watch"
      - "list"
  - apiGroups:
      - "discovery.k8s.io"
    resources:
      - "endpointslices"
    verbs:
      - "get"
      - "watch"
      - "list"
  - apiGroups:
      - "authentication.k8s.io"
    resources:
      - "tokenreviews"
    verbs:
      - "create"
  - apiGroups:
      - "authorization.k8s.io"
    resources:
      - "subjectaccessreviews"
    verbs:
      - "create"
---
# Binds the pod-read Role to the namespace's default ServiceAccount
# (which the epp Deployment runs as — no serviceAccountName is set there).
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: pod-read-binding
subjects:
  - kind: ServiceAccount
    name: default
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: pod-read
79 changes: 11 additions & 68 deletions deploy/components/vllm-sim/deployments.yaml
Original file line number Diff line number Diff line change
@@ -1,86 +1,29 @@
# vLLM simulator instance on port 30801 (removed by this PR; reconstructed
# here from the flattened diff for readability of the pre-change state).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-30801
  labels:
    app: vllm-30801
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-30801
  template:
    metadata:
      labels:
        app: vllm-30801
        ai-aware-router-pod: "true"
      annotations:
        # Quoted: host:port strings should never be left to implicit typing.
        ai-aware-router-address: "127.0.0.1:30801"
    spec:
      containers:
        - name: vllm
          image: vllm-sim/vllm-sim:latest
          args:
            - "--port=30801"
            - "--model=model1"
            - "--lora=lora1,lora2"
          ports:
            - containerPort: 30801
---
# vLLM simulator instance on port 30802 (removed by this PR; reconstructed
# here from the flattened diff for readability of the pre-change state).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-30802
  labels:
    app: vllm-30802
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-30802
  template:
    metadata:
      labels:
        app: vllm-30802
        ai-aware-router-pod: "true"
      annotations:
        # Quoted: host:port strings should never be left to implicit typing.
        ai-aware-router-address: "127.0.0.1:30802"
    spec:
      containers:
        - name: vllm
          image: vllm-sim/vllm-sim:latest
          args:
            - "--port=30802"
            - "--model=model1"
            - "--lora=lora1,lora2"
          ports:
            - containerPort: 30802
---
# vLLM simulator Deployment (post-merge state). The scraped span interleaved
# old and new diff lines (duplicate name:/app:/image: keys — invalid YAML);
# this is the reconstruction using the 11 added lines the diff header reports.
# Pod labels carry app=vllm-llama3-8b-instruct so the InferencePool selector
# matches these pods.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-sim
  labels:
    app: vllm-llama3-8b-instruct
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-llama3-8b-instruct
  template:
    metadata:
      labels:
        app: vllm-llama3-8b-instruct
        ai-aware-router-pod: "true"
      annotations:
        # NOTE(review): this address still names the old port 30803 while the
        # container now listens on 8000 — confirm whether the PR dropped this
        # annotation or it should read "127.0.0.1:8000".
        ai-aware-router-address: "127.0.0.1:30803"
    spec:
      containers:
        - name: vllm
          image: quay.io/vllm-d/vllm-sim:0.0.1
          imagePullPolicy: IfNotPresent
          args:
            - "--port=8000"
            - "--model=food-review"
            # - "--lora=lora10,lora20,lora30"
            # - "--time-to-first-token=500"
          ports:
            - containerPort: 8000