This repository was archived by the owner on May 15, 2025. It is now read-only.

DO NOT MERGE: Local installation script #25

Closed · wants to merge 3 commits
9 changes: 9 additions & 0 deletions deploy/components/inference-gateway/inferencemodel.yaml
@@ -0,0 +1,9 @@
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: food-review
spec:
  modelName: food-review
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
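For reference: this InferenceModel publishes "food-review" as the client-facing model name and routes it, at Critical priority, to the vllm-llama3-8b-instruct pool. Clients select it via the "model" field of the OpenAI-style completions API. A minimal request sketch, assuming the gateway address has already been exported as GATEWAY_IP (a placeholder, not defined in this PR):

# GATEWAY_IP is assumed to hold the gateway's external address
curl -i "${GATEWAY_IP}:80/v1/completions" \
  -H 'Content-Type: application/json' \
  -d '{"model": "food-review", "prompt": "hi", "max_tokens": 10}'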
126 changes: 126 additions & 0 deletions deploy/components/inference-gateway/inferencepool.yaml
@@ -0,0 +1,126 @@
# Note: If you change this file, please also change the file used for e2e tests!
#
# https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/test/testdata/inferencepool-e2e.yaml
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
  labels:
  name: vllm-llama3-8b-instruct
spec:
  targetPortNumber: 8000
  selector:
    app: vllm-llama3-8b-instruct
  extensionRef:
    name: vllm-llama3-8b-instruct-epp
---
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama3-8b-instruct-epp
  namespace: default
spec:
  selector:
    app: vllm-llama3-8b-instruct-epp
  ports:
  - protocol: TCP
    port: 9002
    targetPort: 9002
    appProtocol: http2
  type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama3-8b-instruct-epp
  namespace: default
  labels:
    app: vllm-llama3-8b-instruct-epp
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-llama3-8b-instruct-epp
  template:
    metadata:
      labels:
        app: vllm-llama3-8b-instruct-epp
    spec:
      # Conservatively, this timeout should mirror the longest grace period of the pods within the pool
      terminationGracePeriodSeconds: 130
      containers:
      - name: epp
        # image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
        image: gateway-api-inference-extension/epp:demo
        imagePullPolicy: IfNotPresent
        args:
        - -refreshMetricsInterval
        - "500ms"
        - -poolName
        - "vllm-llama3-8b-instruct"
        - -v
        - "4"
        - --zap-encoder
        - "json"
        - -grpcPort
        - "9002"
        - -grpcHealthPort
        - "9003"
        ports:
        - containerPort: 9002
        - containerPort: 9003
        - name: metrics
          containerPort: 9090
        livenessProbe:
          grpc:
            port: 9003
            service: inference-extension
          initialDelaySeconds: 5
          periodSeconds: 10
        readinessProbe:
          grpc:
            port: 9003
            service: inference-extension
          initialDelaySeconds: 5
          periodSeconds: 10
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: pod-read
rules:
- apiGroups: ["inference.networking.x-k8s.io"]
  resources: ["inferencemodels"]
  verbs: ["get", "watch", "list"]
- apiGroups: [""]
  resources: ["pods"]
  verbs: ["get", "watch", "list"]
- apiGroups: ["inference.networking.x-k8s.io"]
  resources: ["inferencepools"]
  verbs: ["get", "watch", "list"]
- apiGroups: ["discovery.k8s.io"]
  resources: ["endpointslices"]
  verbs: ["get", "watch", "list"]
- apiGroups:
  - authentication.k8s.io
  resources:
  - tokenreviews
  verbs:
  - create
- apiGroups:
  - authorization.k8s.io
  resources:
  - subjectaccessreviews
  verbs:
  - create
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: pod-read-binding
subjects:
- kind: ServiceAccount
  name: default
  namespace: default
roleRef:
  kind: ClusterRole
  name: pod-read
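The EPP deployment above exposes the extension's gRPC endpoint on port 9002 and a gRPC health service named "inference-extension" on port 9003, which both probes target. A quick way to exercise that health service from a workstation, assuming grpc_health_probe is installed locally (it is not part of this PR):

# Forward the health port and query the same gRPC service the probes use
kubectl port-forward deployment/vllm-llama3-8b-instruct-epp 9003:9003 &
grpc_health_probe -addr localhost:9003 -service inference-extension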
79 changes: 11 additions & 68 deletions deploy/components/vllm-sim/deployments.yaml
@@ -1,86 +1,29 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm-30801
-  labels:
-    app: vllm-30801
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vllm-30801
-  template:
-    metadata:
-      labels:
-        app: vllm-30801
-        ai-aware-router-pod: "true"
-      annotations:
-        ai-aware-router-address: 127.0.0.1:30801
-    spec:
-      containers:
-      - name: vllm
-        image: vllm-sim/vllm-sim:latest
-        args:
-        - "--port=30801"
-        - "--model=model1"
-        - "--lora=lora1,lora2"
-        ports:
-        - containerPort: 30801
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm-30802
-  labels:
-    app: vllm-30802
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vllm-30802
-  template:
-    metadata:
-      labels:
-        app: vllm-30802
-        ai-aware-router-pod: "true"
-      annotations:
-        ai-aware-router-address: 127.0.0.1:30802
-    spec:
-      containers:
-      - name: vllm
-        image: vllm-sim/vllm-sim:latest
-        args:
-        - "--port=30802"
-        - "--model=model1"
-        - "--lora=lora1,lora2"
-        ports:
-        - containerPort: 30802
----
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm-30803
+  name: vllm-sim
   labels:
-    app: vllm-30803
+    app: vllm-llama3-8b-instruct
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: vllm-30803
+      app: vllm-llama3-8b-instruct
   template:
     metadata:
       labels:
-        app: vllm-30803
+        app: vllm-llama3-8b-instruct
         ai-aware-router-pod: "true"
       annotations:
-        ai-aware-router-address: 127.0.0.1:30803
     spec:
       containers:
       - name: vllm
-        image: vllm-sim/vllm-sim:latest
+        image: quay.io/vllm-d/vllm-sim:0.0.1
+        imagePullPolicy: IfNotPresent
         args:
-        - "--port=30803"
-        - "--model=model2"
-        - "--lora=lora3"
+        - "--port=8000"
+        - "--model=food-review"
+        # - "--lora=lora10,lora20,lora30"
+        # - "--time-to-first-token=500"
         ports:
-        - containerPort: 30803
+        - containerPort: 8000
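With this change the simulator listens on the pool's targetPortNumber (8000) and serves the single model "food-review". To sanity-check the pod directly, bypassing the gateway, a hedged sketch using port-forwarding:

# Forward the simulator port and list the models it reports
kubectl port-forward deployment/vllm-sim 8000:8000 &
curl -s "localhost:8000/v1/models" -H 'Content-Type: application/json'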
150 changes: 150 additions & 0 deletions scripts/setup_script_local.sh
@@ -0,0 +1,150 @@
#!/bin/bash
set -euo pipefail

# ----------------------------------------
# Variables
# ----------------------------------------
CLUSTER_NAME="inference-router"
KIND_CONFIG="kind-config.yaml"
#VLLM_IMAGE="public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.8.0"
#KGATEWAY_IMAGE="cr.kgateway.dev/kgateway-dev/envoy-wrapper:v2.0.0"
METALLB_VERSION="v0.14.9"
INFERENCE_VERSION="v0.3.0"
KGTW_VERSION="v2.0.0"
SRC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# ----------------------------------------
# Step 1: Create Kind Cluster
# ----------------------------------------
echo "🛠️ Creating Kind cluster..."
kind delete cluster --name "$CLUSTER_NAME"
kind create cluster --name "$CLUSTER_NAME" --config "$KIND_CONFIG"

echo "📦 Loading vLLM SIMULATOR image..."
tput bold
echo "Build vLLM-sim image and load to kind cluster:"
tput sgr0
echo ""
cd $SRC_DIR/../vllm-sim
make build-vllm-sim-image
kind load docker-image vllm-sim/vllm-sim:0.0.2 --name "$CLUSTER_NAME"

# ----------------------------------------
# Step 2: Install MetalLB
# ----------------------------------------
echo "🌐 Installing MetalLB..."
kubectl apply -f https://raw.githubusercontent.com/metallb/metallb/${METALLB_VERSION}/config/manifests/metallb-native.yaml
echo "⏳ Waiting for MetalLB pods to be ready..."
kubectl wait --namespace metallb-system \
  --for=condition=Ready pod \
  --selector=component=controller \
  --timeout=120s

kubectl wait --namespace metallb-system \
  --for=condition=Ready pod \
  --selector=component=speaker \
  --timeout=120s

echo "⚙️ Applying MetalLB config..."
kubectl apply -f metalb-config.yaml

# ----------------------------------------
# Step 3: vLLM
# ----------------------------------------
tput bold
echo "deploy vllm-sim model servers:"
tput sgr0
echo ""
#kubectl apply -f $SRC_DIR/manifests/vllm-sim.yaml
kubectl apply -f $SRC_DIR/vllm-sim.yaml



# ----------------------------------------
# Step 4: Deploy Inference API Components
# ----------------------------------------
# TODO - use our yamls
echo "📡 Installing Inference API..."
kubectl apply -f "https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${INFERENCE_VERSION}/manifests.yaml"

#kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml
#kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool-resources.yaml

kubectl apply -f $SRC_DIR/inferencemodel-local.yaml

# Build and load the extension (EPP) image
cd $SRC_DIR/../gateway-api-inference-extension_maya
IMAGE_REGISTRY="gateway-api-inference-extension" GIT_TAG="demo" make image-load
kind load docker-image gateway-api-inference-extension/epp:demo --name "$CLUSTER_NAME"
kubectl delete -f $SRC_DIR/inferencepool-resources-local.yaml
kubectl apply -f $SRC_DIR/inferencepool-resources-local.yaml

# ----------------------------------------
# Step 5: Install Kgateway
# ----------------------------------------
echo "🚪 Installing Kgateway..."
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.2.0/standard-install.yaml
helm upgrade -i --create-namespace --namespace kgateway-system --version "$KGTW_VERSION" kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds
helm upgrade -i --namespace kgateway-system --version "$KGTW_VERSION" kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway --set inferenceExtension.enabled=true

# ----------------------------------------
# Step 6: Apply Gateway and Routes
# ----------------------------------------
echo "📨 Applying Gateway and HTTPRoute..."
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/httproute.yaml

echo "📨 Wait Gatewayto be ready..."
# sleep 30 # Give time for pod to create
# kubectl wait --for=condition=Ready pod --selector=app.kubernetes.io/instance=inference-gateway --timeout=240s
# Wait up to 2 minutes for the Gateway to get an IP
for i in {1..24}; do
  IP=$(kubectl get gateway inference-gateway -o jsonpath='{.status.addresses[0].value}' 2>/dev/null || echo "")
  if [[ -n "$IP" ]]; then
    echo "✅ Gateway IP assigned: $IP"
    break
  fi
  echo "⏳ Still waiting for Gateway IP..."
  sleep 5
done

if [[ -z "$IP" ]]; then
  echo "❌ Timed out waiting for Gateway IP."
  exit 1
fi

# ----------------------------------------
# Step 7: Run Inference Request
# ----------------------------------------
echo "🔍 Fetching Gateway IP..."
sleep 5 # Give time for IP allocation
IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
PORT=80

echo "📨 Sending test inference request to $IP:$PORT..."
curl -i "${IP}:${PORT}/v1/completions" \
-H 'Content-Type: application/json' \
-d '{
"model": "Qwen/Qwen2.5-1.5B-Instruct",
"prompt": "hi",
"max_tokens": 10,
"temperature": 0
}'


curl -si -X GET "${IP}:${PORT}/v1/models" -H 'Content-Type: application/json'

curl -i -X GET "172.18.255.1:80/v1/models" -H 'Content-Type: application/json'

curl -i "172.18.255.1:80/v1/completions" -H 'Content-Type: application/json' -d '{ "model": "food-review", "prompt": "hi", "max_tokens": 10, "temperature": 0 }'

curl -i "localhost:8888/v1/completions" -H 'Content-Type: application/json' -d '{ "model": "food-review", "prompt": "hi", "max_tokens": 10, "temperature": 0 }'

curl -i "172.18.255.1:80/v1/completions" \
-H 'Content-Type: application/json' \
-d '{
"model": "food-review",
"prompt": "hi",
"max_tokens": 10,
"temperature": 0
}'
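Note that the script is destructive on re-run: it deletes and recreates the kind cluster named inference-router every time it executes. To tear the environment down manually, the same command the script uses works on its own:

kind delete cluster --name inference-router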