diff --git a/deploy/components/inference-gateway/inferencemodel.yaml b/deploy/components/inference-gateway/inferencemodel.yaml
new file mode 100644
index 000000000..f729407e9
--- /dev/null
+++ b/deploy/components/inference-gateway/inferencemodel.yaml
@@ -0,0 +1,9 @@
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: food-review
+spec:
+  modelName: food-review
+  criticality: Critical
+  poolRef:
+    name: vllm-llama3-8b-instruct
diff --git a/deploy/components/inference-gateway/inferencepool.yaml b/deploy/components/inference-gateway/inferencepool.yaml
new file mode 100644
index 000000000..e2fceaf1d
--- /dev/null
+++ b/deploy/components/inference-gateway/inferencepool.yaml
@@ -0,0 +1,126 @@
+# Note: If you change this file, please also change the file used for e2e tests!
+#
+# https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/test/testdata/inferencepool-e2e.yaml
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferencePool
+metadata:
+  labels:
+  name: vllm-llama3-8b-instruct
+spec:
+  targetPortNumber: 8000
+  selector:
+    app: vllm-llama3-8b-instruct
+  extensionRef:
+    name: vllm-llama3-8b-instruct-epp
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-llama3-8b-instruct-epp
+  namespace: default
+spec:
+  selector:
+    app: vllm-llama3-8b-instruct-epp
+  ports:
+    - protocol: TCP
+      port: 9002
+      targetPort: 9002
+      appProtocol: http2
+  type: ClusterIP
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-llama3-8b-instruct-epp
+  namespace: default
+  labels:
+    app: vllm-llama3-8b-instruct-epp
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-llama3-8b-instruct-epp
+  template:
+    metadata:
+      labels:
+        app: vllm-llama3-8b-instruct-epp
+    spec:
+      # Conservatively, this timeout should mirror the longest grace period of the pods within the pool
+      terminationGracePeriodSeconds: 130
+      containers:
+      - name: epp
+#        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
+        image: gateway-api-inference-extension/epp:demo
+        imagePullPolicy: IfNotPresent
+        args:
+        - -refreshMetricsInterval
+        - "500ms"
+        - -poolName
+        - "vllm-llama3-8b-instruct"
+        - -v
+        - "4"
+        - --zap-encoder
+        - "json"
+        - -grpcPort
+        - "9002"
+        - -grpcHealthPort
+        - "9003"
+        ports:
+        - containerPort: 9002
+        - containerPort: 9003
+        - name: metrics
+          containerPort: 9090
+        livenessProbe:
+          grpc:
+            port: 9003
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        readinessProbe:
+          grpc:
+            port: 9003
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: pod-read
+rules:
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["inferencemodels"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["inferencepools"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["discovery.k8s.io"]
+  resources: ["endpointslices"]
+  verbs: ["get", "watch", "list"]
+- apiGroups:
+  - authentication.k8s.io
+  resources:
+  - tokenreviews
+  verbs:
+  - create
+- apiGroups:
+  - authorization.k8s.io
+  resources:
+  - subjectaccessreviews
+  verbs:
+  - create
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: pod-read-binding
+subjects:
+- kind: ServiceAccount
+  name: default
+  namespace: default
+roleRef:
+  kind: ClusterRole
+  name: pod-read
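Once the component manifests above are applied, the new objects and the endpoint-picker (EPP) deployment can be verified before wiring up the gateway. A minimal verification sketch, assuming kubectl points at the target cluster and grpcurl is installed locally (resource and service names are taken from the manifests above):

# Confirm the CRD objects created by these manifests exist
kubectl get inferencepools,inferencemodels
# Wait for the endpoint-picker deployment to become available
kubectl rollout status deployment/vllm-llama3-8b-instruct-epp -n default --timeout=120s
# Optionally probe the same gRPC health service the liveness/readiness probes use
kubectl port-forward -n default deploy/vllm-llama3-8b-instruct-epp 9003:9003 &
grpcurl -plaintext -d '{"service": "inference-extension"}' localhost:9003 grpc.health.v1.Health/Check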
diff --git a/deploy/components/vllm-sim/deployments.yaml b/deploy/components/vllm-sim/deployments.yaml
index e7c981cfa..16a299f6e 100644
--- a/deploy/components/vllm-sim/deployments.yaml
+++ b/deploy/components/vllm-sim/deployments.yaml
@@ -1,86 +1,29 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm-30801
-  labels:
-    app: vllm-30801
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vllm-30801
-  template:
-    metadata:
-      labels:
-        app: vllm-30801
-        ai-aware-router-pod: "true"
-      annotations:
-        ai-aware-router-address: 127.0.0.1:30801
-    spec:
-      containers:
-      - name: vllm
-        image: vllm-sim/vllm-sim:latest
-        args:
-        - "--port=30801"
-        - "--model=model1"
-        - "--lora=lora1,lora2"
-        ports:
-        - containerPort: 30801
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm-30802
-  labels:
-    app: vllm-30802
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vllm-30802
-  template:
-    metadata:
-      labels:
-        app: vllm-30802
-        ai-aware-router-pod: "true"
-      annotations:
-        ai-aware-router-address: 127.0.0.1:30802
-    spec:
-      containers:
-      - name: vllm
-        image: vllm-sim/vllm-sim:latest
-        args:
-        - "--port=30802"
-        - "--model=model1"
-        - "--lora=lora1,lora2"
-        ports:
-        - containerPort: 30802
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm-30803
+  name: vllm-sim
   labels:
-    app: vllm-30803
+    app: vllm-llama3-8b-instruct
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: vllm-30803
+      app: vllm-llama3-8b-instruct
   template:
     metadata:
       labels:
-        app: vllm-30803
+        app: vllm-llama3-8b-instruct
         ai-aware-router-pod: "true"
-      annotations:
-        ai-aware-router-address: 127.0.0.1:30803
     spec:
       containers:
       - name: vllm
-        image: vllm-sim/vllm-sim:latest
+        image: quay.io/vllm-d/vllm-sim:0.0.1
+        imagePullPolicy: IfNotPresent
         args:
-        - "--port=30803"
-        - "--model=model2"
-        - "--lora=lora3"
+        - "--port=8000"
+        - "--model=food-review"
+        # - "--lora=lora10,lora20,lora30"
+        # - "--time-to-first-token=500"
         ports:
-        - containerPort: 30803
+        - containerPort: 8000
diff --git a/scripts/setup_script_local.sh b/scripts/setup_script_local.sh
new file mode 100644
index 000000000..b689e9a1b
--- /dev/null
+++ b/scripts/setup_script_local.sh
@@ -0,0 +1,150 @@
+#!/bin/bash
+set -euo pipefail
+
+# ----------------------------------------
+# Variables
+# ----------------------------------------
+CLUSTER_NAME="inference-router"
+KIND_CONFIG="kind-config.yaml"
+#VLLM_IMAGE="public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.8.0"
+#KGATEWAY_IMAGE="cr.kgateway.dev/kgateway-dev/envoy-wrapper:v2.0.0"
+METALLB_VERSION="v0.14.9"
+INFERENCE_VERSION="v0.3.0"
+KGTW_VERSION="v2.0.0"
+SRC_DIR="$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)"
+
+# ----------------------------------------
+# Step 1: Create Kind Cluster
+# ----------------------------------------
+echo "🛠️ Creating Kind cluster..."
+kind delete cluster --name "$CLUSTER_NAME"
+kind create cluster --name "$CLUSTER_NAME" --config "$KIND_CONFIG"
+
+echo "📦 Loading vLLM SIMULATOR image..."
+tput bold
+echo "Build vLLM-sim image and load to kind cluster:"
+tput sgr0
+echo ""
+cd $SRC_DIR/../vllm-sim
+make build-vllm-sim-image
+kind load docker-image vllm-sim/vllm-sim:0.0.2 --name "$CLUSTER_NAME"
+
+# ----------------------------------------
+# Step 2: Install MetalLB
+# ----------------------------------------
+echo "🌐 Installing MetalLB..."
+kubectl apply -f https://raw.githubusercontent.com/metallb/metallb/${METALLB_VERSION}/config/manifests/metallb-native.yaml
+echo "⏳ Waiting for MetalLB pods to be ready..."
+kubectl wait --namespace metallb-system \
+  --for=condition=Ready pod \
+  --selector=component=controller \
+  --timeout=120s
+
+kubectl wait --namespace metallb-system \
+  --for=condition=Ready pod \
+  --selector=component=speaker \
+  --timeout=120s
+
+echo "⚙️ Applying MetalLB config..."
+kubectl apply -f metalb-config.yaml
+
+# ----------------------------------------
+# Step 3: vLLM
+# ----------------------------------------
+tput bold
+echo "Deploy vllm-sim model servers:"
+tput sgr0
+echo ""
+#kubectl apply -f $SRC_DIR/manifests/vllm-sim.yaml
+kubectl apply -f $SRC_DIR/vllm-sim.yaml
+
+# ----------------------------------------
+# Step 4: Deploy Inference API Components
+# ----------------------------------------
+# TODO - use our yamls
+echo "📡 Installing Inference API..."
+kubectl apply -f "https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${INFERENCE_VERSION}/manifests.yaml"
+
+#kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml
+#kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool-resources.yaml
+
+kubectl apply -f $SRC_DIR/inferencemodel-local.yaml
+
+# Build and load the extension image
+cd $SRC_DIR/../gateway-api-inference-extension_maya
+IMAGE_REGISTRY="gateway-api-inference-extension" GIT_TAG="demo" make image-load
+kind load docker-image gateway-api-inference-extension/epp:demo --name "$CLUSTER_NAME"
+kubectl delete -f $SRC_DIR/inferencepool-resources-local.yaml
+kubectl apply -f $SRC_DIR/inferencepool-resources-local.yaml
+
+# ----------------------------------------
+# Step 5: Install Kgateway
+# ----------------------------------------
+echo "🚪 Installing Kgateway..."
+kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.2.0/standard-install.yaml
+helm upgrade -i --create-namespace --namespace kgateway-system --version "$KGTW_VERSION" kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds
+helm upgrade -i --namespace kgateway-system --version "$KGTW_VERSION" kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway --set inferenceExtension.enabled=true
+
+# ----------------------------------------
+# Step 6: Apply Gateway and Routes
+# ----------------------------------------
+echo "📨 Applying Gateway and HTTPRoute..."
+kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml
+kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/httproute.yaml
+
+echo "📨 Waiting for Gateway to be ready..."
+# sleep 30 # Give time for pod to create
+# kubectl wait --for=condition=Ready pod --selector=app.kubernetes.io/instance=inference-gateway --timeout=240s
+# Wait up to 2 minutes for the Gateway to get an IP
+for i in {1..24}; do
+  IP=$(kubectl get gateway inference-gateway -o jsonpath='{.status.addresses[0].value}' 2>/dev/null || echo "")
+  if [[ -n "$IP" ]]; then
+    echo "✅ Gateway IP assigned: $IP"
+    break
+  fi
+  echo "⏳ Still waiting for Gateway IP..."
+  sleep 5
+done
+
+if [[ -z "$IP" ]]; then
+  echo "❌ Timed out waiting for Gateway IP."
+  exit 1
+fi
+
+# ----------------------------------------
+# Step 7: Run Inference Request
+# ----------------------------------------
+echo "🔍 Fetching Gateway IP..."
+sleep 5 # Give time for IP allocation
+IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
+PORT=80
+
+echo "📨 Sending test inference request to $IP:$PORT..."
+curl -i "${IP}:${PORT}/v1/completions" \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "model": "Qwen/Qwen2.5-1.5B-Instruct",
+    "prompt": "hi",
+    "max_tokens": 10,
+    "temperature": 0
+  }'
+
+curl -si -X GET "${IP}:${PORT}/v1/models" -H 'Content-Type: application/json'
+
+curl -i -X GET "172.18.255.1:80/v1/models" -H 'Content-Type: application/json'
+
+curl -i "172.18.255.1:80/v1/completions" -H 'Content-Type: application/json' -d '{ "model": "food-review", "prompt": "hi", "max_tokens": 10, "temperature": 0 }'
+
+curl -i "localhost:8888/v1/completions" -H 'Content-Type: application/json' -d '{ "model": "food-review", "prompt": "hi", "max_tokens": 10, "temperature": 0 }'
+
+curl -i "172.18.255.1:80/v1/completions" \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "model": "food-review",
+    "prompt": "hi",
+    "max_tokens": 10,
+    "temperature": 0
+  }'
\ No newline at end of file
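The ad-hoc curl calls at the end of the script exercise the gateway by hand. The same check can be wrapped in a small helper so a run fails loudly when routing is broken; this is only a sketch under the script's own assumptions (a Gateway named inference-gateway in the default namespace, the food-review model, plain HTTP on port 80):

smoke_test() {
  local ip
  ip=$(kubectl get gateway inference-gateway -o jsonpath='{.status.addresses[0].value}')
  if [[ -z "$ip" ]]; then
    echo "Gateway has no address yet" >&2
    return 1
  fi
  # Expect HTTP 200 and a completion body served by the vllm-sim backend
  curl -sS -o /dev/null -w "%{http_code}\n" "${ip}:80/v1/completions" \
    -H 'Content-Type: application/json' \
    -d '{"model": "food-review", "prompt": "hi", "max_tokens": 10, "temperature": 0}'
}

smoke_test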