This repository was archived by the owner on May 15, 2025. It is now read-only.

DO NOT MERGE: Local installation script #25

Closed · wants to merge 3 commits
9 changes: 9 additions & 0 deletions deploy/components/inference-gateway/inferencemodel.yaml
@@ -0,0 +1,9 @@
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: food-review
spec:
  modelName: food-review
  criticality: Critical
  poolRef:
    name: vllm-llama3-8b-instruct
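For reference: this InferenceModel publishes "food-review" as the client-facing model name and routes it, at Critical priority, to the vllm-llama3-8b-instruct pool. Clients select it via the "model" field of the OpenAI-style completions API. A minimal request sketch, assuming the gateway address has already been exported as GATEWAY_IP (a placeholder, not defined in this PR):

# GATEWAY_IP is assumed to hold the gateway's external address
curl -i "${GATEWAY_IP}:80/v1/completions" \
  -H 'Content-Type: application/json' \
  -d '{"model": "food-review", "prompt": "hi", "max_tokens": 10}'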
126 changes: 126 additions & 0 deletions deploy/components/inference-gateway/inferencepool.yaml
@@ -0,0 +1,126 @@
# Note: If you change this file, please also change the file used for e2e tests!
#
# https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/test/testdata/inferencepool-e2e.yaml
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
  labels:
  name: vllm-llama3-8b-instruct
spec:
  targetPortNumber: 8000
  selector:
    app: vllm-llama3-8b-instruct
  extensionRef:
    name: vllm-llama3-8b-instruct-epp
---
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama3-8b-instruct-epp
  namespace: default
spec:
  selector:
    app: vllm-llama3-8b-instruct-epp
  ports:
  - protocol: TCP
    port: 9002
    targetPort: 9002
    appProtocol: http2
  type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama3-8b-instruct-epp
  namespace: default
  labels:
    app: vllm-llama3-8b-instruct-epp
spec:
  replicas: 1
  selector:
    matchLabels:
      app: vllm-llama3-8b-instruct-epp
  template:
    metadata:
      labels:
        app: vllm-llama3-8b-instruct-epp
    spec:
      # Conservatively, this timeout should mirror the longest grace period of the pods within the pool
      terminationGracePeriodSeconds: 130
      containers:
      - name: epp
        # image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
        image: gateway-api-inference-extension/epp:demo
        imagePullPolicy: IfNotPresent
        args:
        - -refreshMetricsInterval
        - "500ms"
        - -poolName
        - "vllm-llama3-8b-instruct"
        - -v
        - "4"
        - --zap-encoder
        - "json"
        - -grpcPort
        - "9002"
        - -grpcHealthPort
        - "9003"
        ports:
        - containerPort: 9002
        - containerPort: 9003
        - name: metrics
          containerPort: 9090
        livenessProbe:
          grpc:
            port: 9003
            service: inference-extension
          initialDelaySeconds: 5
          periodSeconds: 10
        readinessProbe:
          grpc:
            port: 9003
            service: inference-extension
          initialDelaySeconds: 5
          periodSeconds: 10
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: pod-read
rules:
- apiGroups: ["inference.networking.x-k8s.io"]
  resources: ["inferencemodels"]
  verbs: ["get", "watch", "list"]
- apiGroups: [""]
  resources: ["pods"]
  verbs: ["get", "watch", "list"]
- apiGroups: ["inference.networking.x-k8s.io"]
  resources: ["inferencepools"]
  verbs: ["get", "watch", "list"]
- apiGroups: ["discovery.k8s.io"]
  resources: ["endpointslices"]
  verbs: ["get", "watch", "list"]
- apiGroups:
  - authentication.k8s.io
  resources:
  - tokenreviews
  verbs:
  - create
- apiGroups:
  - authorization.k8s.io
  resources:
  - subjectaccessreviews
  verbs:
  - create
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: pod-read-binding
subjects:
- kind: ServiceAccount
  name: default
  namespace: default
roleRef:
  kind: ClusterRole
  name: pod-read
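The EPP deployment above exposes the extension's gRPC endpoint on port 9002 and a gRPC health service named "inference-extension" on port 9003, which both probes target. A quick way to exercise that health service from a workstation, assuming grpc_health_probe is installed locally (it is not part of this PR):

# Forward the health port and query the same gRPC service the probes use
kubectl port-forward deployment/vllm-llama3-8b-instruct-epp 9003:9003 &
grpc_health_probe -addr localhost:9003 -service inference-extension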
79 changes: 11 additions & 68 deletions deploy/components/vllm-sim/deployments.yaml
@@ -1,86 +1,29 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm-30801
-  labels:
-    app: vllm-30801
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vllm-30801
-  template:
-    metadata:
-      labels:
-        app: vllm-30801
-        ai-aware-router-pod: "true"
-      annotations:
-        ai-aware-router-address: 127.0.0.1:30801
-    spec:
-      containers:
-      - name: vllm
-        image: vllm-sim/vllm-sim:latest
-        args:
-        - "--port=30801"
-        - "--model=model1"
-        - "--lora=lora1,lora2"
-        ports:
-        - containerPort: 30801
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm-30802
-  labels:
-    app: vllm-30802
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vllm-30802
-  template:
-    metadata:
-      labels:
-        app: vllm-30802
-        ai-aware-router-pod: "true"
-      annotations:
-        ai-aware-router-address: 127.0.0.1:30802
-    spec:
-      containers:
-      - name: vllm
-        image: vllm-sim/vllm-sim:latest
-        args:
-        - "--port=30802"
-        - "--model=model1"
-        - "--lora=lora1,lora2"
-        ports:
-        - containerPort: 30802
----
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm-30803
+  name: vllm-sim
   labels:
-    app: vllm-30803
+    app: vllm-llama3-8b-instruct
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: vllm-30803
+      app: vllm-llama3-8b-instruct
   template:
     metadata:
       labels:
-        app: vllm-30803
+        app: vllm-llama3-8b-instruct
         ai-aware-router-pod: "true"
       annotations:
-        ai-aware-router-address: 127.0.0.1:30803
     spec:
       containers:
       - name: vllm
-        image: vllm-sim/vllm-sim:latest
+        image: quay.io/vllm-d/vllm-sim:0.0.1
+        imagePullPolicy: IfNotPresent
         args:
-        - "--port=30803"
-        - "--model=model2"
-        - "--lora=lora3"
+        - "--port=8000"
+        - "--model=food-review"
+        # - "--lora=lora10,lora20,lora30"
+        # - "--time-to-first-token=500"
         ports:
-        - containerPort: 30803
+        - containerPort: 8000
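With this change the simulator listens on the pool's targetPortNumber (8000) and serves the single model "food-review". To sanity-check the pod directly, bypassing the gateway, a hedged sketch using port-forwarding:

# Forward the simulator port and list the models it reports
kubectl port-forward deployment/vllm-sim 8000:8000 &
curl -s "localhost:8000/v1/models" -H 'Content-Type: application/json'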
150 changes: 150 additions & 0 deletions scripts/setup_script_local.sh
@@ -0,0 +1,150 @@
#!/bin/bash
set -euo pipefail

# ----------------------------------------
# Variables
# ----------------------------------------
CLUSTER_NAME="inference-router"
KIND_CONFIG="kind-config.yaml"
#VLLM_IMAGE="public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.8.0"
#KGATEWAY_IMAGE="cr.kgateway.dev/kgateway-dev/envoy-wrapper:v2.0.0"
METALLB_VERSION="v0.14.9"
INFERENCE_VERSION="v0.3.0"
KGTW_VERSION="v2.0.0"
SRC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# ----------------------------------------
# Step 1: Create Kind Cluster
# ----------------------------------------
echo "🛠️ Creating Kind cluster..."
kind delete cluster --name "$CLUSTER_NAME"
kind create cluster --name "$CLUSTER_NAME" --config "$KIND_CONFIG"

echo "📦 Loading vLLM SIMULATOR image..."
tput bold
echo "Build vLLM-sim image and load to kind cluster:"
tput sgr0
echo ""
cd $SRC_DIR/../vllm-sim
make build-vllm-sim-image
kind load docker-image vllm-sim/vllm-sim:0.0.2 --name "$CLUSTER_NAME"

# ----------------------------------------
# Step 2: Install MetalLB
# ----------------------------------------
echo "🌐 Installing MetalLB..."
kubectl apply -f https://raw.githubusercontent.com/metallb/metallb/${METALLB_VERSION}/config/manifests/metallb-native.yaml
echo "⏳ Waiting for MetalLB pods to be ready..."
kubectl wait --namespace metallb-system \
  --for=condition=Ready pod \
  --selector=component=controller \
  --timeout=120s

kubectl wait --namespace metallb-system \
  --for=condition=Ready pod \
  --selector=component=speaker \
  --timeout=120s

echo "⚙️ Applying MetalLB config..."
kubectl apply -f metalb-config.yaml

# ----------------------------------------
# Step 3: vLLM
# ----------------------------------------
tput bold
echo "deploy vllm-sim model servers:"
tput sgr0
echo ""
#kubectl apply -f $SRC_DIR/manifests/vllm-sim.yaml
kubectl apply -f $SRC_DIR/vllm-sim.yaml



# ----------------------------------------
# Step 4: Deploy Inference API Components
# ----------------------------------------
# TODO - use our yamls
echo "📡 Installing Inference API..."
kubectl apply -f "https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${INFERENCE_VERSION}/manifests.yaml"

#kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml
#kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool-resources.yaml

kubectl apply -f $SRC_DIR/inferencemodel-local.yaml

# Build and load the extension (EPP) image
cd $SRC_DIR/../gateway-api-inference-extension_maya
IMAGE_REGISTRY="gateway-api-inference-extension" GIT_TAG="demo" make image-load
kind load docker-image gateway-api-inference-extension/epp:demo --name "$CLUSTER_NAME"
kubectl delete -f $SRC_DIR/inferencepool-resources-local.yaml
kubectl apply -f $SRC_DIR/inferencepool-resources-local.yaml

# ----------------------------------------
# Step 5: Install Kgateway
# ----------------------------------------
echo "🚪 Installing Kgateway..."
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.2.0/standard-install.yaml
helm upgrade -i --create-namespace --namespace kgateway-system --version "$KGTW_VERSION" kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds
helm upgrade -i --namespace kgateway-system --version "$KGTW_VERSION" kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway --set inferenceExtension.enabled=true

# ----------------------------------------
# Step 6: Apply Gateway and Routes
# ----------------------------------------
echo "📨 Applying Gateway and HTTPRoute..."
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/httproute.yaml

echo "📨 Wait Gatewayto be ready..."
# sleep 30 # Give time for pod to create
# kubectl wait --for=condition=Ready pod --selector=app.kubernetes.io/instance=inference-gateway --timeout=240s
# Wait up to 2 minutes for the Gateway to get an IP
for i in {1..24}; do
  IP=$(kubectl get gateway inference-gateway -o jsonpath='{.status.addresses[0].value}' 2>/dev/null || echo "")
  if [[ -n "$IP" ]]; then
    echo "✅ Gateway IP assigned: $IP"
    break
  fi
  echo "⏳ Still waiting for Gateway IP..."
  sleep 5
done

if [[ -z "$IP" ]]; then
  echo "❌ Timed out waiting for Gateway IP."
  exit 1
fi

# ----------------------------------------
# Step 7: Run Inference Request
# ----------------------------------------
echo "🔍 Fetching Gateway IP..."
sleep 5 # Give time for IP allocation
IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
PORT=80

echo "📨 Sending test inference request to $IP:$PORT..."
curl -i "${IP}:${PORT}/v1/completions" \
-H 'Content-Type: application/json' \
-d '{
"model": "Qwen/Qwen2.5-1.5B-Instruct",
"prompt": "hi",
"max_tokens": 10,
"temperature": 0
}'


curl -si -X GET "${IP}:${PORT}/v1/models" -H 'Content-Type: application/json'

curl -i -X GET "172.18.255.1:80/v1/models" -H 'Content-Type: application/json'

curl -i "172.18.255.1:80/v1/completions" -H 'Content-Type: application/json' -d '{ "model": "food-review", "prompt": "hi", "max_tokens": 10, "temperature": 0 }'

curl -i "localhost:8888/v1/completions" -H 'Content-Type: application/json' -d '{ "model": "food-review", "prompt": "hi", "max_tokens": 10, "temperature": 0 }'

curl -i "172.18.255.1:80/v1/completions" \
-H 'Content-Type: application/json' \
-d '{
"model": "food-review",
"prompt": "hi",
"max_tokens": 10,
"temperature": 0
}'
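Note that the script is destructive on re-run: it deletes and recreates the kind cluster named inference-router every time it executes. To tear the environment down manually, the same command the script uses works on its own:

kind delete cluster --name inference-router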