diff --git a/deploy/common/patch-service.yaml b/deploy/components/extra/openshift-router/common/patch-service.yaml
similarity index 100%
rename from deploy/common/patch-service.yaml
rename to deploy/components/extra/openshift-router/common/patch-service.yaml
diff --git a/deploy/common/patch-statefulset.yaml b/deploy/components/extra/openshift-router/common/patch-statefulset.yaml
similarity index 100%
rename from deploy/common/patch-statefulset.yaml
rename to deploy/components/extra/openshift-router/common/patch-statefulset.yaml
diff --git a/deploy/common/service.yaml b/deploy/components/extra/openshift-router/common/service.yaml
similarity index 100%
rename from deploy/common/service.yaml
rename to deploy/components/extra/openshift-router/common/service.yaml
diff --git a/deploy/common/statefulset.yaml b/deploy/components/extra/openshift-router/common/statefulset.yaml
similarity index 100%
rename from deploy/common/statefulset.yaml
rename to deploy/components/extra/openshift-router/common/statefulset.yaml
diff --git a/deploy/kustomization.yaml b/deploy/components/extra/openshift-router/kustomization.yaml
similarity index 100%
rename from deploy/kustomization.yaml
rename to deploy/components/extra/openshift-router/kustomization.yaml
diff --git a/deploy/openshift/patch-route.yaml b/deploy/components/extra/openshift-router/openshift/patch-route.yaml
similarity index 100%
rename from deploy/openshift/patch-route.yaml
rename to deploy/components/extra/openshift-router/openshift/patch-route.yaml
diff --git a/deploy/openshift/route.yaml b/deploy/components/extra/openshift-router/openshift/route.yaml
similarity index 100%
rename from deploy/openshift/route.yaml
rename to deploy/components/extra/openshift-router/openshift/route.yaml
diff --git a/deploy/rbac/exec-rbac-role.yaml b/deploy/components/extra/openshift-router/rbac/exec-rbac-role.yaml
similarity index 100%
rename from deploy/rbac/exec-rbac-role.yaml
rename to deploy/components/extra/openshift-router/rbac/exec-rbac-role.yaml
diff --git a/deploy/rbac/exec-rbac-rolebinding.yaml b/deploy/components/extra/openshift-router/rbac/exec-rbac-rolebinding.yaml
similarity index 100%
rename from deploy/rbac/exec-rbac-rolebinding.yaml
rename to deploy/components/extra/openshift-router/rbac/exec-rbac-rolebinding.yaml
diff --git a/deploy/rbac/patch-rbac-role.yaml b/deploy/components/extra/openshift-router/rbac/patch-rbac-role.yaml
similarity index 100%
rename from deploy/rbac/patch-rbac-role.yaml
rename to deploy/components/extra/openshift-router/rbac/patch-rbac-role.yaml
diff --git a/deploy/rbac/patch-rbac-rolebinding.yaml b/deploy/components/extra/openshift-router/rbac/patch-rbac-rolebinding.yaml
similarity index 100%
rename from deploy/rbac/patch-rbac-rolebinding.yaml
rename to deploy/components/extra/openshift-router/rbac/patch-rbac-rolebinding.yaml
diff --git a/deploy/components/inference-gateway/configmaps.yaml b/deploy/components/inference-gateway/configmaps.yaml
new file mode 100644
index 000000000..73b3f022e
--- /dev/null
+++ b/deploy/components/inference-gateway/configmaps.yaml
@@ -0,0 +1,18 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: endpoint-picker-config
+data:
+  config.yaml: |
+    pod_selector:
+      ai-aware-router-pod: true
+    routing_filters:
+    routing_scorers:
+      - name: session-affinity
+        weight: 60
+      - name: route-by-active-lora
+        weight: 50
+    routing_header: x-ai-aware-router-routing
+    session_id_header: x-ai-aware-router-session-id
+    listening_port: 9080
+    inference_port: 8000
diff --git a/deploy/components/inference-gateway/deployments.yaml b/deploy/components/inference-gateway/deployments.yaml
new file mode 100644
index 000000000..0ec227967
--- /dev/null
+++ b/deploy/components/inference-gateway/deployments.yaml
@@ -0,0 +1,32 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: endpoint-picker
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: endpoint-picker
+  template:
+    metadata:
+      labels:
+        app: endpoint-picker
+    spec:
+      serviceAccountName: endpoint-picker
+      containers:
+      - name: endpoint-picker
+        image: inference-router/router-ext-proc:latest
+        args:
+        - "--config-file"
+        - "/etc/endpoint-picker/config.yaml"
+        ports:
+        - name: grpc
+          containerPort: 9080
+          protocol: TCP
+        volumeMounts:
+        - name: endpoint-picker-config
+          mountPath: /etc/endpoint-picker
+      volumes:
+      - name: endpoint-picker-config
+        configMap:
+          name: endpoint-picker-config
diff --git a/deploy/components/inference-gateway/envoy-filters.yaml b/deploy/components/inference-gateway/envoy-filters.yaml
new file mode 100644
index 000000000..e9a4fec5a
--- /dev/null
+++ b/deploy/components/inference-gateway/envoy-filters.yaml
@@ -0,0 +1,31 @@
+apiVersion: networking.istio.io/v1alpha3
+kind: EnvoyFilter
+metadata:
+  name: endpoint-picker
+spec:
+  configPatches:
+  - applyTo: HTTP_FILTER
+    match:
+      listener:
+        filterChain:
+          filter:
+            name: "envoy.filters.network.http_connection_manager"
+    patch:
+      operation: INSERT_FIRST
+      value:
+        name: envoy.filters.http.ext_proc
+        typed_config:
+          "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor
+          failure_mode_allow: false
+          allow_mode_override: true
+          processing_mode:
+            request_header_mode: "SEND"
+            response_header_mode: "SEND"
+            request_body_mode: "BUFFERED"
+            response_body_mode: "BUFFERED"
+            request_trailer_mode: "SEND"
+            response_trailer_mode: "SKIP"
+          grpc_service:
+            envoy_grpc:
+              cluster_name: outbound|9080||endpoint-picker.REPLACE_NAMESPACE.svc.cluster.local
+            timeout: 5s
diff --git a/deploy/components/inference-gateway/gateways.yaml b/deploy/components/inference-gateway/gateways.yaml
new file mode 100644
index 000000000..2a83b95ee
--- /dev/null
+++ b/deploy/components/inference-gateway/gateways.yaml
@@ -0,0 +1,14 @@
+apiVersion: gateway.networking.k8s.io/v1
+kind: Gateway
+metadata:
+  name: inference-gateway
+  labels:
+    istio.io/rev: istio-control-plane
+  annotations:
+    networking.istio.io/service-type: ClusterIP
+spec:
+  gatewayClassName: istio
+  listeners:
+  - name: default
+    port: 80
+    protocol: HTTP
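A quick sanity check for the Gateway and EnvoyFilter above, once they are applied: the Gateway should report Programmed, and the EnvoyFilter's cluster_name should point at the real namespace rather than the REPLACE_NAMESPACE placeholder. These commands are illustrative and not part of this change (the wait mirrors what scripts/run-kind.sh does; add -n <namespace> as needed):

  $ kubectl wait gateway/inference-gateway --for=condition=Programmed --timeout=60s
  $ kubectl get envoyfilter endpoint-picker -o yaml | grep cluster_name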
diff --git a/deploy/components/inference-gateway/kustomization.yaml b/deploy/components/inference-gateway/kustomization.yaml
new file mode 100644
index 000000000..10b898cb7
--- /dev/null
+++ b/deploy/components/inference-gateway/kustomization.yaml
@@ -0,0 +1,30 @@
+# ------------------------------------------------------------------------------
+# Inference Gateway
+#
+# This deploys a Gateway and the Endpoint Picker (EPP), and attaches the EPP to
+# the Gateway with an EnvoyFilter.
+#
+# Add an HTTPRoute to route traffic to VLLM, or a VLLM simulator.
+#
+# **WARNING**: The EnvoyFilter contains a variable that needs to be replaced
+# with the namespace to match the EPP's Service. For now use sed to replace it,
+# e.g.:
+#
+#   $ kubectl kustomize deploy/components/inference-gateway \
+#       | sed 's/REPLACE_NAMESPACE/mynamespace/gI' \
+#       | kubectl -n mynamespace apply -f -
+# ------------------------------------------------------------------------------
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+- configmaps.yaml
+- deployments.yaml
+- services.yaml
+- rbac.yaml
+- gateways.yaml
+- envoy-filters.yaml
+
+images:
+- name: inference-router/router-ext-proc
+  newTag: 0.0.1
diff --git a/deploy/components/inference-gateway/rbac.yaml b/deploy/components/inference-gateway/rbac.yaml
new file mode 100644
index 000000000..1b457dc87
--- /dev/null
+++ b/deploy/components/inference-gateway/rbac.yaml
@@ -0,0 +1,31 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: endpoint-picker
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: endpoint-picker
+rules:
+  - apiGroups:
+    - ""
+    resources:
+    - "pods"
+    verbs:
+    - "get"
+    - "list"
+    - "watch"
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: endpoint-picker-binding
+subjects:
+  - kind: ServiceAccount
+    name: endpoint-picker
+roleRef:
+  kind: Role
+  name: endpoint-picker
+  apiGroup: rbac.authorization.k8s.io
+
diff --git a/deploy/components/inference-gateway/services.yaml b/deploy/components/inference-gateway/services.yaml
new file mode 100644
index 000000000..d8d5d5b1e
--- /dev/null
+++ b/deploy/components/inference-gateway/services.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: endpoint-picker
+spec:
+  type: ClusterIP
+  selector:
+    app: endpoint-picker
+  ports:
+  - name: grpc
+    protocol: TCP
+    port: 9080
+    targetPort: 9080
diff --git a/deploy/components/istio-control-plane/control-plane.yaml b/deploy/components/istio-control-plane/control-plane.yaml
new file mode 100644
index 000000000..2dcf3face
--- /dev/null
+++ b/deploy/components/istio-control-plane/control-plane.yaml
@@ -0,0 +1,13 @@
+apiVersion: sailoperator.io/v1
+kind: Istio
+metadata:
+  name: control-plane
+spec:
+  version: v1.25-latest
+  values:
+    pilot:
+      resources:
+        requests:
+          cpu: 100m
+          memory: 1024Mi
+
diff --git a/deploy/components/istio-control-plane/kustomization.yaml b/deploy/components/istio-control-plane/kustomization.yaml
new file mode 100644
index 000000000..89e9b06e1
--- /dev/null
+++ b/deploy/components/istio-control-plane/kustomization.yaml
@@ -0,0 +1,15 @@
+# ------------------------------------------------------------------------------
+# Istio Control Plane
+#
+# This deploys an Istio control-plane for the entire cluster. This enables the
+# creation of Gateways.
+# ------------------------------------------------------------------------------
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: istio-system
+namePrefix: istio-
+
+resources:
+- namespaces.yaml
+- control-plane.yaml
diff --git a/deploy/components/istio-control-plane/namespaces.yaml b/deploy/components/istio-control-plane/namespaces.yaml
new file mode 100644
index 000000000..1ab3a7255
--- /dev/null
+++ b/deploy/components/istio-control-plane/namespaces.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: system
diff --git a/deploy/components/sail-operator/.gitignore b/deploy/components/sail-operator/.gitignore
new file mode 100644
index 000000000..ee3892e87
--- /dev/null
+++ b/deploy/components/sail-operator/.gitignore
@@ -0,0 +1 @@
+charts/
diff --git a/deploy/components/sail-operator/kustomization.yaml b/deploy/components/sail-operator/kustomization.yaml
new file mode 100644
index 000000000..d50bcadc4
--- /dev/null
+++ b/deploy/components/sail-operator/kustomization.yaml
@@ -0,0 +1,32 @@
+# ------------------------------------------------------------------------------
+# Istio Sail Operator
+#
+# This deploys the Istio Sail Operator via Helm chart to enable the creation
+# of Istio Control Planes, and ultimately Gateways. This will also deploy all
+# the Istio and Gateway API CRDs.
+#
+# This is required on Kubernetes clusters, and on OpenShift clusters at
+# versions below 4.19 (OpenShift 4.19+ includes all of this by default).
+#
+# **Warning**: This needs to be deployed before, and separately from, the
+# other components, as it deploys CRDs. It can be deployed with:
+#
+#   $ kubectl kustomize --enable-helm deploy/components/sail-operator/ \
+#       | kubectl apply --server-side --force-conflicts -f -
+#
+# ------------------------------------------------------------------------------
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: sail-operator
+
+resources:
+- https://github.com/kubernetes-sigs/gateway-api/config/crd?ref=v1.2.1
+- namespaces.yaml
+
+helmCharts:
+- name: sail-operator
+  namespace: sail-operator
+  repo: https://istio-ecosystem.github.io/sail-operator
+  version: 1.25.1
+  includeCRDs: true
diff --git a/deploy/components/sail-operator/namespaces.yaml b/deploy/components/sail-operator/namespaces.yaml
new file mode 100644
index 000000000..ddc027d84
--- /dev/null
+++ b/deploy/components/sail-operator/namespaces.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: sail-operator
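Because the Sail Operator component above is what installs the CRDs that every later component depends on, it is worth confirming the operator is ready before applying anything else. A minimal check, mirroring what scripts/run-kind.sh does (illustrative only; the istios.sailoperator.io CRD name is assumed from the Istio resource in this change):

  $ kubectl get crd istios.sailoperator.io gateways.gateway.networking.k8s.io
  $ kubectl -n sail-operator wait deployment/sail-operator --for=condition=Available --timeout=60s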
diff --git a/deploy/components/vllm-sim/deployments.yaml b/deploy/components/vllm-sim/deployments.yaml
new file mode 100644
index 000000000..e7c981cfa
--- /dev/null
+++ b/deploy/components/vllm-sim/deployments.yaml
@@ -0,0 +1,86 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-30801
+  labels:
+    app: vllm-30801
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-30801
+  template:
+    metadata:
+      labels:
+        app: vllm-30801
+        ai-aware-router-pod: "true"
+      annotations:
+        ai-aware-router-address: 127.0.0.1:30801
+    spec:
+      containers:
+      - name: vllm
+        image: vllm-sim/vllm-sim:latest
+        args:
+        - "--port=30801"
+        - "--model=model1"
+        - "--lora=lora1,lora2"
+        ports:
+        - containerPort: 30801
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-30802
+  labels:
+    app: vllm-30802
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-30802
+  template:
+    metadata:
+      labels:
+        app: vllm-30802
+        ai-aware-router-pod: "true"
+      annotations:
+        ai-aware-router-address: 127.0.0.1:30802
+    spec:
+      containers:
+      - name: vllm
+        image: vllm-sim/vllm-sim:latest
+        args:
+        - "--port=30802"
+        - "--model=model1"
+        - "--lora=lora1,lora2"
+        ports:
+        - containerPort: 30802
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-30803
+  labels:
+    app: vllm-30803
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-30803
+  template:
+    metadata:
+      labels:
+        app: vllm-30803
+        ai-aware-router-pod: "true"
+      annotations:
+        ai-aware-router-address: 127.0.0.1:30803
+    spec:
+      containers:
+      - name: vllm
+        image: vllm-sim/vllm-sim:latest
+        args:
+        - "--port=30803"
+        - "--model=model2"
+        - "--lora=lora3"
+        ports:
+        - containerPort: 30803
diff --git a/deploy/components/vllm-sim/kustomization.yaml b/deploy/components/vllm-sim/kustomization.yaml
new file mode 100644
index 000000000..b49d7b63d
--- /dev/null
+++ b/deploy/components/vllm-sim/kustomization.yaml
@@ -0,0 +1,17 @@
+# ------------------------------------------------------------------------------
+# VLLM Simulator
+#
+# This deploys a VLLM simulator which can be used to simulate inference for
+# small environments (e.g. Kubernetes In Docker (KIND) clusters) or for simple
+# tests.
+# ------------------------------------------------------------------------------
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+- deployments.yaml
+- services.yaml
+
+images:
+- name: vllm-sim/vllm-sim
+  newTag: 0.0.2
diff --git a/deploy/components/vllm-sim/services.yaml b/deploy/components/vllm-sim/services.yaml
new file mode 100644
index 000000000..9e67d79a6
--- /dev/null
+++ b/deploy/components/vllm-sim/services.yaml
@@ -0,0 +1,38 @@
+kind: Service
+apiVersion: v1
+metadata:
+  name: vllm-30801
+spec:
+  type: ClusterIP
+  selector:
+    app: vllm-30801
+  ports:
+  - protocol: TCP
+    port: 30801
+    targetPort: 30801
+---
+kind: Service
+apiVersion: v1
+metadata:
+  name: vllm-30802
+spec:
+  type: ClusterIP
+  selector:
+    app: vllm-30802
+  ports:
+  - protocol: TCP
+    port: 30802
+    targetPort: 30802
+---
+kind: Service
+apiVersion: v1
+metadata:
+  name: vllm-30803
+spec:
+  type: ClusterIP
+  selector:
+    app: vllm-30803
+  ports:
+  - protocol: TCP
+    port: 30803
+    targetPort: 30803
diff --git a/deploy/environments/kind/httproutes.yaml b/deploy/environments/kind/httproutes.yaml
new file mode 100644
index 000000000..7ff6e56b4
--- /dev/null
+++ b/deploy/environments/kind/httproutes.yaml
@@ -0,0 +1,19 @@
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: inference-route
+spec:
+  parentRefs:
+  - name: inference-gateway
+  rules:
+  - matches:
+    - path:
+        type: PathPrefix
+        value: /v1
+    backendRefs:
+    - name: vllm-30801
+      port: 30801
+    - name: vllm-30802
+      port: 30802
+    - name: vllm-30803
+      port: 30803
diff --git a/deploy/environments/kind/kustomization.yaml b/deploy/environments/kind/kustomization.yaml
new file mode 100644
index 000000000..f5f8d76cc
--- /dev/null
+++ b/deploy/environments/kind/kustomization.yaml
@@ -0,0 +1,31 @@
+# ------------------------------------------------------------------------------
+# Kubernetes In Docker (KIND) Environment
+#
+# This will deploy the full development stack on a KIND cluster:
+#
+# * Istio Control Plane
+# * VLLM Simulator
+# * Inference Gateway
+#
+# **Note**: The Sail Operator must be deployed first.
+#
+# This will expose the VLLM simulator via an HTTPRoute. You can access the
+# Gateway with a port-forward:
+#
+#   $ kubectl port-forward service/inference-gateway-istio 8080:80
+#
+# Then requests can be made:
+#
+#   $ curl -v -w '\n' -X POST -H 'Content-Type: application/json' \
+#       -d '{"model":"model1","messages":[{"role":"user","content":"Hello!"}]}' \
+#       http://localhost:8080/v1/chat/completions
+#
+# ------------------------------------------------------------------------------
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+- ../../components/istio-control-plane/
+- ../../components/vllm-sim/
+- ../../components/inference-gateway/
+- httproutes.yaml
diff --git a/scripts/run-kind.sh b/scripts/run-kind.sh
new file mode 100755
index 000000000..a8d86c9e8
--- /dev/null
+++ b/scripts/run-kind.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+
+# This shell script deploys a kind cluster with an Istio-based Gateway API
+# implementation fully configured. It deploys the vllm simulator, which it
+# exposes with a Gateway and HTTPRoute. The Gateway is configured with a
+# filter for the ext_proc endpoint picker.
+
+set -eo pipefail
+
+# ------------------------------------------------------------------------------
+# Variables
+# ------------------------------------------------------------------------------
+
+# Set a default CLUSTER_NAME if not provided
+: "${CLUSTER_NAME:=inference-gateway}"
+
+# Set a default VLLM_SIMULATOR_VERSION if not provided
+: "${VLLM_SIMULATOR_VERSION:=0.0.2}"
+
+# Set a default ENDPOINT_PICKER_VERSION if not provided
+: "${ENDPOINT_PICKER_VERSION:=0.0.1}"
+
+# ------------------------------------------------------------------------------
+# Setup & Requirement Checks
+# ------------------------------------------------------------------------------
+
+# Check for a supported container runtime if an explicit one was not set
+if [ -z "${CONTAINER_RUNTIME}" ]; then
+  if command -v docker &> /dev/null; then
+    CONTAINER_RUNTIME="docker"
+  elif command -v podman &> /dev/null; then
+    CONTAINER_RUNTIME="podman"
+  else
+    echo "Neither docker nor podman could be found in PATH" >&2
+    exit 1
+  fi
+fi
+
+set -u
+
+# Check for required programs
+for cmd in kind kubectl ${CONTAINER_RUNTIME}; do
+  if ! command -v "$cmd" &> /dev/null; then
+    echo "Error: $cmd is not installed or not in the PATH."
+    exit 1
+  fi
+done
+
+# ------------------------------------------------------------------------------
+# Cluster Deployment
+# ------------------------------------------------------------------------------
+
+# Check if the cluster already exists
+if kind get clusters 2>/dev/null | grep -q "^${CLUSTER_NAME}$"; then
+  echo "Cluster '${CLUSTER_NAME}' already exists, re-using"
+else
+  kind create cluster --name "${CLUSTER_NAME}"
+fi
+
+# Set the kubectl context to the kind cluster
+KUBE_CONTEXT="kind-${CLUSTER_NAME}"
+
+set -x
+
+# Hotfix for https://github.com/kubernetes-sigs/kind/issues/3880
+CONTAINER_NAME="${CLUSTER_NAME}-control-plane"
+${CONTAINER_RUNTIME} exec -it ${CONTAINER_NAME} /bin/bash -c "sysctl net.ipv4.conf.all.arp_ignore=0"
+
+# Wait for all pods to be ready
+kubectl --context ${KUBE_CONTEXT} -n kube-system wait --for=condition=Ready --all pods --timeout=300s
+kubectl --context ${KUBE_CONTEXT} -n local-path-storage wait --for=condition=Ready --all pods --timeout=300s
+
+# Load the vllm simulator image into the cluster
+if [ "${CONTAINER_RUNTIME}" == "podman" ]; then
+  podman tag localhost/vllm-sim/vllm-sim:${VLLM_SIMULATOR_VERSION} docker.io/vllm-sim/vllm-sim:${VLLM_SIMULATOR_VERSION}
+  podman save docker.io/vllm-sim/vllm-sim:${VLLM_SIMULATOR_VERSION} -o /dev/stdout | kind --name ${CLUSTER_NAME} load image-archive /dev/stdin
+else
+  kind --name ${CLUSTER_NAME} load docker-image vllm-sim/vllm-sim:${VLLM_SIMULATOR_VERSION}
+fi
+
+# Load the ext_proc endpoint-picker image into the cluster
+if [ "${CONTAINER_RUNTIME}" == "podman" ]; then
+  podman tag localhost/inference-router/router-ext-proc:${ENDPOINT_PICKER_VERSION} docker.io/inference-router/router-ext-proc:${ENDPOINT_PICKER_VERSION}
+  podman save docker.io/inference-router/router-ext-proc:${ENDPOINT_PICKER_VERSION} -o /dev/stdout | kind --name ${CLUSTER_NAME} load image-archive /dev/stdin
+else
+  kind --name ${CLUSTER_NAME} load docker-image inference-router/router-ext-proc:${ENDPOINT_PICKER_VERSION}
+fi
+
+# ------------------------------------------------------------------------------
+# Sail Operator Deployment
+# ------------------------------------------------------------------------------
+
+# Deploy the Sail Operator
+kubectl kustomize --enable-helm deploy/components/sail-operator |
+  kubectl --context ${KUBE_CONTEXT} apply --server-side --force-conflicts -f -
+
+# Wait for the Sail Operator to be ready
+kubectl --context ${KUBE_CONTEXT} -n sail-operator wait deployment/sail-operator --for=condition=Available --timeout=60s
+
+# ------------------------------------------------------------------------------
+# Development Environment
+# ------------------------------------------------------------------------------
+
+# Deploy the environment to the "default" namespace
+kubectl kustomize deploy/environments/kind | sed 's/REPLACE_NAMESPACE/default/gI' \
+  | kubectl --context ${KUBE_CONTEXT} apply -f -
+
+# Wait for all pods to be ready
+kubectl --context ${KUBE_CONTEXT} wait --for=condition=Ready --all pods --timeout=300s
+
+# Wait for the gateway to be ready
+kubectl --context ${KUBE_CONTEXT} wait gateway/inference-gateway --for=condition=Programmed --timeout=60s
+
+cat <
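When iterating on this script, the simplest reset is to delete the kind cluster and re-run it; a sketch, assuming the default CLUSTER_NAME above is unchanged:

  $ kind delete cluster --name inference-gateway
  $ ./scripts/run-kind.sh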