neuralmagic · clubanderson · Apr 17, 2025 · Apr 16, 2025 · Apr 16, 2025 · Apr 16, 2025
diff --git a/deploy/common/patch-service.yaml → ...penshift-router/common/patch-service.yaml b/deploy/common/patch-service.yaml → ...penshift-router/common/patch-service.yaml
diff --git a/deploy/common/patch-statefulset.yaml → ...hift-router/common/patch-statefulset.yaml b/deploy/common/patch-statefulset.yaml → ...hift-router/common/patch-statefulset.yaml
diff --git a/deploy/common/service.yaml → ...xtra/openshift-router/common/service.yaml b/deploy/common/service.yaml → ...xtra/openshift-router/common/service.yaml
diff --git a/deploy/common/statefulset.yaml → .../openshift-router/common/statefulset.yaml b/deploy/common/statefulset.yaml → .../openshift-router/common/statefulset.yaml
diff --git a/deploy/kustomization.yaml → ...extra/openshift-router/kustomization.yaml b/deploy/kustomization.yaml → ...extra/openshift-router/kustomization.yaml
diff --git a/deploy/openshift/patch-route.yaml → ...enshift-router/openshift/patch-route.yaml b/deploy/openshift/patch-route.yaml → ...enshift-router/openshift/patch-route.yaml
diff --git a/deploy/openshift/route.yaml → ...tra/openshift-router/openshift/route.yaml b/deploy/openshift/route.yaml → ...tra/openshift-router/openshift/route.yaml
diff --git a/deploy/rbac/exec-rbac-role.yaml → ...openshift-router/rbac/exec-rbac-role.yaml b/deploy/rbac/exec-rbac-role.yaml → ...openshift-router/rbac/exec-rbac-role.yaml
diff --git a/deploy/rbac/exec-rbac-rolebinding.yaml → ...ft-router/rbac/exec-rbac-rolebinding.yaml b/deploy/rbac/exec-rbac-rolebinding.yaml → ...ft-router/rbac/exec-rbac-rolebinding.yaml
diff --git a/deploy/rbac/patch-rbac-role.yaml → ...penshift-router/rbac/patch-rbac-role.yaml b/deploy/rbac/patch-rbac-role.yaml → ...penshift-router/rbac/patch-rbac-role.yaml
diff --git a/deploy/rbac/patch-rbac-rolebinding.yaml → ...t-router/rbac/patch-rbac-rolebinding.yaml b/deploy/rbac/patch-rbac-rolebinding.yaml → ...t-router/rbac/patch-rbac-rolebinding.yaml
diff --git a/deploy/components/inference-gateway/configmaps.yaml b/deploy/components/inference-gateway/configmaps.yaml
@@ -0,0 +1,23 @@
+# Endpoint Picker (EPP) configuration. The `config.yaml` key is mounted as a
+# file by the endpoint-picker Deployment and passed to it via `--config-file`.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: endpoint-picker-config
+data:
+  config.yaml: |
+    # Presumably a label selector for candidate pods. Label values are strings,
+    # so the value is quoted to match the `ai-aware-router-pod: "true"` label.
+    pod_selector:
+      ai-aware-router-pod: "true"
+    # No filters configured; written as an explicit empty list, not null.
+    routing_filters: []
+    routing_scorers:
+    - name: session-affinity
+      weight: 60
+    - name: route-by-active-lora
+      weight: 50
+    routing_header: x-ai-aware-router-routing
+    session_id_header: x-ai-aware-router-session-id
+    listening_port: 9080
+    inference_port: 8000
diff --git a/deploy/components/inference-gateway/deployments.yaml b/deploy/components/inference-gateway/deployments.yaml
@@ -0,0 +1,36 @@
+# Endpoint Picker (EPP) Deployment: a single replica exposing gRPC on 9080,
+# configured from the endpoint-picker-config ConfigMap mounted at
+# /etc/endpoint-picker.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: endpoint-picker
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: endpoint-picker
+  template:
+    metadata:
+      labels:
+        app: endpoint-picker
+    spec:
+      serviceAccountName: endpoint-picker
+      volumes:
+      - name: endpoint-picker-config
+        configMap:
+          name: endpoint-picker-config
+      containers:
+      - name: endpoint-picker
+        # The tag is set by the `images` entry in this component's kustomization.
+        image: inference-router/router-ext-proc:latest
+        args:
+        - '--config-file'
+        - '/etc/endpoint-picker/config.yaml'
+        volumeMounts:
+        - name: endpoint-picker-config
+          mountPath: /etc/endpoint-picker
+        ports:
+        - name: grpc
+          protocol: TCP
+          containerPort: 9080
diff --git a/deploy/components/inference-gateway/envoy-filters.yaml b/deploy/components/inference-gateway/envoy-filters.yaml
@@ -0,0 +1,40 @@
+# Attaches the Endpoint Picker (EPP) to the gateway's HTTP connection manager
+# as an ext_proc (external processing) filter.
+#
+# REPLACE_NAMESPACE in the cluster name must be substituted with the namespace
+# of the endpoint-picker Service before applying.
+apiVersion: networking.istio.io/v1alpha3
+kind: EnvoyFilter
+metadata:
+  name: endpoint-picker
+spec:
+  configPatches:
+  - applyTo: HTTP_FILTER
+    match:
+      # Patch gateway proxies only; without a context the filter would also be
+      # inserted into any sidecar listeners this EnvoyFilter applies to.
+      context: GATEWAY
+      listener:
+        filterChain:
+          filter:
+            name: "envoy.filters.network.http_connection_manager"
+    patch:
+      operation: INSERT_FIRST
+      value:
+        name: envoy.filters.http.ext_proc
+        typed_config:
+          "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor
+          # Fail closed: requests are rejected when the EPP call fails.
+          failure_mode_allow: false
+          allow_mode_override: true
+          processing_mode:
+            request_header_mode: "SEND"
+            response_header_mode: "SEND"
+            request_body_mode: "BUFFERED"
+            response_body_mode: "BUFFERED"
+            request_trailer_mode: "SEND"
+            response_trailer_mode: "SKIP"
+          grpc_service:
+            envoy_grpc:
+              cluster_name: outbound|9080||endpoint-picker.REPLACE_NAMESPACE.svc.cluster.local
+            timeout: 5s
diff --git a/deploy/components/inference-gateway/gateways.yaml b/deploy/components/inference-gateway/gateways.yaml
@@ -0,0 +1,16 @@
+# Gateway for inference traffic: one plain HTTP listener on port 80, handled
+# by the `istio` GatewayClass.
+apiVersion: gateway.networking.k8s.io/v1
+kind: Gateway
+metadata:
+  name: inference-gateway
+  annotations:
+    networking.istio.io/service-type: ClusterIP
+  labels:
+    istio.io/rev: istio-control-plane
+spec:
+  gatewayClassName: istio
+  listeners:
+  - protocol: HTTP
+    port: 80
+    name: default
diff --git a/deploy/components/inference-gateway/kustomization.yaml b/deploy/components/inference-gateway/kustomization.yaml
@@ -0,0 +1,31 @@
+# ------------------------------------------------------------------------------
+# Inference Gateway
+#
+# Deploys a Gateway together with the Endpoint Picker (EPP), and wires the EPP
+# into the Gateway through an EnvoyFilter.
+#
+# An HTTPRoute still has to be added to send traffic to VLLM (or to a VLLM
+# simulator).
+#
+# **WARNING**: The EnvoyFilter contains a REPLACE_NAMESPACE placeholder that
+# must be set to the namespace of the EPP's Service. For now, substitute it
+# with sed, e.g.:
+#
+#   $ kubectl kustomize deploy/components/inference-gateway \
+#     | sed 's/REPLACE_NAMESPACE/mynamespace/gI' \
+#     | kubectl -n mynamespace apply -f -
+# ------------------------------------------------------------------------------
+kind: Kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+
+images:
+  - name: inference-router/router-ext-proc
+    newTag: 0.0.1
+
+resources:
+  - configmaps.yaml
+  - deployments.yaml
+  - services.yaml
+  - rbac.yaml
+  - gateways.yaml
+  - envoy-filters.yaml
diff --git a/deploy/components/inference-gateway/rbac.yaml b/deploy/components/inference-gateway/rbac.yaml
@@ -0,0 +1,28 @@
+# Identity used by the Endpoint Picker (EPP) pods.
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: endpoint-picker
+---
+# Read-only (get/list/watch) access to Pods.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: endpoint-picker
+rules:
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "list", "watch"]
+---
+# Grants the endpoint-picker Role to the endpoint-picker ServiceAccount.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: endpoint-picker-binding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: endpoint-picker
+subjects:
+- kind: ServiceAccount
+  name: endpoint-picker
diff --git a/deploy/components/inference-gateway/services.yaml b/deploy/components/inference-gateway/services.yaml
@@ -0,0 +1,14 @@
+# ClusterIP Service exposing the Endpoint Picker's gRPC port (9080).
+apiVersion: v1
+kind: Service
+metadata:
+  name: endpoint-picker
+spec:
+  type: ClusterIP
+  ports:
+  - name: grpc
+    port: 9080
+    targetPort: 9080
+    protocol: TCP
+  selector:
+    app: endpoint-picker
diff --git a/deploy/components/istio-control-plane/control-plane.yaml b/deploy/components/istio-control-plane/control-plane.yaml
@@ -0,0 +1,13 @@
+# Istio control plane (sailoperator.io `Istio` resource) with pilot requests.
+apiVersion: sailoperator.io/v1
+kind: Istio
+metadata:
+  name: control-plane
+spec:
+  version: v1.25-latest
+  values:
+    pilot:
+      resources:
+        requests:
+          memory: 1024Mi
+          cpu: 100m
diff --git a/deploy/components/istio-control-plane/kustomization.yaml b/deploy/components/istio-control-plane/kustomization.yaml
@@ -0,0 +1,15 @@
+# ------------------------------------------------------------------------------
+# Istio Control Plane
+#
+# Deploys a cluster-wide Istio control plane, which is what enables Gateways
+# to be created.
+# ------------------------------------------------------------------------------
+kind: Kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+
+namePrefix: istio-
+namespace: istio-system
+
+resources:
+  - namespaces.yaml
+  - control-plane.yaml
diff --git a/deploy/components/istio-control-plane/namespaces.yaml b/deploy/components/istio-control-plane/namespaces.yaml
@@ -0,0 +1,6 @@
+# NOTE(review): the name is presumably rewritten to `istio-system` by the
+# enclosing kustomization (namespace / namePrefix) — confirm rendered output.
+kind: Namespace
+apiVersion: v1
+metadata:
+  name: system
diff --git a/deploy/components/sail-operator/.gitignore b/deploy/components/sail-operator/.gitignore
@@ -0,0 +1,2 @@
+# Presumably the chart cache written by `kustomize --enable-helm` — confirm.
+charts/
diff --git a/deploy/components/sail-operator/kustomization.yaml b/deploy/components/sail-operator/kustomization.yaml
@@ -0,0 +1,32 @@
+# ------------------------------------------------------------------------------
+# Istio Sail Operator
+#
+# Installs the Istio Sail Operator from its Helm chart so that Istio Control
+# Planes, and ultimately Gateways, can be created. The Istio and Gateway API
+# CRDs are deployed as well.
+#
+# Required on Kubernetes clusters and on OpenShift clusters older than 4.19
+# (OpenShift 4.19+ includes all of this by default).
+#
+# **Warning**: Because it deploys CRDs, this must be applied before, and
+# separately from, the other components. Deploy it with:
+#
+#   $ kubectl kustomize --enable-helm deploy/components/sail-operator/ \
+#     | kubectl apply --server-side --force-conflicts -f -
+#
+# ------------------------------------------------------------------------------
+kind: Kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+
+namespace: sail-operator
+
+helmCharts:
+  - name: sail-operator
+    repo: https://istio-ecosystem.github.io/sail-operator
+    version: 1.25.1
+    namespace: sail-operator
+    includeCRDs: true
+
+resources:
+  - https://github.com/kubernetes-sigs/gateway-api/config/crd?ref=v1.2.1
+  - namespaces.yaml
diff --git a/deploy/components/sail-operator/namespaces.yaml b/deploy/components/sail-operator/namespaces.yaml
@@ -0,0 +1,5 @@
+# Namespace targeted by the sail-operator kustomization.
+kind: Namespace
+apiVersion: v1
+metadata:
+  name: sail-operator
diff --git a/deploy/components/vllm-sim/deployments.yaml b/deploy/components/vllm-sim/deployments.yaml
@@ -0,0 +1,89 @@
+# Simulator on port 30801, started with --model=model1 --lora=lora1,lora2.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-30801
+  labels:
+    app: vllm-30801
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-30801
+  template:
+    metadata:
+      annotations:
+        ai-aware-router-address: "127.0.0.1:30801"
+      labels:
+        ai-aware-router-pod: "true"
+        app: vllm-30801
+    spec:
+      containers:
+      - name: vllm
+        image: vllm-sim/vllm-sim:latest
+        ports:
+        - containerPort: 30801
+        args:
+        - '--port=30801'
+        - '--model=model1'
+        - '--lora=lora1,lora2'
+---
+# Simulator on port 30802, started with --model=model1 --lora=lora1,lora2.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-30802
+  labels:
+    app: vllm-30802
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-30802
+  template:
+    metadata:
+      annotations:
+        ai-aware-router-address: "127.0.0.1:30802"
+      labels:
+        ai-aware-router-pod: "true"
+        app: vllm-30802
+    spec:
+      containers:
+      - name: vllm
+        image: vllm-sim/vllm-sim:latest
+        ports:
+        - containerPort: 30802
+        args:
+        - '--port=30802'
+        - '--model=model1'
+        - '--lora=lora1,lora2'
+---
+# Simulator on port 30803, started with --model=model2 --lora=lora3.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-30803
+  labels:
+    app: vllm-30803
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-30803
+  template:
+    metadata:
+      annotations:
+        ai-aware-router-address: "127.0.0.1:30803"
+      labels:
+        ai-aware-router-pod: "true"
+        app: vllm-30803
+    spec:
+      containers:
+      - name: vllm
+        image: vllm-sim/vllm-sim:latest
+        ports:
+        - containerPort: 30803
+        args:
+        - '--port=30803'
+        - '--model=model2'
+        - '--lora=lora3'
diff --git a/deploy/components/vllm-sim/kustomization.yaml b/deploy/components/vllm-sim/kustomization.yaml
@@ -0,0 +1,16 @@
+# ------------------------------------------------------------------------------
+# VLLM Simulator
+#
+# Deploys a VLLM simulator for simulating inference in small environments
+# (e.g. Kubernetes In Docker (KIND) clusters) or in simple tests.
+# ------------------------------------------------------------------------------
+kind: Kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+
+images:
+  - name: vllm-sim/vllm-sim
+    newTag: 0.0.2
+
+resources:
+  - deployments.yaml
+  - services.yaml
diff --git a/deploy/components/vllm-sim/services.yaml b/deploy/components/vllm-sim/services.yaml
@@ -0,0 +1,41 @@
+# Exposes pods labelled app=vllm-30801 on port 30801.
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-30801
+spec:
+  type: ClusterIP
+  ports:
+  - port: 30801
+    targetPort: 30801
+    protocol: TCP
+  selector:
+    app: vllm-30801
+---
+# Exposes pods labelled app=vllm-30802 on port 30802.
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-30802
+spec:
+  type: ClusterIP
+  ports:
+  - port: 30802
+    targetPort: 30802
+    protocol: TCP
+  selector:
+    app: vllm-30802
+---
+# Exposes pods labelled app=vllm-30803 on port 30803.
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-30803
+spec:
+  type: ClusterIP
+  ports:
+  - port: 30803
+    targetPort: 30803
+    protocol: TCP
+  selector:
+    app: vllm-30803