kubernetes-sigs · k8s-ci-robot · Jul 15, 2025 · Jul 15, 2025 · Jul 15, 2025
diff --git a/config/manifests/inferencepool-resources.yaml b/config/manifests/inferencepool-resources.yaml
@@ -1,6 +1,8 @@
-# Note: If you change this file, please also change the file used for e2e tests!
-# 
-# https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/test/testdata/inferencepool-e2e.yaml
+# Note: If you change this file, please also change:
+#  - ./test/testdata/inferencepool-e2e.yaml
+#  - ./conformance/resources/manifests/manifests.yaml
+#  - ./site-src/guides/inferencepool-rollout.md
+---
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
 metadata:

diff --git a/conformance/resources/manifests/manifests.yaml b/conformance/resources/manifests/manifests.yaml
@@ -342,7 +342,7 @@ apiVersion: v1
 kind: ConfigMap
 metadata:
   name: plugins-config
-  namespace: default
+  namespace: gateway-conformance-app-backend
 data:
   conformance-plugins.yaml: |
     apiVersion: inference.networking.x-k8s.io/v1alpha1

diff --git a/site-src/guides/inferencepool-rollout.md b/site-src/guides/inferencepool-rollout.md
@@ -177,7 +177,6 @@ spec:
       terminationGracePeriodSeconds: 130
       nodeSelector:
         cloud.google.com/gke-accelerator: "nvidia-h100-80gb"
-
       volumes:
         - name: data
           emptyDir: {}
@@ -250,40 +249,133 @@ spec:
     spec:
       terminationGracePeriodSeconds: 130
       containers:
-        - name: epp
-          image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
-          imagePullPolicy: Always
-          args:
-            - -poolName
-            - "vllm-llama3-8b-instruct-new"
-            - "-poolNamespace"
-            - "default"
-            - -v
-            - "4"
-            - --zap-encoder
-            - "json"
-            - -grpcPort
-            - "9002"
-            - -grpcHealthPort
-            - "9003"
-          ports:
-            - containerPort: 9002
-            - containerPort: 9003
-            - name: metrics
-              containerPort: 9090
-          livenessProbe:
-            grpc:
-              port: 9003
-              service: inference-extension
-            initialDelaySeconds: 5
-            periodSeconds: 10
-          readinessProbe:
-            grpc:
-              port: 9003
-              service: inference-extension
-            initialDelaySeconds: 5
-            periodSeconds: 10
-  EOF
+      - name: epp
+        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
+        imagePullPolicy: Always
+        args:
+        - -poolName
+        - "vllm-llama3-8b-instruct-new"
+        - -poolNamespace
+        - "default"
+        - -v
+        - "4"
+        - --zap-encoder
+        - "json"
+        - -grpcPort
+        - "9002"
+        - -grpcHealthPort
+        - "9003"
+        - -configFile
+        - "/config/default-plugins.yaml"
+        ports:
+        - containerPort: 9002
+          name: grpc
+        - containerPort: 9003
+          name: grpc-health
+        - containerPort: 9090
+          name: metrics
+        livenessProbe:
+          grpc:
+            port: 9003
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        readinessProbe:
+          grpc:
+            port: 9003
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        volumeMounts:
+        - name: plugins-config-volume
+          mountPath: /config
+      volumes:
+      - name: plugins-config-volume
+        configMap:
+          name: plugins-config
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: plugins-config
+  namespace: default
+data:
+  default-plugins.yaml: |
+    apiVersion: inference.networking.x-k8s.io/v1alpha1
+    kind: EndpointPickerConfig
+    plugins:
+    - type: low-queue-filter
+      parameters:
+        threshold: 128
+    - type: lora-affinity-filter
+      parameters:
+        threshold: 0.999
+    - type: least-queue-filter
+    - type: least-kv-cache-filter
+    - type: decision-tree-filter
+      name: low-latency-filter
+      parameters:
+        current:
+          pluginRef: low-queue-filter
+        nextOnSuccess:
+          decisionTree:
+            current:
+              pluginRef: lora-affinity-filter
+            nextOnSuccessOrFailure:
+              decisionTree:
+                current:
+                  pluginRef: least-queue-filter
+                nextOnSuccessOrFailure:
+                  decisionTree:
+                    current:
+                      pluginRef: least-kv-cache-filter
+        nextOnFailure:
+          decisionTree:
+            current:
+              pluginRef: least-queue-filter
+            nextOnSuccessOrFailure:
+              decisionTree:
+                current:
+                  pluginRef: lora-affinity-filter
+                nextOnSuccessOrFailure:
+                  decisionTree:
+                    current:
+                      pluginRef: least-kv-cache-filter
+    - type: random-picker
+      parameters:
+        maxNumOfEndpoints: 1
+    - type: single-profile-handler
+    schedulingProfiles:
+    - name: default
+      plugins:
+      - pluginRef: low-latency-filter
+      - pluginRef: random-picker
+  plugins-v2.yaml: |
+    apiVersion: inference.networking.x-k8s.io/v1alpha1
+    kind: EndpointPickerConfig
+    plugins:
+    - type: queue-scorer
+    - type: kv-cache-scorer
+    - type: prefix-cache-scorer
+      parameters:
+        hashBlockSize: 64
+        maxPrefixBlocksToMatch: 256
+        lruCapacityPerServer: 31250
+    - type: max-score-picker
+      parameters:
+        maxNumOfEndpoints: 1
+    - type: single-profile-handler
+    schedulingProfiles:
+    - name: default
+      plugins:
+      - pluginRef: queue-scorer
+        weight: 1
+      - pluginRef: kv-cache-scorer
+        weight: 1
+      - pluginRef: prefix-cache-scorer
+        weight: 1
+      - pluginRef: max-score-picker
+EOF
 ```
 
 ### Direct traffic to the new inference pool