Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions config/manifests/inferencepool-resources.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Note: If you change this file, please also change the file used for e2e tests!
#
# https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/test/testdata/inferencepool-e2e.yaml
# Note: If you change this file, please also change:
# - ./test/testdata/inferencepool-e2e.yaml
# - ./conformance/resources/manifests/manifests.yaml
# - ./site-src/guides/inferencepool-rollout.md
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
Expand Down
2 changes: 1 addition & 1 deletion conformance/resources/manifests/manifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ apiVersion: v1
kind: ConfigMap
metadata:
name: plugins-config
namespace: default
namespace: gateway-conformance-app-backend
data:
conformance-plugins.yaml: |
apiVersion: inference.networking.x-k8s.io/v1alpha1
Expand Down
162 changes: 127 additions & 35 deletions site-src/guides/inferencepool-rollout.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,6 @@ spec:
terminationGracePeriodSeconds: 130
nodeSelector:
cloud.google.com/gke-accelerator: "nvidia-h100-80gb"

volumes:
- name: data
emptyDir: {}
Expand Down Expand Up @@ -250,40 +249,133 @@ spec:
spec:
terminationGracePeriodSeconds: 130
containers:
- name: epp
image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
imagePullPolicy: Always
args:
- -poolName
- "vllm-llama3-8b-instruct-new"
- "-poolNamespace"
- "default"
- -v
- "4"
- --zap-encoder
- "json"
- -grpcPort
- "9002"
- -grpcHealthPort
- "9003"
ports:
- containerPort: 9002
- containerPort: 9003
- name: metrics
containerPort: 9090
livenessProbe:
grpc:
port: 9003
service: inference-extension
initialDelaySeconds: 5
periodSeconds: 10
readinessProbe:
grpc:
port: 9003
service: inference-extension
initialDelaySeconds: 5
periodSeconds: 10
EOF
# epp container for the new pool "vllm-llama3-8b-instruct-new", followed by the
# pod-level volume that supplies its plugin configuration.
- name: epp
image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
imagePullPolicy: Always
args:
- -poolName
- "vllm-llama3-8b-instruct-new"
- -poolNamespace
- "default"
- -v
- "4"
- --zap-encoder
- "json"
- -grpcPort
- "9002"
- -grpcHealthPort
- "9003"
# The path must name a key of the plugins-config ConfigMap, which is mounted at
# /config below. That ConfigMap defines default-plugins.yaml and
# plugins-v2.yaml; it has no conformance-plugins.yaml key.
- -configFile
- "/config/default-plugins.yaml"
ports:
# 9002 matches -grpcPort above.
- containerPort: 9002
name: grpc
# 9003 matches -grpcHealthPort above and is the port both probes target.
- containerPort: 9003
name: grpc-health
- containerPort: 9090
name: metrics
livenessProbe:
grpc:
port: 9003
service: inference-extension
initialDelaySeconds: 5
periodSeconds: 10
readinessProbe:
grpc:
port: 9003
service: inference-extension
initialDelaySeconds: 5
periodSeconds: 10
volumeMounts:
# Exposes the keys of the plugins-config ConfigMap as files under /config.
- name: plugins-config-volume
mountPath: /config
volumes:
# Backs the /config mount above with the plugins-config ConfigMap.
- name: plugins-config-volume
configMap:
name: plugins-config
---
# ConfigMap "plugins-config" (namespace "default") holding two alternative
# EndpointPickerConfig documents. The epp container mounts this ConfigMap at
# /config, so each key under data is what its -configFile argument can point to
# (/config/default-plugins.yaml or /config/plugins-v2.yaml).
apiVersion: v1
kind: ConfigMap
metadata:
name: plugins-config
namespace: default
data:
# default-plugins.yaml: declares four filters (low-queue-filter,
# lora-affinity-filter, least-queue-filter, least-kv-cache-filter), composes
# them into a decision-tree-filter named low-latency-filter, and uses that
# filter together with random-picker (maxNumOfEndpoints: 1) in the "default"
# scheduling profile.
# NOTE(review): indentation is flattened in this view, so the level at which
# each nextOnSuccess / nextOnFailure / nextOnSuccessOrFailure branch attaches
# cannot be read from here -- confirm against the source manifest.
default-plugins.yaml: |
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
- type: low-queue-filter
parameters:
threshold: 128
- type: lora-affinity-filter
parameters:
threshold: 0.999
- type: least-queue-filter
- type: least-kv-cache-filter
- type: decision-tree-filter
name: low-latency-filter
parameters:
current:
pluginRef: low-queue-filter
nextOnSuccess:
decisionTree:
current:
pluginRef: lora-affinity-filter
nextOnSuccessOrFailure:
decisionTree:
current:
pluginRef: least-queue-filter
nextOnSuccessOrFailure:
decisionTree:
current:
pluginRef: least-kv-cache-filter
nextOnFailure:
decisionTree:
current:
pluginRef: least-queue-filter
nextOnSuccessOrFailure:
decisionTree:
current:
pluginRef: lora-affinity-filter
nextOnSuccessOrFailure:
decisionTree:
current:
pluginRef: least-kv-cache-filter
- type: random-picker
parameters:
maxNumOfEndpoints: 1
- type: single-profile-handler
schedulingProfiles:
- name: default
plugins:
- pluginRef: low-latency-filter
- pluginRef: random-picker
# plugins-v2.yaml: scorer-based alternative. Declares queue-scorer,
# kv-cache-scorer and prefix-cache-scorer, gives each a weight of 1 in the
# "default" scheduling profile, and selects with max-score-picker
# (maxNumOfEndpoints: 1).
plugins-v2.yaml: |
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
- type: queue-scorer
- type: kv-cache-scorer
- type: prefix-cache-scorer
parameters:
hashBlockSize: 64
maxPrefixBlocksToMatch: 256
lruCapacityPerServer: 31250
- type: max-score-picker
parameters:
maxNumOfEndpoints: 1
- type: single-profile-handler
schedulingProfiles:
- name: default
plugins:
- pluginRef: queue-scorer
weight: 1
- pluginRef: kv-cache-scorer
weight: 1
- pluginRef: prefix-cache-scorer
weight: 1
- pluginRef: max-score-picker
EOF
```

### Direct traffic to the new inference pool
Expand Down