Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions config/charts/inferencepool/templates/epp-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
{{- /*
ConfigMap holding the EndpointPickerConfig files shipped with the chart.
The EPP deployment mounts this ConfigMap at /config and picks one file through
its -configFile flag (.Values.inferenceExtension.configFile). Every key under
`data` becomes one file in that directory.
*/ -}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "gateway-api-inference-extension.name" . | quote }}
  namespace: {{ .Release.Namespace | quote }}
data:
  {{- /*
  Filter-based configuration (the default configFile value points here).
  The decision tree starts at low-queue-filter:
    on success -> lora-affinity-filter -> least-queue-filter -> least-kv-cache-filter
    on failure -> least-queue-filter -> lora-affinity-filter -> least-kv-cache-filter
  and random-picker then selects from the remaining endpoints.
  */}}
  default-plugins.yaml: |
    apiVersion: inference.networking.x-k8s.io/v1alpha1
    kind: EndpointPickerConfig
    plugins:
    - type: low-queue-filter
      parameters:
        threshold: 128
    - type: lora-affinity-filter
      parameters:
        threshold: 0.999
    - type: least-queue-filter
    - type: least-kv-cache-filter
    - type: decision-tree-filter
      name: low-latency-filter
      parameters:
        current:
          pluginRef: low-queue-filter
        nextOnSuccess:
          decisionTree:
            current:
              pluginRef: lora-affinity-filter
            nextOnSuccessOrFailure:
              decisionTree:
                current:
                  pluginRef: least-queue-filter
                nextOnSuccessOrFailure:
                  decisionTree:
                    current:
                      pluginRef: least-kv-cache-filter
        nextOnFailure:
          decisionTree:
            current:
              pluginRef: least-queue-filter
            nextOnSuccessOrFailure:
              decisionTree:
                current:
                  pluginRef: lora-affinity-filter
                nextOnSuccessOrFailure:
                  decisionTree:
                    current:
                      pluginRef: least-kv-cache-filter
    - type: random-picker
      parameters:
        maxNumOfEndpoints: 1
    - type: single-profile-handler
    schedulingProfiles:
    - name: default
      plugins:
      - pluginRef: low-latency-filter
      - pluginRef: random-picker
  {{- /*
  Scorer-based configuration: the queue, kv-cache and prefix-cache scorers are
  each given weight 1 and max-score-picker selects the endpoint.
  */}}
  plugins-v2.yaml: |
    apiVersion: inference.networking.x-k8s.io/v1alpha1
    kind: EndpointPickerConfig
    plugins:
    - type: queue-scorer
    - type: kv-cache-scorer
    - type: prefix-cache-scorer
      parameters:
        hashBlockSize: 64
        maxPrefixBlocksToMatch: 256
        lruCapacityPerServer: 31250
    - type: max-score-picker
      parameters:
        maxNumOfEndpoints: 1
    - type: single-profile-handler
    schedulingProfiles:
    - name: default
      plugins:
      - pluginRef: queue-scorer
        weight: 1
      - pluginRef: kv-cache-scorer
        weight: 1
      - pluginRef: prefix-cache-scorer
        weight: 1
      - pluginRef: max-score-picker
  {{- /*
  User-supplied extra files. `with` (rather than hasKey) skips the block when
  additionalConfigs is present but null or empty, which would otherwise render
  a bare `null` / `{}` line inside `data` and break the manifest.
  */}}
  {{- with .Values.inferenceExtension.additionalConfigs }}
  {{- toYaml . | nindent 2 }}
  {{- end }}

9 changes: 9 additions & 0 deletions config/charts/inferencepool/templates/epp-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ spec:
- "9003"
- -metricsPort
- "9090"
# Plugins configuration file passed to the -configFile flag. The value is
# quoted, like the neighbouring arguments, so an empty or unusual value still
# renders as a YAML string instead of a null list item.
- -configFile
- {{ .Values.inferenceExtension.configFile | quote }}
# https://pkg.go.dev/flag#hdr-Command_line_flag_syntax; space is only for non-bool flags
- "-enablePprof={{ .Values.inferenceExtension.enablePprof }}"
{{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
Expand Down Expand Up @@ -69,3 +71,10 @@ spec:
- name: {{ $key }}
value: {{ $value | quote }}
{{- end }}
# Expose the plugins ConfigMap to the container under /config, the directory
# that the default inferenceExtension.configFile value points into.
volumeMounts:
- name: plugins-config-volume
mountPath: "/config"
# Back the volume with the ConfigMap named by the chart's name helper.
volumes:
- name: plugins-config-volume
configMap:
name: {{ include "gateway-api-inference-extension.name" . }}
17 changes: 17 additions & 0 deletions config/charts/inferencepool/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,23 @@ inferenceExtension:
extProcPort: 9002
env: {}
enablePprof: true # Enable pprof handlers for profiling and debugging
# Path of the plugins configuration file handed to the -configFile flag.
# The chart mounts its ConfigMap at /config, so the file name must match a key
# of that ConfigMap: default-plugins.yaml, plugins-v2.yaml, or a key added
# through additionalConfigs below.
configFile: "/config/default-plugins.yaml"
# Extra files to add to that ConfigMap. Each key is a file name and each value
# is the file's content. Example:
# additionalConfigs:
#   custom-config.yaml: |
#     apiVersion: inference.networking.x-k8s.io/v1alpha1
#     kind: EndpointPickerConfig
#     plugins:
#     - type: custom-scorer
#       parameters:
#         custom-threshold: 64
#     - type: max-score-picker
#     - type: single-profile-handler
#     schedulingProfiles:
#     - name: default
#       plugins:
#       - pluginRef: custom-scorer
#       - pluginRef: max-score-picker

# Example environment variables:
# env:
# KV_CACHE_SCORE_WEIGHT: "1"
Expand Down