
Commit 23a3787

Remove redundant SheddableCapacityFilter.
Admission control and capacity management are now handled in `requestcontrol.Director.PreDispatch` (and will soon be absorbed into the new Flow Controller), so this check is no longer a responsibility of the scheduling framework; it is already applied in kubernetes-sigs#805 before the scheduling layer is invoked.

This is not a no-op change. Previously, the `SheddableCapacityFilter` not only dropped sheddable requests when the system was at capacity, it also strictly filtered the pods that the rest of the scheduling plugins would consider as input. This change removes that strict filtering: as long as the system is not saturated, all pods are considered, so sheddable requests now follow the same scheduling path as critical requests, provided they are not dropped by the saturation-detection check in `PreDispatch`.
1 parent 1d6a81f commit 23a3787
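
To make the new ordering concrete, here is a minimal Go sketch of the flow described above. The names (`llmRequest`, `preDispatch`, `saturated`, `errDroppedSheddable`) are hypothetical stand-ins, not the actual `requestcontrol.Director.PreDispatch` or `saturationdetector` APIs, which are not part of this diff:

package main

import (
	"errors"
	"fmt"
)

// Hypothetical stand-ins; the real request type and saturation detector live
// in the epp packages and are not shown in this diff.
type llmRequest struct {
	TargetModel string
	Critical    bool
}

var errDroppedSheddable = errors.New("system saturated, sheddable request dropped")

// preDispatch sketches the ordering described in the commit message: the
// saturation check runs before the scheduler is invoked, and only sheddable
// (non-critical) requests can be dropped by it.
func preDispatch(req llmRequest, saturated bool) error {
	if !req.Critical && saturated {
		return errDroppedSheddable
	}
	// Critical requests, and sheddable requests on an unsaturated system,
	// continue to scheduling and see the full, unfiltered pod list.
	return nil
}

func main() {
	fmt.Println(preDispatch(llmRequest{TargetModel: "sheddable"}, true))  // dropped
	fmt.Println(preDispatch(llmRequest{TargetModel: "sheddable"}, false)) // <nil>
}

The point is only the ordering: saturation is evaluated once, before scheduling, and the scheduler itself no longer removes pods on capacity grounds.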

File tree

cmd/epp/main.go
pkg/epp/scheduling/framework/plugins/filter/filter_test.go
pkg/epp/scheduling/framework/plugins/filter/sheddable_capacity_filter.go
pkg/epp/scheduling/scheduler.go
pkg/epp/scheduling/scheduler_test.go

5 files changed: +3 / -219 lines

cmd/epp/main.go

Lines changed: 0 additions & 2 deletions
@@ -43,7 +43,6 @@ import (
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/filter"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
 	profilepicker "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile-picker"
@@ -208,7 +207,6 @@ func run() error {
 	kvCacheScorerWeight := envutil.GetEnvInt("KV_CACHE_SCORE_WEIGHT", scorer.DefaultKVCacheScorerWeight, setupLog)

 	schedulerProfile := framework.NewSchedulerProfile().
-		WithFilters(filter.NewSheddableCapacityFilter()).
 		WithScorers(framework.NewWeightedScorer(&scorer.QueueScorer{}, queueScorerWeight),
			framework.NewWeightedScorer(&scorer.KVCacheScorer{}, kvCacheScorerWeight)).
 		WithPicker(picker.NewMaxScorePicker())

pkg/epp/scheduling/framework/plugins/filter/filter_test.go

Lines changed: 1 addition & 37 deletions
@@ -134,42 +134,6 @@ func TestFilter(t *testing.T) {
 				},
 			},
 		},
-		{
-			name:   "SheddableCapacityFilter, sheddable request",
-			req:    &types.LLMRequest{Critical: false},
-			filter: &SheddableCapacityFilter{queueThreshold: 0, kvCacheThreshold: 0.8},
-			input: []types.Pod{
-				&types.PodMetrics{
-					// This pod should be returned.
-					MetricsState: &backendmetrics.MetricsState{
-						WaitingQueueSize:    0,
-						KVCacheUsagePercent: 0,
-					},
-				},
-				&types.PodMetrics{
-					// Queue is non zero, despite low kv cache, should not return.
-					MetricsState: &backendmetrics.MetricsState{
-						WaitingQueueSize:    1,
-						KVCacheUsagePercent: 0.3,
-					},
-				},
-				&types.PodMetrics{
-					// High kv cache despite zero queue, should not return
-					MetricsState: &backendmetrics.MetricsState{
-						WaitingQueueSize:    0,
-						KVCacheUsagePercent: 1.0,
-					},
-				},
-			},
-			output: []types.Pod{
-				&types.PodMetrics{
-					MetricsState: &backendmetrics.MetricsState{
-						WaitingQueueSize:    0,
-						KVCacheUsagePercent: 0,
-					},
-				},
-			},
-		},
 	}

 	for _, test := range tests {
@@ -241,7 +205,7 @@ func TestLoRASoftAffinityDistribution(t *testing.T) {
 	// initialize LoraAffinityFilter
 	LoraAffinityFilter := NewLoraAffinityFilter()

-	for i := 0; i < numIterations; i++ {
+	for range numIterations {
 		result := LoraAffinityFilter.Filter(context.Background(), req, types.NewCycleState(), pods)

 		// Check which type of pod was returned

pkg/epp/scheduling/framework/plugins/filter/sheddable_capacity_filter.go

Lines changed: 0 additions & 64 deletions
This file was deleted.
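
The deleted filter's source is not shown here, but the test case removed from filter_test.go above pins down its behaviour: for non-critical (sheddable) requests it kept only pods whose waiting queue size and KV-cache usage were at or below its thresholds, and a request with no qualifying pods was shed. A rough, self-contained Go sketch of that behaviour, using approximate names (`podMetrics`, `sheddableCapacityFilter.filter`) rather than the deleted file's exact types:

package main

import "fmt"

// podMetrics is a stand-in for the framework's pod/metrics types; the real
// filter operated on types.Pod and backendmetrics.MetricsState.
type podMetrics struct {
	Name                string
	WaitingQueueSize    int
	KVCacheUsagePercent float64
}

// sheddableCapacityFilter approximates the deleted filter as exercised by the
// removed test: thresholds of 0 queued requests and 0.8 KV-cache usage.
type sheddableCapacityFilter struct {
	queueThreshold   int
	kvCacheThreshold float64
}

// filter keeps every pod for critical requests; for sheddable requests it
// keeps only pods with spare capacity. An empty result meant the request was
// shed by the scheduler.
func (f *sheddableCapacityFilter) filter(critical bool, pods []podMetrics) []podMetrics {
	if critical {
		return pods
	}
	kept := make([]podMetrics, 0, len(pods))
	for _, p := range pods {
		if p.WaitingQueueSize <= f.queueThreshold && p.KVCacheUsagePercent <= f.kvCacheThreshold {
			kept = append(kept, p)
		}
	}
	return kept
}

func main() {
	f := &sheddableCapacityFilter{queueThreshold: 0, kvCacheThreshold: 0.8}
	pods := []podMetrics{
		{Name: "pod1", WaitingQueueSize: 0, KVCacheUsagePercent: 0},   // kept
		{Name: "pod2", WaitingQueueSize: 1, KVCacheUsagePercent: 0.3}, // queued: dropped
		{Name: "pod3", WaitingQueueSize: 0, KVCacheUsagePercent: 1.0}, // KV cache full: dropped
	}
	fmt.Println(f.filter(false, pods)) // only pod1 survives
}

Under the new arrangement this per-pod pruning is gone; only the request-level saturation check in `PreDispatch` remains.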

pkg/epp/scheduling/scheduler.go

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ func NewScheduler(datastore Datastore) *Scheduler {
 	}

 	defaultProfile := framework.NewSchedulerProfile().
-		WithFilters(filter.NewSheddableCapacityFilter(), lowLatencyFilter).
+		WithFilters(lowLatencyFilter).
 		WithPicker(&picker.RandomPicker{})

 	profilePicker := profilepicker.NewAllProfilesPicker()

pkg/epp/scheduling/scheduler_test.go

Lines changed: 1 addition & 115 deletions
@@ -51,7 +51,7 @@ func TestSchedule(t *testing.T) {
 			err: true,
 		},
 		{
-			name: "critical request",
+			name: "finds optimal pod",
 			req: &types.LLMRequest{
 				TargetModel: "critical",
 				RequestId:   uuid.NewString(),
@@ -116,120 +116,6 @@ func TestSchedule(t *testing.T) {
 				},
 			},
 		},
-		{
-			name: "sheddable request, accepted",
-			req: &types.LLMRequest{
-				TargetModel: "sheddable",
-				RequestId:   uuid.NewString(),
-				Critical:    false,
-			},
-			// pod1 will be picked because it has capacity for the sheddable request.
-			input: []*backendmetrics.FakePodMetrics{
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    0,
-						KVCacheUsagePercent: 0.2,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo": 1,
-							"bar": 1,
-						},
-					},
-				},
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    3,
-						KVCacheUsagePercent: 0.1,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo":      1,
-							"critical": 1,
-						},
-					},
-				},
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    10,
-						KVCacheUsagePercent: 0.2,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo": 1,
-						},
-					},
-				},
-			},
-			wantRes: map[string]*types.Result{
-				"default": {
-					TargetPod: &types.ScoredPod{
-						Pod: &types.PodMetrics{
-							Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}, Labels: make(map[string]string)},
-							MetricsState: &backendmetrics.MetricsState{
-								WaitingQueueSize:    0,
-								KVCacheUsagePercent: 0.2,
-								MaxActiveModels:     2,
-								ActiveModels: map[string]int{
-									"foo": 1,
-									"bar": 1,
-								},
-								WaitingModels: map[string]int{},
-							},
-						},
-					},
-				},
-			},
-		},
-		{
-			name: "sheddable request, dropped",
-			req: &types.LLMRequest{
-				TargetModel: "sheddable",
-				RequestId:   uuid.NewString(),
-				Critical:    false,
-			},
-			// All pods have higher KV cache thant the threshold, so the sheddable request will be
-			// dropped.
-			input: []*backendmetrics.FakePodMetrics{
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    10,
-						KVCacheUsagePercent: 0.9,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo": 1,
-							"bar": 1,
-						},
-					},
-				},
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    3,
-						KVCacheUsagePercent: 0.85,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo":      1,
-							"critical": 1,
-						},
-					},
-				},
-				{
-					Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}},
-					Metrics: &backendmetrics.MetricsState{
-						WaitingQueueSize:    10,
-						KVCacheUsagePercent: 0.85,
-						MaxActiveModels:     2,
-						ActiveModels: map[string]int{
-							"foo": 1,
-						},
-					},
-				},
-			},
-			wantRes: nil,
-			err:     true,
-		},
 	}

 	for _, test := range tests {
