kubernetes-sigs
diff --git a/‎pkg/epp/scheduling/framework/plugins.go‎
Lines changed: 7 additions & 5 deletions b/‎pkg/epp/scheduling/framework/plugins.go‎
Lines changed: 7 additions & 5 deletions
diff --git a/‎pkg/epp/scheduling/framework/plugins/filter/decision_tree_filter.go‎
Lines changed: 8 additions & 5 deletions b/‎pkg/epp/scheduling/framework/plugins/filter/decision_tree_filter.go‎
Lines changed: 8 additions & 5 deletions
diff --git a/‎pkg/epp/scheduling/framework/plugins/filter/filter_test.go‎
Lines changed: 3 additions & 6 deletions b/‎pkg/epp/scheduling/framework/plugins/filter/filter_test.go‎
Lines changed: 3 additions & 6 deletions
diff --git a/‎pkg/epp/scheduling/framework/plugins/filter/least_kvcache_filter.go‎
Lines changed: 2 additions & 1 deletion b/‎pkg/epp/scheduling/framework/plugins/filter/least_kvcache_filter.go‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎pkg/epp/scheduling/framework/plugins/filter/least_queue_filter.go‎
Lines changed: 2 additions & 1 deletion b/‎pkg/epp/scheduling/framework/plugins/filter/least_queue_filter.go‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎pkg/epp/scheduling/framework/plugins/filter/lora_affinity_filter.go‎
Lines changed: 4 additions & 3 deletions b/‎pkg/epp/scheduling/framework/plugins/filter/lora_affinity_filter.go‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎pkg/epp/scheduling/framework/plugins/filter/low_queue_filter.go‎
Lines changed: 3 additions & 1 deletion b/‎pkg/epp/scheduling/framework/plugins/filter/low_queue_filter.go‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎pkg/epp/scheduling/framework/plugins/filter/sheddable_capacity_filter.go‎
Lines changed: 4 additions & 2 deletions b/‎pkg/epp/scheduling/framework/plugins/filter/sheddable_capacity_filter.go‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go‎
Lines changed: 30 additions & 25 deletions b/‎pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go‎
Lines changed: 30 additions & 25 deletions
@@ -17,6 +17,8 @@ limitations under the License.
 package framework
 
 import (
+	"context"
+
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
 )
 
@@ -46,31 +48,31 @@ type ProfilePicker interface {
 // Filter defines the interface for filtering a list of pods for a given request.
 type Filter interface {
 	Plugin
-	Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod
+	Filter(ctx context.Context, request *types.LLMRequest, cycleState *types.CycleState, pods []types.Pod) []types.Pod // ctx carries the logger (implementations use log.FromContext); request and cycleState are passed explicitly instead of via SchedulingContext
 }
 
 // Scorer defines the interface for scoring a list of pods for a given request.
 // Scorers must score pods with a value within the range of [0,1] where 1 is the highest score.
 type Scorer interface {
 	Plugin
-	Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64
+	Score(ctx context.Context, request *types.LLMRequest, cycleState *types.CycleState, pods []types.Pod) map[types.Pod]float64 // cycleState lets a scorer store data for later extension points (the prefix plugin writes its state there)
 }
 
 // Picker picks the final pod(s) to send the request to.
 type Picker interface {
 	Plugin
-	Pick(ctx *types.SchedulingContext, scoredPods []*types.ScoredPod) *types.Result
+	Pick(ctx context.Context, cycleState *types.CycleState, scoredPods []*types.ScoredPod) *types.Result // note: unlike Filter and Score, the LLM request itself is not passed to Pick
 }
 
 // PostCycle is called by the scheduler after it selects a targetPod for the request in the SchedulerProfile cycle.
 type PostCycle interface {
 	Plugin
-	PostCycle(ctx *types.SchedulingContext, res *types.Result)
+	PostCycle(ctx context.Context, cycleState *types.CycleState, res *types.Result) // res carries the selection (res.TargetPod); cycleState is the state plugins wrote earlier in the same cycle
 }
 
 // PostResponse is called by the scheduler after a successful response was sent.
 // The targetPod argument is the pod that served the request.
 type PostResponse interface {
 	Plugin
-	PostResponse(ctx *types.SchedulingContext, pod types.Pod)
+	PostResponse(ctx context.Context, response *types.LLMResponse, targetPod types.Pod) // receives the response instead of the scheduling context; no CycleState is passed here
 }
@@ -17,6 +17,9 @@ limitations under the License.
 package filter
 
 import (
+	"context"
+
+	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
@@ -53,9 +56,9 @@ func (f *DecisionTreeFilter) Name() string {
 }
 
 // Filter filters out pods that don't meet the filter criteria.
-func (f *DecisionTreeFilter) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod {
-	loggerTrace := ctx.Logger.V(logutil.TRACE)
-	filteredPod := f.Current.Filter(ctx, pods)
+func (f *DecisionTreeFilter) Filter(ctx context.Context, request *types.LLMRequest, cycleState *types.CycleState, pods []types.Pod) []types.Pod {
+	loggerTrace := log.FromContext(ctx).V(logutil.TRACE)
+	filteredPod := f.Current.Filter(ctx, request, cycleState, pods)
 
 	next := f.NextOnSuccessOrFailure
 	if len(filteredPod) > 0 {
@@ -68,7 +71,7 @@ func (f *DecisionTreeFilter) Filter(ctx *types.SchedulingContext, pods []types.P
 		}
 		loggerTrace.Info("Filter succeeded", "filter", f.Name(), "next", next.Name(), "filteredPodCount", len(filteredPod))
 		// On success, pass the filtered result to the next filter.
-		return next.Filter(ctx, filteredPod)
+		return next.Filter(ctx, request, cycleState, filteredPod)
 	} else {
 		if f.NextOnFailure == nil && f.NextOnSuccessOrFailure == nil {
 			// No succeeding filters to run, return.
@@ -79,6 +82,6 @@ func (f *DecisionTreeFilter) Filter(ctx *types.SchedulingContext, pods []types.P
 		}
 		loggerTrace.Info("Filter failed", "filter", f.Name(), "next", next.Name())
 		// On failure, pass the initial set of pods to the next filter.
-		return next.Filter(ctx, pods)
+		return next.Filter(ctx, request, cycleState, pods)
 	}
 }
@@ -39,7 +39,7 @@ func (f *filterAll) Name() string {
 	return "filter all"
 }
 
-func (f *filterAll) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod {
+func (f *filterAll) Filter(_ context.Context, _ *types.LLMRequest, _ *types.CycleState, pods []types.Pod) []types.Pod {
 	return []types.Pod{} // drops every pod: always returns an empty (non-nil) slice, ignoring all inputs
 }
 
@@ -174,8 +174,7 @@ func TestFilter(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			ctx := types.NewSchedulingContext(context.Background(), test.req, nil, test.input)
-			got := test.filter.Filter(ctx, test.input)
+			got := test.filter.Filter(context.Background(), test.req, types.NewCycleState(), test.input)
 
 			if diff := cmp.Diff(test.output, got); diff != "" {
 				t.Errorf("Unexpected output (-want +got): %v", diff)
@@ -231,8 +230,6 @@ func TestLoRASoftAffinityDistribution(t *testing.T) {
 			},
 		},
 	}
-	ctx := types.NewSchedulingContext(context.Background(), req, nil, pods)
-
 	// Run the filter function multiple times and count the results
 	affinityCount := 0
 	availableCount := 0
@@ -245,7 +242,7 @@ func TestLoRASoftAffinityDistribution(t *testing.T) {
 	LoraAffinityFilter := NewLoraAffinityFilter()
 
 	for i := 0; i < numIterations; i++ {
-		result := LoraAffinityFilter.Filter(ctx, pods)
+		result := LoraAffinityFilter.Filter(context.Background(), req, types.NewCycleState(), pods)
 
 		// Check which type of pod was returned
 		if len(result) != 1 {
 
@@ -17,6 +17,7 @@ limitations under the License.
 package filter
 
 import (
+	"context"
 	"math"
 
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
@@ -44,7 +45,7 @@ func (f *LeastKVCacheFilter) Name() string {
 }
 
 // Filter filters out pods that don't meet the filter criteria.
-func (f *LeastKVCacheFilter) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod {
+func (f *LeastKVCacheFilter) Filter(_ context.Context, _ *types.LLMRequest, _ *types.CycleState, pods []types.Pod) []types.Pod {
 	filteredPods := []types.Pod{}
 
 	min := math.MaxFloat64
 
@@ -17,6 +17,7 @@ limitations under the License.
 package filter
 
 import (
+	"context"
 	"math"
 
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
@@ -44,7 +45,7 @@ func (f *LeastQueueFilter) Name() string {
 }
 
 // Filter filters out pods that don't meet the filter criteria.
-func (f *LeastQueueFilter) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod {
+func (f *LeastQueueFilter) Filter(_ context.Context, _ *types.LLMRequest, _ *types.CycleState, pods []types.Pod) []types.Pod {
 	filteredPods := []types.Pod{}
 
 	min := math.MaxInt
 
@@ -17,6 +17,7 @@ limitations under the License.
 package filter
 
 import (
+	"context"
 	"math/rand"
 	"time"
 
@@ -52,15 +53,15 @@ func (f *LoraAffinityFilter) Name() string {
 }
 
 // Filter filters out pods that don't meet the filter criteria.
-func (f *LoraAffinityFilter) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod {
+func (f *LoraAffinityFilter) Filter(_ context.Context, request *types.LLMRequest, _ *types.CycleState, pods []types.Pod) []types.Pod {
 	// Pre-allocate slices with estimated capacity
 	filtered_affinity := make([]types.Pod, 0, len(pods))
 	filtered_available := make([]types.Pod, 0, len(pods))
 
 	// Categorize pods based on affinity and availability
 	for _, pod := range pods {
-		_, active := pod.GetMetrics().ActiveModels[ctx.Req.TargetModel]
-		_, waiting := pod.GetMetrics().WaitingModels[ctx.Req.TargetModel]
+		_, active := pod.GetMetrics().ActiveModels[request.TargetModel]
+		_, waiting := pod.GetMetrics().WaitingModels[request.TargetModel]
 
 		if active || waiting {
 			filtered_affinity = append(filtered_affinity, pod)
 
@@ -17,6 +17,8 @@ limitations under the License.
 package filter
 
 import (
+	"context"
+
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/config"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
@@ -43,7 +45,7 @@ func (f *LowQueueFilter) Name() string {
 }
 
 // Filter filters out pods that don't meet the filter criteria.
-func (f *LowQueueFilter) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod {
+func (f *LowQueueFilter) Filter(_ context.Context, _ *types.LLMRequest, _ *types.CycleState, pods []types.Pod) []types.Pod {
 	filteredPods := []types.Pod{}
 
 	for _, pod := range pods {
 
@@ -17,6 +17,8 @@ limitations under the License.
 package filter
 
 import (
+	"context"
+
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/config"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
@@ -45,8 +47,8 @@ func (f *SheddableCapacityFilter) Name() string {
 }
 
 // Filter filters out pods that don't meet the filter criteria.
-func (f *SheddableCapacityFilter) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod {
-	if ctx.Req.Critical {
+func (f *SheddableCapacityFilter) Filter(_ context.Context, request *types.LLMRequest, _ *types.CycleState, pods []types.Pod) []types.Pod {
+	if request.Critical {
 		return pods // // Allow all pods to passthrough if the request is critical, even if all pods reach their capacity.
 	}
 
 
@@ -17,11 +17,13 @@ limitations under the License.
 package prefix
 
 import (
+	"context"
 	"encoding/binary"
 	"fmt"
 
 	"github.com/cespare/xxhash/v2"
 	k8stypes "k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
@@ -131,24 +133,11 @@ func (m *Plugin) Name() string {
 	return "prefix-cache"
 }
 
-// PostCycle records in the plugin cache the result of the scheduling selection.
-func (m *Plugin) PostCycle(ctx *types.SchedulingContext, res *types.Result) {
-	targetPod := res.TargetPod.GetPod()
-	state, err := m.getPrefixState(ctx.CycleState)
-	if err != nil {
-		ctx.Logger.Error(err, "failed to read prefix plugin cycle state")
-		return
-	}
-	m.indexer.Add(state.PrefixHashes, ServerID(targetPod.NamespacedName))
-	total := len(state.PrefixHashes)
-	matchLen := state.PrefixCacheServers[ServerID(targetPod.NamespacedName)]
-	metrics.RecordPrefixCacheMatch(matchLen*m.HashBlockSize, total*m.HashBlockSize)
-}
-
 // Score returns the scoring result for the given list of pods based on context.
-func (m *Plugin) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 {
+func (m *Plugin) Score(ctx context.Context, request *types.LLMRequest, cycleState *types.CycleState, pods []types.Pod) map[types.Pod]float64 {
+	loggerTrace := log.FromContext(ctx).V(logutil.TRACE)
 	// pre score step, hashing prompt and find longest prefix match.
-	hashes := hashPrompt(ctx, m.HashBlockSize, m.MaxPrefixBlocksToMatch)
+	hashes := hashPrompt(ctx, request, m.HashBlockSize, m.MaxPrefixBlocksToMatch)
 	numServers := DefaultNumServersToMatch
 	if numServers > len(pods) {
 		numServers = len(pods)
@@ -157,8 +146,8 @@ func (m *Plugin) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types
 		PrefixHashes:       hashes,
 		PrefixCacheServers: m.matchLongestPrefix(ctx, hashes, numServers),
 	}
-	ctx.CycleState.Write(types.StateKey(m.Name()), state)
-	ctx.Logger.V(logutil.TRACE).Info(fmt.Sprintf("cached servers: %+v", state.PrefixCacheServers), "hashes", state.PrefixHashes)
+	cycleState.Write(types.StateKey(m.Name()), state)
+	loggerTrace.Info(fmt.Sprintf("cached servers: %+v", state.PrefixCacheServers), "hashes", state.PrefixHashes)
 	// calculate the scores of pods
 	scores := make(map[types.Pod]float64, len(pods))
 
@@ -177,16 +166,31 @@ func (m *Plugin) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types
 	return scores
 }
 
+// PostCycle records in the plugin cache the result of the scheduling selection: it adds the request's prefix hashes to the indexer under the selected pod and reports the prefix-cache match metric.
+func (m *Plugin) PostCycle(ctx context.Context, cycleState *types.CycleState, res *types.Result) {
+	targetPod := res.TargetPod.GetPod() // NOTE(review): res and res.TargetPod are dereferenced without a nil check — confirm callers always supply a selected pod
+	state, err := m.getPrefixState(cycleState) // presumably the state Score wrote into cycleState under the plugin's name — verify against getPrefixState
+	if err != nil {
+		log.FromContext(ctx).Error(err, "failed to read prefix plugin cycle state")
+		return // without the cycle state there is nothing to index or report
+	}
+	m.indexer.Add(state.PrefixHashes, ServerID(targetPod.NamespacedName)) // the selected pod is keyed by its namespaced name
+	total := len(state.PrefixHashes)
+	matchLen := state.PrefixCacheServers[ServerID(targetPod.NamespacedName)] // map lookup: zero when the selected pod had no recorded prefix match
+	metrics.RecordPrefixCacheMatch(matchLen*m.HashBlockSize, total*m.HashBlockSize) // both counts are in blocks, scaled here by HashBlockSize
+}
+
 // matchLongestPrefix returns a map of servers and length of prefix that each server caches.
-func (m *Plugin) matchLongestPrefix(ctx *types.SchedulingContext, hashes []BlockHash, numServers int) map[ServerID]int {
+func (m *Plugin) matchLongestPrefix(ctx context.Context, hashes []BlockHash, numServers int) map[ServerID]int {
+	loggerTrace := log.FromContext(ctx).V(logutil.TRACE)
 	res := make(map[ServerID]int)
 	// Use a greedy strategy to search from the longest prefix.
 	// NOTE: It's possible to further optimize this with a binary search.
 	for i := len(hashes) - 1; i >= 0 && len(res) < numServers; i-- {
 		hash := hashes[i]
 		cachedServers := m.indexer.Get(hash)
 		if len(cachedServers) > 0 {
-			ctx.Logger.V(logutil.TRACE).Info("Found cached servers", "cachedServers", cachedServers, "total # blocks", len(hashes), "longest prefix", i)
+			loggerTrace.Info("Found cached servers", "cachedServers", cachedServers, "total # blocks", len(hashes), "longest prefix", i)
 			for server := range cachedServers {
 				// Update servers with their longest prefix match.
 				// If we already found this server with longer prefix match, don't update it.
@@ -218,21 +222,22 @@ func (m *Plugin) getPrefixState(cycleState *types.CycleState) (*schedulingContex
 // hashPrompt divides the prompt into blocks and calculate the prefix cache for each block.
 // hash(0) is the hash of the model name, since different models generally don't share prefix cache.
 // For block i, hash(i) = hash(block i content, hash(i-1)).
-func hashPrompt(ctx *types.SchedulingContext, cacheBlockSize int, maxPrefixBlocks int) []BlockHash {
-	prompt := []byte(ctx.Req.Prompt)
+func hashPrompt(ctx context.Context, request *types.LLMRequest, cacheBlockSize int, maxPrefixBlocks int) []BlockHash {
+	loggerDebug := log.FromContext(ctx).V(logutil.DEBUG)
+	prompt := []byte(request.Prompt)
 	if len(prompt) < cacheBlockSize {
-		ctx.Logger.V(logutil.DEBUG).Info("Request body too small for prefix cache", "size", len(prompt), "block size", cacheBlockSize)
+		loggerDebug.Info("Request body too small for prefix cache", "size", len(prompt), "block size", cacheBlockSize)
 		return nil
 	}
 	if len(prompt) > cacheBlockSize*maxPrefixBlocks {
-		ctx.Logger.V(logutil.DEBUG).Info("Truncating input", "size", len(prompt), "max prefix blocks", maxPrefixBlocks, "block size", cacheBlockSize)
+		loggerDebug.Info("Truncating input", "size", len(prompt), "max prefix blocks", maxPrefixBlocks, "block size", cacheBlockSize)
 		prompt = prompt[:maxPrefixBlocks*cacheBlockSize]
 	}
 	// Split the body into blocks of size cacheBlockSize. The +1 is to account for the model.
 	// If the last block is smaller than cacheBlockSize, it will be ignored.
 	res := make([]BlockHash, 0, 1+len(prompt)/cacheBlockSize)
 	// Add the model to the first block hash so that different models have different hashes even with the same body.
-	res = append(res, BlockHash(xxhash.Sum64String(ctx.Req.TargetModel)))
+	res = append(res, BlockHash(xxhash.Sum64String(request.TargetModel)))
 	for i := 0; i+cacheBlockSize <= len(prompt); i += cacheBlockSize {
 		block := prompt[i : i+cacheBlockSize]
 		prevBlockHash := res[len(res)-1]