generalize scheduling cycle state concept (#818)

nirrozenbaum · web-flow · commit bc29bd0138e9 · 2025-05-11T09:27:14.000-07:00
* generalize scheduling cycle state concept

Signed-off-by: Nir Rozenbaum <nirro@il.ibm.com>

* typo

Signed-off-by: Nir Rozenbaum <nirro@il.ibm.com>

* make linter happy

Signed-off-by: Nir Rozenbaum <nirro@il.ibm.com>

* make prefix state struct internal to package instead of public

Signed-off-by: Nir Rozenbaum <nirro@il.ibm.com>

---------

Signed-off-by: Nir Rozenbaum <nirro@il.ibm.com>
diff --git a/pkg/epp/scheduling/plugins/prefix/plugin.go b/pkg/epp/scheduling/plugins/prefix/plugin.go
@@ -78,21 +78,37 @@ type Indexer interface {
 	Add(hashes []BlockHash, server ServerID)
 }
 
+// BlockHash is a hash of the block of request body.
+type BlockHash uint64
+
+type ServerID k8stypes.NamespacedName
+
+func (s ServerID) String() string {
+	return k8stypes.NamespacedName(s).String()
+}
+
+var _ types.StateData = (*schedulingContextState)(nil) // compile-time check that the state implements StateData
+
 // This is the state of this plugin to be used during a scheduling cycle.
-type SchedulingContextState struct {
+type schedulingContextState struct {
 	// PrefixHashes is a list of prefix hashes of the request prompt broken into blocks.
 	PrefixHashes []BlockHash
 	// A map of server to its longest prefix cache match length.
 	PrefixCacheServers map[ServerID]int
 }
 
-// BlockHash is a hash of the block of request body.
-type BlockHash uint64
-
-type ServerID k8stypes.NamespacedName
+func (s *schedulingContextState) Clone() types.StateData {
+	prefixHashes := make([]BlockHash, len(s.PrefixHashes))
+	copy(prefixHashes, s.PrefixHashes)
+	prefixCacheServers := make(map[ServerID]int, len(s.PrefixCacheServers))
+	for key, value := range s.PrefixCacheServers {
+		prefixCacheServers[key] = value
+	}
 
-func (s ServerID) String() string {
-	return k8stypes.NamespacedName(s).String()
+	return &schedulingContextState{
+		PrefixHashes:       prefixHashes,
+		PrefixCacheServers: prefixCacheServers,
+	}
 }
 
 func New(config Config) *Plugin {
@@ -104,31 +120,43 @@ func New(config Config) *Plugin {
 }
 
 func (m *Plugin) Name() string {
-	return "prefixCache"
+	return "prefix-cache"
 }
 
 func (m *Plugin) PreSchedule(ctx *types.SchedulingContext) {
 	hashes := hashPrompt(ctx, m.HashBlockSize, m.MaxPrefixBlocksToMatch)
-	state := SchedulingContextState{
+	state := &schedulingContextState{
 		PrefixHashes:       hashes,
 		PrefixCacheServers: m.matchLongestPrefix(ctx, hashes, DefaultNumServersToMatch),
 	}
-	ctx.SetPluginState(types.PluginName(m.Name()), state)
+
+	ctx.CycleState.Write(types.StateKey(m.Name()), state)
 	ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("PreSchedule, cached servers: %+v", state.PrefixCacheServers), "hashes", state.PrefixHashes)
 }
 
 // If a request was routed to a server, record it in the cache:
 func (m *Plugin) PostSchedule(ctx *types.SchedulingContext, res *types.Result) {
 	targetPod := res.TargetPod.GetPod()
-	state := ctx.GetPluginState(types.PluginName(m.Name())).(SchedulingContextState)
+	state, err := m.getPrefixState(ctx.CycleState)
+	if err != nil {
+		ctx.Logger.Error(err, "failed to read prefix plugin cycle state")
+		return
+	}
 	m.indexer.Add(state.PrefixHashes, ServerID(targetPod.NamespacedName))
 	total := len(state.PrefixHashes)
 	matchLen := state.PrefixCacheServers[ServerID(targetPod.NamespacedName)]
 	metrics.RecordPrefixCacheMatch(matchLen*m.HashBlockSize, total*m.HashBlockSize)
 }
 
 func (m *Plugin) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 {
-	state := ctx.GetPluginState(types.PluginName(m.Name())).(SchedulingContextState)
+	scores := make(map[types.Pod]float64, len(pods))
+
+	state, err := m.getPrefixState(ctx.CycleState)
+	if err != nil {
+		ctx.Logger.Error(err, "failed to read prefix plugin cycle state")
+		return scores
+	}
+
 	total := len(state.PrefixHashes)
 	podScoreFunc := func(pod types.Pod) float64 {
 		if total == 0 {
@@ -138,7 +166,6 @@ func (m *Plugin) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types
 		return float64(matchLen) / float64(total)
 	}
 
-	scores := make(map[types.Pod]float64, len(pods))
 	for _, pod := range pods {
 		scores[pod] = podScoreFunc(pod)
 	}
@@ -170,6 +197,21 @@ func (m *Plugin) matchLongestPrefix(ctx *types.SchedulingContext, hashes []Block
 	return res
 }
 
+// getPrefixState reads this plugin's entry (keyed by the plugin name) from cycleState and
+// returns it as a *schedulingContextState; it errors if the read fails or the type differs.
+func (m *Plugin) getPrefixState(cycleState *types.CycleState) (*schedulingContextState, error) {
+	key := types.StateKey(m.Name())
+	raw, err := cycleState.Read(key)
+	if err != nil {
+		return nil, fmt.Errorf("failed reading %q from CycleState: %w", key, err)
+	}
+
+	if typed, ok := raw.(*schedulingContextState); ok {
+		return typed, nil
+	}
+	return nil, fmt.Errorf("invalid Prefix state, got type %T", raw)
+}
+
 // hashPrompt divides the prompt into blocks and calculate the prefix cache for each block.
 // hash(0) is the hash of the model name, since different models generally don't share prefix cache.
 // For block i, hash(i) = hash(block i content, hash(i-1)).
diff --git a/pkg/epp/scheduling/plugins/prefix/plugin_test.go b/pkg/epp/scheduling/plugins/prefix/plugin_test.go
@@ -30,7 +30,8 @@ func TestPrefixPlugin(t *testing.T) {
 	}
 	ctx := types.NewSchedulingContext(context.Background(), req1, pods)
 	plugin.PreSchedule(ctx)
-	state := ctx.GetPluginState(types.PluginName(plugin.Name())).(SchedulingContextState)
+	state, err := plugin.getPrefixState(ctx.CycleState)
+	assert.NoError(t, err)
 	t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers)
 	// Input size is 6, hash block size is 4, the last 2 characters are ignored.
 	// Total hashes = 2 (the first one is for the model)
@@ -54,7 +55,8 @@ func TestPrefixPlugin(t *testing.T) {
 	}
 	ctx = types.NewSchedulingContext(context.Background(), req2, pods)
 	plugin.PreSchedule(ctx)
-	state = ctx.GetPluginState(types.PluginName(plugin.Name())).(SchedulingContextState)
+	state, err = plugin.getPrefixState(ctx.CycleState)
+	assert.NoError(t, err)
 	t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers)
 	// Input size is 6, hash block size is 4, the last 2 characters are ignored.
 	// Total hashes = 2 (the first one is for the model)
@@ -77,7 +79,8 @@ func TestPrefixPlugin(t *testing.T) {
 	}
 	ctx = types.NewSchedulingContext(context.Background(), req3, pods)
 	plugin.PreSchedule(ctx)
-	state = ctx.GetPluginState(types.PluginName(plugin.Name())).(SchedulingContextState)
+	state, err = plugin.getPrefixState(ctx.CycleState)
+	assert.NoError(t, err)
 	t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers)
 	// Input size is 8, hash block size is 4, so 2 hashes will be calculated.
 	// Total hashes = 3 (the first one is for the model)
@@ -99,7 +102,8 @@ func TestPrefixPlugin(t *testing.T) {
 	}
 	ctx = types.NewSchedulingContext(context.Background(), req4, pods)
 	plugin.PreSchedule(ctx)
-	state = ctx.GetPluginState(types.PluginName(plugin.Name())).(SchedulingContextState)
+	state, err = plugin.getPrefixState(ctx.CycleState)
+	assert.NoError(t, err)
 	t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers)
 	// Input size is 8, hash block size is 4, so 2 hashes will be calculated.
 	// Total hashes = 3 (the first one is for the model)
@@ -121,7 +125,8 @@ func TestPrefixPlugin(t *testing.T) {
 	}
 	ctx = types.NewSchedulingContext(context.Background(), req5, pods)
 	plugin.PreSchedule(ctx)
-	state = ctx.GetPluginState(types.PluginName(plugin.Name())).(SchedulingContextState)
+	state, err = plugin.getPrefixState(ctx.CycleState)
+	assert.NoError(t, err)
 	t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers)
 	// Input size is 12, hash block size is 4, so 3 hashes will be calculated.
 	// Total hashes = 4 (the first one is for the model)
diff --git a/pkg/epp/scheduling/types/cycle_state.go b/pkg/epp/scheduling/types/cycle_state.go
@@ -0,0 +1,92 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package types
+
+import (
+	"errors"
+	"sync"
+)
+
+var (
+	// ErrNotFound is the error returned by CycleState.Read when the key is not present.
+	ErrNotFound = errors.New("not found")
+)
+
+// StateData is the interface implemented by any value stored in CycleState.
+type StateData interface {
+	// Clone returns a copy of this StateData; CycleState.Clone calls it for every stored entry.
+	Clone() StateData
+}
+
+// StateKey is the string type of the keys under which StateData is stored in CycleState.
+type StateKey string
+
+// NewCycleState allocates an empty CycleState and returns a pointer to it.
+func NewCycleState() *CycleState {
+	return new(CycleState)
+}
+
+// CycleState provides a mechanism for plugins to store and retrieve arbitrary data.
+// StateData stored by one plugin can be read, altered, or deleted by another plugin.
+// CycleState does not provide any data protection, as all plugins are assumed to be trusted.
+// Note: CycleState is backed by a sync.Map because it is safe for concurrent use; it is
+// aimed at the "write once and read many times" scenario.
+type CycleState struct {
+	// storage maps StateKey to StateData.
+	storage sync.Map
+}
+
+// Clone creates a copy of CycleState, calling Clone on every stored StateData, and
+// returns its pointer. Clone returns nil if the receiver is nil.
+func (c *CycleState) Clone() *CycleState {
+	if c == nil {
+		return nil
+	}
+	cloned := NewCycleState() // deliberately not named "copy", which would shadow the builtin
+	// Store each value's Clone rather than the value itself, in case of overwriting.
+	c.storage.Range(func(k, v any) bool {
+		cloned.storage.Store(k, v.(StateData).Clone())
+		return true
+	})
+
+	return cloned
+}
+
+// Read looks up "key" in CycleState and returns the stored StateData, or
+// ErrNotFound when nothing is stored under that key.
+// See CycleState for notes on concurrency.
+func (c *CycleState) Read(key StateKey) (StateData, error) {
+	stored, found := c.storage.Load(key)
+	if !found {
+		return nil, ErrNotFound
+	}
+	return stored.(StateData), nil
+}
+
+// Write stores "val" in CycleState under "key", replacing any value already stored there.
+//
+// See CycleState for notes on concurrency.
+func (c *CycleState) Write(key StateKey, val StateData) {
+	c.storage.Store(key, val)
+}
+
+// Delete removes the entry stored under "key" from CycleState, if there is one.
+//
+// See CycleState for notes on concurrency.
+func (c *CycleState) Delete(key StateKey) {
+	c.storage.Delete(key)
+}
diff --git a/pkg/epp/scheduling/types/scheduling_context.go b/pkg/epp/scheduling/types/scheduling_context.go
@@ -0,0 +1,46 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package types
+
+import (
+	"context"
+
+	"github.com/go-logr/logr"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+)
+
+// NewSchedulingContext builds a SchedulingContext for req and pods with a fresh CycleState.
+func NewSchedulingContext(ctx context.Context, req *LLMRequest, pods []Pod) *SchedulingContext {
+	return &SchedulingContext{
+		Context:      ctx,
+		Logger:       log.FromContext(ctx).WithValues("request", req),
+		Req:          req,
+		PodsSnapshot: pods,
+		CycleState:   NewCycleState(),
+	}
+}
+
+// SchedulingContext holds contextual information during a scheduling operation: the
+// embedded context.Context, a logger, the request, a snapshot of pods, and CycleState.
+type SchedulingContext struct {
+	context.Context
+	Logger       logr.Logger
+	Req          *LLMRequest
+	PodsSnapshot []Pod
+	// CycleState lets plugins store state during a scheduling cycle to share it between extension points.
+	CycleState *CycleState
+}
diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go
@@ -17,12 +17,8 @@ limitations under the License.
 package types
 
 import (
-	"context"
 	"fmt"
-	"sync"
 
-	"github.com/go-logr/logr"
-	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
 	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
 )
@@ -57,32 +53,6 @@ type ScoredPod struct {
 	Score float64
 }
 
-// SchedulingContext holds contextual information during a scheduling operation.
-type SchedulingContext struct {
-	context.Context
-	Logger       logr.Logger
-	Req          *LLMRequest
-	PodsSnapshot []Pod
-	// PluginState can be used by plugins to store state during a scheduling cycle, to communicate
-	// between different extension points.
-	PluginState   map[PluginName]any
-	pluginStateMu *sync.RWMutex
-}
-
-func (sc *SchedulingContext) GetPluginState(pluginName PluginName) any {
-	sc.pluginStateMu.RLock()
-	defer sc.pluginStateMu.RUnlock()
-	return sc.PluginState[pluginName]
-}
-
-func (sc *SchedulingContext) SetPluginState(pluginName PluginName, state any) {
-	sc.pluginStateMu.Lock()
-	defer sc.pluginStateMu.Unlock()
-	sc.PluginState[pluginName] = state
-}
-
-type PluginName string
-
 func (pm *PodMetrics) String() string {
 	if pm == nil {
 		return ""
@@ -103,18 +73,6 @@ type PodMetrics struct {
 	*backendmetrics.Metrics
 }
 
-func NewSchedulingContext(ctx context.Context, req *LLMRequest, pods []Pod) *SchedulingContext {
-	logger := log.FromContext(ctx).WithValues("request", req)
-	return &SchedulingContext{
-		Context:       ctx,
-		Logger:        logger,
-		Req:           req,
-		PodsSnapshot:  pods,
-		PluginState:   make(map[PluginName]any),
-		pluginStateMu: &sync.RWMutex{},
-	}
-}
-
 func ToSchedulerPodMetrics(pods []backendmetrics.PodMetrics) []Pod {
 	pm := make([]Pod, 0, len(pods))
 	for _, pod := range pods {