Skip to content

Commit 05ec72c

Browse files
Frapschen authored and liu-cong committed
support vLLM cache salting in prefix aware scorer (kubernetes-sigs#1646)
* support vLLM cache salting in prefix aware scorer

* Apply suggestions from code review

Co-authored-by: Cong Liu <[email protected]>

* fix lint

---------

Co-authored-by: Cong Liu <[email protected]>
1 parent b0fbffb commit 05ec72c

File tree

3 files changed

+59
-1
lines changed

3 files changed

+59
-1
lines changed

pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ func (p *Plugin) matchLongestPrefix(ctx context.Context, hashes []BlockHash) map
263263
}
264264

265265
// hashPrompt divides the prompt into blocks and calculate the prefix cache for each block.
266-
// hash(0) is the hash of the model name, since different models generally don't share prefix cache.
266+
// hash[0] is calculated including the model name and cache_salt(if provided), since different models generally don't share prefix cache.
267267
// For block i, hash(i) = hash(block i content, hash(i-1)).
268268
func hashPrompt(ctx context.Context, request *types.LLMRequest, cacheBlockSize int, maxPrefixBlocks int) []BlockHash {
269269
loggerDebug := log.FromContext(ctx).V(logutil.DEBUG)
@@ -292,6 +292,10 @@ func hashPrompt(ctx context.Context, request *types.LLMRequest, cacheBlockSize i
292292
// Add the model to the first block hash so that different models have different hashes even with the same body.
293293
h := xxhash.New()
294294
_, _ = h.Write([]byte(request.TargetModel))
295+
if cacheSalt := request.Body.CacheSalt(); cacheSalt != "" {
296+
_, _ = h.Write([]byte(cacheSalt))
297+
}
298+
295299
prevBlockHash := BlockHash(h.Sum64())
296300
for i := 0; i+cacheBlockSize <= len(userInput); i += cacheBlockSize {
297301
h.Reset()

pkg/epp/scheduling/types/types.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,13 +73,27 @@ type LLMRequestBody struct {
7373
ChatCompletions *ChatCompletionsRequest `json:"chat_completions,omitempty"`
7474
}
7575

76+
func (r *LLMRequestBody) CacheSalt() string {
77+
if r.ChatCompletions == nil && r.Completions == nil {
78+
return ""
79+
}
80+
81+
if r.ChatCompletions != nil {
82+
return r.ChatCompletions.CacheSalt
83+
}
84+
85+
return r.Completions.CacheSalt
86+
}
87+
7688
// CompletionsRequest is a structured representation of the fields we parse out of the
7789
// /v1/completions request body.
7890
// This struct includes fields usable for plugins and scheduling decisions - and not the entire
7991
// API spec.
8092
type CompletionsRequest struct {
8193
// Prompt is the prompt that was sent in the request body.
8294
Prompt string `json:"prompt,omitempty"`
95+
// CacheSalt is an optional request parameter to isolate prefix caches for security reasons.
96+
CacheSalt string `json:"cache_salt,omitempty"`
8397
}
8498

8599
func (r *CompletionsRequest) String() string {
@@ -105,6 +119,8 @@ type ChatCompletionsRequest struct {
105119
ContinueFinalMessage bool `json:"continue_final_message,omitempty"`
106120
AddGenerationPrompt bool `json:"add_generation_prompt,omitempty"`
107121
ChatTemplateKWArgs map[string]interface{} `json:"chat_template_kwargs,omitempty"`
122+
// CacheSalt is an optional request parameter to isolate prefix caches for security reasons.
123+
CacheSalt string `json:"cache_salt,omitempty"`
108124
}
109125

110126
func (r *ChatCompletionsRequest) String() string {

pkg/epp/util/request/body_test.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,44 @@ func TestExtractRequestData(t *testing.T) {
225225
},
226226
wantErr: true,
227227
},
228+
{
229+
name: "completions request with cache_salt",
230+
body: map[string]any{
231+
"model": "test",
232+
"prompt": "test prompt",
233+
"cache_salt": "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==",
234+
},
235+
want: &types.LLMRequestBody{
236+
Completions: &types.CompletionsRequest{
237+
Prompt: "test prompt",
238+
CacheSalt: "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==",
239+
},
240+
},
241+
},
242+
{
243+
name: "chat completions request with cache_salt",
244+
body: map[string]any{
245+
"model": "test",
246+
"messages": []any{
247+
map[string]any{
248+
"role": "system", "content": "this is a system message",
249+
},
250+
map[string]any{
251+
"role": "user", "content": "hello",
252+
},
253+
},
254+
"cache_salt": "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==",
255+
},
256+
want: &types.LLMRequestBody{
257+
ChatCompletions: &types.ChatCompletionsRequest{
258+
Messages: []types.Message{
259+
{Role: "system", Content: "this is a system message"},
260+
{Role: "user", Content: "hello"},
261+
},
262+
CacheSalt: "Z3V2bmV3aGxza3ZubGFoZ3Zud3V3ZWZ2bmd0b3V2bnZmc2xpZ3RoZ2x2aQ==",
263+
},
264+
},
265+
},
228266
}
229267

230268
for _, tt := range tests {

0 commit comments

Comments
 (0)