fix: first hash of prefix cache with same model name (#1341)

livelxw · web-flow · commit 40fdedbe83fe · 2025-08-25T04:35:06.000-07:00
* fix: first hash of prefix cache with same model name

* fix: no hash if the prompt is smaller than cacheBlockSize

* fix: simplify the if/else to be more concise

* chore: clean test comments
diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go
@@ -263,8 +263,16 @@ func hashPrompt(ctx context.Context, request *types.LLMRequest, cacheBlockSize i
 	// If the last block is smaller than cacheBlockSize, it will be ignored.
 	res := make([]BlockHash, 0, 1+len(prompt)/cacheBlockSize)
 	// Add the model to the first block hash so that different models have different hashes even with the same body.
-	res = append(res, BlockHash(xxhash.Sum64String(request.TargetModel)))
-	for i := 0; i+cacheBlockSize <= len(prompt); i += cacheBlockSize {
+
+	firstBlockSize := cacheBlockSize
+	if len(prompt) < cacheBlockSize {
+		firstBlockSize = len(prompt)
+	}
+	firstBlock := prompt[0:firstBlockSize]
+	firstBlockWithModel := append([]byte(request.TargetModel), firstBlock...)
+	res = append(res, BlockHash(xxhash.Sum64(firstBlockWithModel)))
+
+	for i := cacheBlockSize; i+cacheBlockSize <= len(prompt); i += cacheBlockSize {
 		block := prompt[i : i+cacheBlockSize]
 		prevBlockHash := res[len(res)-1]
 		block = append(block, toBytes(prevBlockHash)...)
diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go
@@ -57,8 +57,8 @@ func TestPrefixPlugin(t *testing.T) {
 	assert.NoError(t, err)
 	t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers)
 	// Input size is 6, hash block size is 4, the last 2 characters are ignored.
-	// Total hashes = 2 (the first one is for the model)
-	assert.Equal(t, 2, len(state.PrefixHashes), "number of hashes is incorrect")
+	// Total hashes = 1 (the first one is for the prefix with model)
+	assert.Equal(t, 1, len(state.PrefixHashes), "number of hashes is incorrect")
 	assert.Equal(t, 0, len(state.PrefixCacheServers), "there shouldn't be any cached servers")
 	assert.Equal(t, float64(0), scores[pod1], "score for pod1")
 	assert.Equal(t, float64(0), scores[pod2], "score for pod2")
@@ -84,8 +84,8 @@ func TestPrefixPlugin(t *testing.T) {
 	assert.NoError(t, err)
 	t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers)
 	// Input size is 6, hash block size is 4, the last 2 characters are ignored.
-	// Total hashes = 2 (the first one is for the model)
-	assert.Equal(t, 2, len(state.PrefixHashes), "number of hashes is incorrect")
+	// Total hashes = 1 (the first one is for the prefix with model)
+	assert.Equal(t, 1, len(state.PrefixHashes), "number of hashes is incorrect")
 	assert.Equal(t, 0, len(state.PrefixCacheServers), "there shouldn't be any cached servers")
 	assert.Equal(t, float64(0), scores[pod1], "score for pod1")
 	assert.Equal(t, float64(0), scores[pod2], "score for pod2")
@@ -110,10 +110,10 @@ func TestPrefixPlugin(t *testing.T) {
 	assert.NoError(t, err)
 	t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers)
 	// Input size is 8, hash block size is 4, so 2 hashes will be calculated.
-	// Total hashes = 3 (the first one is for the model)
-	assert.Equal(t, 3, len(state.PrefixHashes), "number of hashes is incorrect")
+	// Total hashes = 2 (the first one is for the prefix with model)
+	assert.Equal(t, 2, len(state.PrefixHashes), "number of hashes is incorrect")
 	assert.Equal(t, 1, len(state.PrefixCacheServers), "pod1 should have cached the aaaa prefix")
-	assert.Equal(t, float64(2)/float64(3), scores[pod1], "score should be 2/3 - the model and the first prefix block match")
+	assert.Equal(t, 0.5, scores[pod1], "score should be 0.5 - the model and the first prefix block match")
 	assert.Equal(t, float64(0), scores[pod2], "score for pod2")
 
 	schedulingResult = &types.SchedulingResult{
@@ -135,8 +135,8 @@ func TestPrefixPlugin(t *testing.T) {
 	assert.NoError(t, err)
 	t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers)
 	// Input size is 8, hash block size is 4, so 2 hashes will be calculated.
-	// Total hashes = 3 (the first one is for the model)
-	assert.Equal(t, 3, len(state.PrefixHashes), "number of hashes is incorrect")
+	// Total hashes = 2 (the first one is for the prefix with model)
+	assert.Equal(t, 2, len(state.PrefixHashes), "number of hashes is incorrect")
 	assert.Equal(t, 0, len(state.PrefixCacheServers), "pod1 should have cached the aaaa prefix")
 	assert.Equal(t, float64(0), scores[pod1], "score for pod1")
 	assert.Equal(t, float64(0), scores[pod2], "score for pod2")
@@ -160,10 +160,10 @@ func TestPrefixPlugin(t *testing.T) {
 	assert.NoError(t, err)
 	t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers)
 	// Input size is 12, hash block size is 4, so 3 hashes will be calculated.
-	// Total hashes = 4 (the first one is for the model)
-	assert.Equal(t, 4, len(state.PrefixHashes), "number of hashes is incorrect")
+	// Total hashes = 3 (the first one is for the prefix with model)
+	assert.Equal(t, 3, len(state.PrefixHashes), "number of hashes is incorrect")
 	assert.Equal(t, 1, len(state.PrefixCacheServers), "pod1 should have cached the aaaa prefix")
-	assert.Equal(t, 0.75, scores[pod1], "score should be 0.75 - the model and the first 2 prefix blocks match")
+	assert.Equal(t, 2./3, scores[pod1], "score should be 2./3 - the model and the first 2 prefix blocks match")
 	assert.Equal(t, float64(0), scores[pod2], "score for pod2")
 
 	schedulingResult = &types.SchedulingResult{
@@ -224,7 +224,7 @@ func BenchmarkPrefixPluginStress(b *testing.B) {
 		// Second cycle: validate internal state
 		state, err := plugins.ReadPluginStateKey[*SchedulingContextState](plugin.pluginState, req.RequestId, PrefixCachePluginType)
 		assert.NoError(b, err)
-		expectedHashes := int(math.Min(float64(maxPrefixBlocks+1), float64(len(req.Prompt)/blockSize+1))) // the extra one is for the model.
+		expectedHashes := int(math.Min(float64(maxPrefixBlocks), float64(len(req.Prompt)/blockSize)))
 		assert.Equal(b, expectedHashes, len(state.PrefixHashes), "number of hashes is incorrect")
 	}
 }