Skip to content

Commit 37f5814

Browse files
Copilot and rootfs committed
Fix conversation chaining: skip model routing when previous_response_id is present
When a Responses API request includes previous_response_id, the router now skips model routing to ensure conversation continuity. This prevents routing subsequent requests to different backend instances that don't have the conversation state. - Added check for previous_response_id in handleResponsesAPIRequest - Skip classification and model routing when conversation is chained - Added test for this behavior (TestHandleResponsesAPIRequest_WithPreviousResponseID) - Updated documentation to explain the limitation and recommended usage Co-authored-by: rootfs <[email protected]>
1 parent 19b5bd1 commit 37f5814

File tree

3 files changed

+77
-1
lines changed

3 files changed

+77
-1
lines changed

src/semantic-router/pkg/extproc/request_handler.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,23 @@ func (r *OpenAIRouter) handleResponsesAPIRequest(v *ext_proc.ProcessingRequest_R
521521
ctx.RequestModel = originalModel
522522
}
523523

524+
// Check if this is a chained conversation (has previous_response_id)
525+
// If so, we cannot change the model as the conversation state is tied to a specific backend instance
526+
hasPreviousResponseID := responsesRequest.PreviousResponseID.Valid() && responsesRequest.PreviousResponseID.Value != ""
527+
if hasPreviousResponseID {
528+
observability.Infof("Responses API - Request has previous_response_id, skipping model routing to maintain conversation continuity")
529+
// Return a pass-through response without model changes
530+
return &ext_proc.ProcessingResponse{
531+
Response: &ext_proc.ProcessingResponse_RequestBody{
532+
RequestBody: &ext_proc.BodyResponse{
533+
Response: &ext_proc.CommonResponse{
534+
Status: ext_proc.CommonResponse_CONTINUE,
535+
},
536+
},
537+
},
538+
}, nil
539+
}
540+
524541
// Get content from input field
525542
userContent, nonUserMessages := extractContentFromResponsesInput(responsesRequest)
526543
observability.Infof("Responses API - Extracted user content length: %d, non-user messages count: %d", len(userContent), len(nonUserMessages))

src/semantic-router/pkg/extproc/responses_api_test.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,3 +429,62 @@ func TestHandleRequestHeaders_ResponsesAPI_ExcludeInputItems(t *testing.T) {
429429
})
430430
}
431431
}
432+
433+
func TestHandleResponsesAPIRequest_WithPreviousResponseID(t *testing.T) {
434+
// Create a test router
435+
cfg := &config.RouterConfig{
436+
VLLMEndpoints: []config.VLLMEndpoint{
437+
{
438+
Name: "primary",
439+
Address: "127.0.0.1",
440+
Port: 8000,
441+
Models: []string{"gpt-4o", "deepseek-v3"},
442+
Weight: 1,
443+
},
444+
},
445+
DefaultModel: "gpt-4o",
446+
}
447+
448+
// Create a minimal cache backend
449+
cacheBackend, _ := cache.NewCacheBackend(cache.CacheConfig{
450+
BackendType: cache.InMemoryCacheType,
451+
Enabled: false,
452+
})
453+
454+
router := &OpenAIRouter{
455+
Config: cfg,
456+
Cache: cacheBackend,
457+
}
458+
459+
// Test with previous_response_id - should NOT change model even with "auto"
460+
requestBody := []byte(`{
461+
"model": "auto",
462+
"input": "Continue from where we left off",
463+
"previous_response_id": "resp_abc123"
464+
}`)
465+
466+
ctx := &RequestContext{
467+
Headers: make(map[string]string),
468+
IsResponsesAPI: true,
469+
OriginalRequestBody: requestBody,
470+
RequestID: "test-request-456",
471+
}
472+
473+
requestBodyMsg := &ext_proc.ProcessingRequest_RequestBody{
474+
RequestBody: &ext_proc.HttpBody{
475+
Body: requestBody,
476+
},
477+
}
478+
479+
response, err := router.handleResponsesAPIRequest(requestBodyMsg, ctx, false)
480+
481+
// Should succeed and return CONTINUE without modifying the request
482+
assert.NoError(t, err)
483+
assert.NotNil(t, response)
484+
assert.NotNil(t, response.GetRequestBody())
485+
assert.Equal(t, ext_proc.CommonResponse_CONTINUE, response.GetRequestBody().Response.Status)
486+
487+
// Should NOT have body mutation (no model change)
488+
assert.Nil(t, response.GetRequestBody().Response.BodyMutation, "Should not modify body when previous_response_id is present")
489+
}
490+

website/docs/api/router.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,7 @@ The router will still perform classification and routing, but the actual executi
359359

360360
- GET `/v1/responses/{id}` requests pass through without modification (no routing or classification)
361361
- POST `/v1/responses` requests go through the full routing pipeline
362-
- The `previous_response_id` parameter is preserved during routing for conversation continuity
362+
- **Conversation Chaining Limitation**: When using `previous_response_id` to chain conversations, the router will **not** change the model to ensure conversation continuity. This is because response state is stored on specific backend instances. For multi-turn conversations, specify a fixed model instead of using "auto", or ensure all backend instances share response storage.
363363
- All Responses API features (tools, reasoning, streaming, background) work transparently through the router
364364

365365
## Routing Headers

0 commit comments

Comments
 (0)