This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Commit 29f143b

Use the client type when streaming the data to the client, not when executing completion
We used to special-case Ollama stream generation by passing the client type to execute_completion. Instead, let's pass the client type to the place that needs the special casing, using the recently introduced client type enum. Related: #830
1 parent 963348e commit 29f143b
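
For readers outside this codebase: the diff relies on a `ClientType` enum in `codegate.clients.clients` and a `DetectClient` route decorator in `codegate.clients.detector` that stores the detected client on `request.state.detected_client`. A minimal sketch of that plumbing follows; the enum values and the detection heuristic are illustrative assumptions, not the repository's actual implementation.

```python
from enum import Enum
from functools import wraps

from fastapi import Request


class ClientType(Enum):
    # Members referenced in this diff; the string values are assumptions.
    GENERIC = "generic"
    CLINE = "cline"
    KODU = "kodu"


class DetectClient:
    """Sketch of a decorator that tags each request with the calling client."""

    def __call__(self, func):
        @wraps(func)
        async def wrapper(request: Request, *args, **kwargs):
            # The real detector presumably inspects headers/user-agent;
            # defaulting to GENERIC keeps this sketch self-contained.
            request.state.detected_client = ClientType.GENERIC
            return await func(request, *args, **kwargs)

        return wrapper
```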

File tree: 11 files changed (+128 / -37 lines)


src/codegate/providers/anthropic/completion_handler.py

Lines changed: 0 additions & 1 deletion
@@ -16,7 +16,6 @@ async def execute_completion(
         api_key: Optional[str],
         stream: bool = False,
         is_fim_request: bool = False,
-        base_tool: Optional[str] = "",
     ) -> Union[ModelResponse, AsyncIterator[ModelResponse]]:
         """
         Ensures the model name is prefixed with 'anthropic/' to explicitly route to Anthropic's API.

src/codegate/providers/anthropic/provider.py

Lines changed: 17 additions & 3 deletions
@@ -5,6 +5,8 @@
 import structlog
 from fastapi import Header, HTTPException, Request
 
+from codegate.clients.clients import ClientType
+from codegate.clients.detector import DetectClient
 from codegate.pipeline.factory import PipelineFactory
 from codegate.providers.anthropic.adapter import AnthropicInputNormalizer, AnthropicOutputNormalizer
 from codegate.providers.anthropic.completion_handler import AnthropicCompletion
@@ -51,7 +53,13 @@ def models(self, endpoint: str = None, api_key: str = None) -> List[str]:
 
         return [model["id"] for model in respjson.get("data", [])]
 
-    async def process_request(self, data: dict, api_key: str, request_url_path: str):
+    async def process_request(
+        self,
+        data: dict,
+        api_key: str,
+        request_url_path: str,
+        client_type: ClientType,
+    ):
         is_fim_request = self._is_fim_request(request_url_path, data)
         try:
             stream = await self.complete(data, api_key, is_fim_request)
@@ -65,7 +73,7 @@ async def process_request(self, data: dict, api_key: str, request_url_path: str)
             else:
                 # just continue raising the exception
                 raise e
-        return self._completion_handler.create_response(stream)
+        return self._completion_handler.create_response(stream, client_type)
 
     def _setup_routes(self):
         """
@@ -80,6 +88,7 @@ def _setup_routes(self):
 
         @self.router.post(f"/{self.provider_route_name}/messages")
         @self.router.post(f"/{self.provider_route_name}/v1/messages")
+        @DetectClient()
         async def create_message(
             request: Request,
             x_api_key: str = Header(None),
@@ -90,4 +99,9 @@ async def create_message(
             body = await request.body()
             data = json.loads(body)
 
-            return await self.process_request(data, x_api_key, request.url.path)
+            return await self.process_request(
+                data,
+                x_api_key,
+                request.url.path,
+                request.state.detected_client,
+            )

src/codegate/providers/base.py

Lines changed: 8 additions & 5 deletions
@@ -10,6 +10,7 @@
 from litellm import ModelResponse
 from litellm.types.llms.openai import ChatCompletionRequest
 
+from codegate.clients.clients import ClientType
 from codegate.codegate_logging import setup_logging
 from codegate.db.connection import DbRecorder
 from codegate.pipeline.base import (
@@ -22,7 +23,6 @@
 from codegate.providers.formatting.input_pipeline import PipelineResponseFormatter
 from codegate.providers.normalizer.base import ModelInputNormalizer, ModelOutputNormalizer
 from codegate.providers.normalizer.completion import CompletionNormalizer
-from codegate.utils.utils import get_tool_name_from_messages
 
 setup_logging()
 logger = structlog.get_logger("codegate")
@@ -74,7 +74,13 @@ def models(self, endpoint, str=None, api_key: str = None) -> List[str]:
         pass
 
     @abstractmethod
-    async def process_request(self, data: dict, api_key: str, request_url_path: str):
+    async def process_request(
+        self,
+        data: dict,
+        api_key: str,
+        request_url_path: str,
+        client_type: ClientType,
+    ):
         pass
 
     @property
@@ -287,14 +293,11 @@ async def complete(
         # Execute the completion and translate the response
         # This gives us either a single response or a stream of responses
         # based on the streaming flag
-        base_tool = get_tool_name_from_messages(data)
-
         model_response = await self._completion_handler.execute_completion(
             provider_request,
             api_key=api_key,
             stream=streaming,
             is_fim_request=is_fim_request,
-            base_tool=base_tool,
         )
         if not streaming:
             normalized_response = self._output_normalizer.normalize(model_response)

src/codegate/providers/completion/base.py

Lines changed: 13 additions & 4 deletions
@@ -6,6 +6,8 @@
 from fastapi.responses import JSONResponse, StreamingResponse
 from litellm import ChatCompletionRequest, ModelResponse
 
+from codegate.clients.clients import ClientType
+
 
 class BaseCompletionHandler(ABC):
     """
@@ -20,20 +22,27 @@ async def execute_completion(
         api_key: Optional[str],
         stream: bool = False,  # TODO: remove this param?
         is_fim_request: bool = False,
-        base_tool: Optional[str] = "",
     ) -> Union[ModelResponse, AsyncIterator[ModelResponse]]:
         """Execute the completion request"""
         pass
 
     @abstractmethod
-    def _create_streaming_response(self, stream: AsyncIterator[Any]) -> StreamingResponse:
+    def _create_streaming_response(
+        self,
+        stream: AsyncIterator[Any],
+        client_type: ClientType = ClientType.GENERIC,
+    ) -> StreamingResponse:
         pass
 
     @abstractmethod
     def _create_json_response(self, response: Any) -> JSONResponse:
         pass
 
-    def create_response(self, response: Any) -> Union[JSONResponse, StreamingResponse]:
+    def create_response(
+        self,
+        response: Any,
+        client_type: ClientType,
+    ) -> Union[JSONResponse, StreamingResponse]:
         """
         Create a FastAPI response from the completion response.
         """
@@ -42,5 +51,5 @@ def create_response(self, response: Any) -> Union[JSONResponse, StreamingRespons
             or isinstance(response, AsyncIterator)
             or inspect.isasyncgen(response)
         ):
-            return self._create_streaming_response(response)
+            return self._create_streaming_response(response, client_type)
         return self._create_json_response(response)
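
To see what the updated contract asks of a concrete handler, here is a hypothetical minimal subclass (not from the repo; it assumes the module path matches the file path above and that `BaseCompletionHandler` needs no constructor arguments). The point is that `execute_completion` no longer takes `base_tool`, while the client type arrives at `_create_streaming_response`.

```python
from typing import Any, AsyncIterator, Optional, Union

from fastapi.responses import JSONResponse, StreamingResponse
from litellm import ChatCompletionRequest, ModelResponse

from codegate.clients.clients import ClientType
from codegate.providers.completion.base import BaseCompletionHandler


class EchoCompletionHandler(BaseCompletionHandler):
    async def execute_completion(
        self,
        request: ChatCompletionRequest,
        api_key: Optional[str],
        stream: bool = False,
        is_fim_request: bool = False,
    ) -> Union[ModelResponse, AsyncIterator[ModelResponse]]:
        # No base_tool parameter anymore; client specifics are handled later.
        ...

    def _create_streaming_response(
        self,
        stream: AsyncIterator[Any],
        client_type: ClientType = ClientType.GENERIC,
    ) -> StreamingResponse:
        # The client type is threaded in by create_response(stream, client_type).
        return StreamingResponse(stream, media_type="text/event-stream")

    def _create_json_response(self, response: Any) -> JSONResponse:
        return JSONResponse(content=response)
```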

src/codegate/providers/litellmshim/litellmshim.py

Lines changed: 6 additions & 2 deletions
@@ -9,6 +9,7 @@
     acompletion,
 )
 
+from codegate.clients.clients import ClientType
 from codegate.providers.base import BaseCompletionHandler, StreamGenerator
 
 logger = structlog.get_logger("codegate")
@@ -43,7 +44,6 @@ async def execute_completion(
         api_key: Optional[str],
         stream: bool = False,
         is_fim_request: bool = False,
-        base_tool: Optional[str] = "",
     ) -> Union[ModelResponse, AsyncIterator[ModelResponse]]:
         """
         Execute the completion request with LiteLLM's API
@@ -53,7 +53,11 @@ async def execute_completion(
             return await self._fim_completion_func(**request)
         return await self._completion_func(**request)
 
-    def _create_streaming_response(self, stream: AsyncIterator[Any]) -> StreamingResponse:
+    def _create_streaming_response(
+        self,
+        stream: AsyncIterator[Any],
+        _: ClientType = ClientType.GENERIC,
+    ) -> StreamingResponse:
         """
         Create a streaming response from a stream generator. The StreamingResponse
         is the format that FastAPI expects for streaming responses.

src/codegate/providers/llamacpp/completion_handler.py

Lines changed: 6 additions & 2 deletions
@@ -8,6 +8,7 @@
     CreateChatCompletionStreamResponse,
 )
 
+from codegate.clients.clients import ClientType
 from codegate.config import Config
 from codegate.inference.inference_engine import LlamaCppInferenceEngine
 from codegate.providers.base import BaseCompletionHandler
@@ -52,7 +53,6 @@ async def execute_completion(
         api_key: Optional[str],
         stream: bool = False,
         is_fim_request: bool = False,
-        base_tool: Optional[str] = "",
     ) -> Union[ModelResponse, AsyncIterator[ModelResponse]]:
         """
         Execute the completion request with inference engine API
@@ -82,7 +82,11 @@ async def execute_completion(
 
         return convert_to_async_iterator(response) if stream else response
 
-    def _create_streaming_response(self, stream: AsyncIterator[Any]) -> StreamingResponse:
+    def _create_streaming_response(
+        self,
+        stream: AsyncIterator[Any],
+        client_type: ClientType = ClientType.GENERIC,
+    ) -> StreamingResponse:
         """
         Create a streaming response from a stream generator. The StreamingResponse
         is the format that FastAPI expects for streaming responses.

src/codegate/providers/llamacpp/provider.py

Lines changed: 17 additions & 4 deletions
@@ -4,6 +4,8 @@
 import structlog
 from fastapi import HTTPException, Request
 
+from codegate.clients.clients import ClientType
+from codegate.clients.detector import DetectClient
 from codegate.pipeline.factory import PipelineFactory
 from codegate.providers.base import BaseProvider
 from codegate.providers.llamacpp.completion_handler import LlamaCppCompletionHandler
@@ -33,7 +35,13 @@ def models(self, endpoint: str = None, api_key: str = None) -> List[str]:
         # TODO: Implement file fetching
         return []
 
-    async def process_request(self, data: dict, api_key: str, request_url_path: str):
+    async def process_request(
+        self,
+        data: dict,
+        api_key: str,
+        request_url_path: str,
+        client_type: ClientType,
+    ):
         is_fim_request = self._is_fim_request(request_url_path, data)
         try:
             stream = await self.complete(data, None, is_fim_request=is_fim_request)
@@ -51,7 +59,7 @@ async def process_request(self, data: dict, api_key: str, request_url_path: str)
             else:
                 # just continue raising the exception
                 raise e
-        return self._completion_handler.create_response(stream)
+        return self._completion_handler.create_response(stream, client_type)
 
     def _setup_routes(self):
         """
@@ -61,10 +69,15 @@ def _setup_routes(self):
 
         @self.router.post(f"/{self.provider_route_name}/completions")
         @self.router.post(f"/{self.provider_route_name}/chat/completions")
+        @DetectClient()
         async def create_completion(
             request: Request,
         ):
             body = await request.body()
             data = json.loads(body)
-
-            return await self.process_request(data, None, request.url.path)
+            return await self.process_request(
+                data,
+                None,
+                request.url.path,
+                request.state.detected_client,
+            )

src/codegate/providers/ollama/completion_handler.py

Lines changed: 10 additions & 7 deletions
@@ -6,13 +6,15 @@
 from litellm import ChatCompletionRequest
 from ollama import AsyncClient, ChatResponse, GenerateResponse
 
+from codegate.clients.clients import ClientType
 from codegate.providers.base import BaseCompletionHandler
 
 logger = structlog.get_logger("codegate")
 
 
 async def ollama_stream_generator(  # noqa: C901
-    stream: AsyncIterator[ChatResponse], base_tool: str
+    stream: AsyncIterator[ChatResponse],
+    client_type: ClientType,
 ) -> AsyncIterator[str]:
     """OpenAI-style SSE format"""
     try:
@@ -21,7 +23,7 @@ async def ollama_stream_generator(  # noqa: C901
             # TODO We should wire in the client info so we can respond with
             # the correct format and start to handle multiple clients
             # in a more robust way.
-            if base_tool in ["cline", "kodu"]:
+            if client_type in [ClientType.CLINE, ClientType.KODU]:
                 # First get the raw dict from the chunk
                 chunk_dict = chunk.model_dump()
                 # Create response dictionary in OpenAI-like format
@@ -82,18 +84,15 @@ class OllamaShim(BaseCompletionHandler):
 
     def __init__(self, base_url):
         self.client = AsyncClient(host=base_url, timeout=300)
-        self.base_tool = ""
 
     async def execute_completion(
         self,
         request: ChatCompletionRequest,
         api_key: Optional[str],
         stream: bool = False,
        is_fim_request: bool = False,
-        base_tool: Optional[str] = "",
    ) -> Union[ChatResponse, GenerateResponse]:
        """Stream response directly from Ollama API."""
-        self.base_tool = base_tool
        if is_fim_request:
            prompt = ""
            for i in reversed(range(len(request["messages"]))):
@@ -120,13 +119,17 @@ async def execute_completion(
        )  # type: ignore
        return response
 
-    def _create_streaming_response(self, stream: AsyncIterator[ChatResponse]) -> StreamingResponse:
+    def _create_streaming_response(
+        self,
+        stream: AsyncIterator[ChatResponse],
+        client_type: ClientType,
+    ) -> StreamingResponse:
        """
        Create a streaming response from a stream generator. The StreamingResponse
        is the format that FastAPI expects for streaming responses.
        """
        return StreamingResponse(
-            ollama_stream_generator(stream, self.base_tool or ""),
+            ollama_stream_generator(stream, client_type),
            media_type="application/x-ndjson; charset=utf-8",
            headers={
                "Cache-Control": "no-cache",

src/codegate/providers/ollama/provider.py

Lines changed: 17 additions & 3 deletions
@@ -5,6 +5,8 @@
 import structlog
 from fastapi import HTTPException, Request
 
+from codegate.clients.clients import ClientType
+from codegate.clients.detector import DetectClient
 from codegate.config import Config
 from codegate.pipeline.factory import PipelineFactory
 from codegate.providers.base import BaseProvider, ModelFetchError
@@ -55,7 +57,13 @@ def models(self, endpoint: str = None, api_key: str = None) -> List[str]:
 
         return [model["name"] for model in jsonresp.get("models", [])]
 
-    async def process_request(self, data: dict, api_key: str, request_url_path: str):
+    async def process_request(
+        self,
+        data: dict,
+        api_key: str,
+        request_url_path: str,
+        client_type: ClientType,
+    ):
         is_fim_request = self._is_fim_request(request_url_path, data)
         try:
             stream = await self.complete(data, api_key=None, is_fim_request=is_fim_request)
@@ -71,7 +79,7 @@ async def process_request(self, data: dict, api_key: str, request_url_path: str)
             else:
                 # just continue raising the exception
                 raise e
-        return self._completion_handler.create_response(stream)
+        return self._completion_handler.create_response(stream, client_type)
 
     def _setup_routes(self):
         """
@@ -117,6 +125,7 @@ async def show_model(request: Request):
         # Cline API routes
         @self.router.post(f"/{self.provider_route_name}/v1/chat/completions")
         @self.router.post(f"/{self.provider_route_name}/v1/generate")
+        @DetectClient()
         async def create_completion(request: Request):
             body = await request.body()
             data = json.loads(body)
@@ -125,4 +134,9 @@ async def create_completion(request: Request):
             # Force it to be the one that comes in the configuration.
             data["base_url"] = self.base_url
 
-            return await self.process_request(data, None, request.url.path)
+            return await self.process_request(
+                data,
+                None,
+                request.url.path,
+                request.state.detected_client,
+            )

0 commit comments
