This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Commit 4e032d9

Support llamacpp provider in muxing (#889)
Closes: #883

There were a couple of nits that prevented support for llamacpp:
1. The `models` method was not implemented in the provider.
2. There was no way of specifying the model outside `CompletionHandler`.

This PR takes care of both.
1 parent 06a5bc8 commit 4e032d9

File tree

5 files changed: +22 -4 lines changed

src/codegate/config.py

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@
     "vllm": "http://localhost:8000", # Base URL without /v1 path
     "ollama": "http://localhost:11434", # Default Ollama server URL
     "lm_studio": "http://localhost:1234",
+    "llamacpp": "./codegate_volume/models", # Default LlamaCpp model path
 }


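Note that, unlike the other defaults, the llamacpp entry is a local directory rather than an HTTP endpoint: the provider resolves `<base>/<model>.gguf` on disk. A minimal sketch of that resolution (the helper and model names are made up, not the project's code):

```python
# Illustrative only: how a directory-style "endpoint" combines with a model
# name to locate a GGUF file (helper and model names are hypothetical).
from pathlib import Path

def resolve_gguf(base: str, model: str) -> Path:
    """Join the configured model directory with '<model>.gguf'."""
    return Path(base) / f"{model}.gguf"

print(resolve_gguf("./codegate_volume/models", "qwen2.5-coder"))
# ./codegate_volume/models/qwen2.5-coder.gguf
```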
src/codegate/muxing/adapter.py

Lines changed: 2 additions & 0 deletions
@@ -104,6 +104,8 @@ def __init__(self):
             db_models.ProviderType.ollama: self._format_ollama,
             db_models.ProviderType.openai: self._format_openai,
             db_models.ProviderType.anthropic: self._format_antropic,
+            # Our llamacpp provider emits OpenAI chunks
+            db_models.ProviderType.llamacpp: self._format_openai,
         }

     def _format_ollama(self, chunk: str) -> str:
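Because the llamacpp provider already yields OpenAI-shaped streaming chunks, the mux adapter can simply reuse `_format_openai` for it. A minimal sketch of this dispatch pattern (the names here are illustrative, not the project's classes):

```python
# Sketch of per-provider chunk formatting with llamacpp mapped to the
# OpenAI formatter; the formatter bodies are placeholders.
from typing import Callable, Dict

def format_openai(chunk: str) -> str:
    return chunk  # OpenAI-style SSE chunks pass through unchanged

def format_ollama(chunk: str) -> str:
    return chunk  # placeholder: real code would translate Ollama's format

FORMATTERS: Dict[str, Callable[[str], str]] = {
    "openai": format_openai,
    "ollama": format_ollama,
    "llamacpp": format_openai,  # llamacpp emits OpenAI chunks already
}

def format_chunk(provider: str, chunk: str) -> str:
    return FORMATTERS[provider](chunk)
```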

src/codegate/providers/crud/crud.py

Lines changed: 1 addition & 0 deletions
@@ -365,6 +365,7 @@ def __provider_endpoint_from_cfg(


 def provider_default_endpoints(provider_type: str) -> str:
+    # TODO: These providers' default endpoints should come from config.py
     defaults = {
         "openai": "https://api.openai.com",
         "anthropic": "https://api.anthropic.com",

src/codegate/providers/llamacpp/completion_handler.py

Lines changed: 4 additions & 1 deletion
@@ -57,13 +57,16 @@ async def execute_completion(
         """
         Execute the completion request with inference engine API
         """
-        model_path = f"{Config.get_config().model_base_path}/{request['model']}.gguf"
+        model_path = f"{request['base_url']}/{request['model']}.gguf"

         # Create a copy of the request dict and remove stream_options
         # Reason - Request error as JSON:
         # {'error': "Llama.create_completion() got an unexpected keyword argument 'stream_options'"}
         request_dict = dict(request)
         request_dict.pop("stream_options", None)
+        # Remove base_url from the request dict. We use this field as a standard across
+        # all providers to specify the base URL of the model.
+        request_dict.pop("base_url", None)

         if is_fim_request:
             response = await self.inference_engine.complete(
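The handler now resolves the model path from the request's `base_url` field and strips fields the local engine would reject. A standalone sketch of that sanitation step (the function name and sample values are made up):

```python
# Illustrative sketch: build the on-disk model path and drop fields that the
# local llama.cpp completion call does not accept.
from typing import Any, Dict, Tuple

def prepare_llamacpp_request(request: Dict[str, Any]) -> Tuple[str, Dict[str, Any]]:
    # base_url is the local model directory injected by the provider
    model_path = f"{request['base_url']}/{request['model']}.gguf"

    request_dict = dict(request)
    request_dict.pop("stream_options", None)  # rejected by create_completion()
    request_dict.pop("base_url", None)        # internal routing field, not an inference arg
    return model_path, request_dict

model_path, clean = prepare_llamacpp_request(
    {"model": "qwen2.5-coder", "prompt": "hi", "base_url": "./codegate_volume/models"}
)
print(model_path)  # ./codegate_volume/models/qwen2.5-coder.gguf
```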

src/codegate/providers/llamacpp/provider.py

Lines changed: 14 additions & 3 deletions
@@ -1,11 +1,13 @@
 import json
+from pathlib import Path
 from typing import List

 import structlog
 from fastapi import HTTPException, Request

+from codegate.config import Config
 from codegate.pipeline.factory import PipelineFactory
-from codegate.providers.base import BaseProvider
+from codegate.providers.base import BaseProvider, ModelFetchError
 from codegate.providers.llamacpp.completion_handler import LlamaCppCompletionHandler
 from codegate.providers.llamacpp.normalizer import LLamaCppInputNormalizer, LLamaCppOutputNormalizer

@@ -30,8 +32,16 @@ def provider_route_name(self) -> str:
         return "llamacpp"

     def models(self, endpoint: str = None, api_key: str = None) -> List[str]:
-        # TODO: Implement file fetching
-        return []
+        models_path = Path(Config.get_config().model_base_path)
+        if not models_path.is_dir():
+            raise ModelFetchError(f"llamacpp model path does not exist: {models_path}")
+
+        # return all models except the all-minilm-L6-v2-q5_k_m model which we use for embeddings
+        return [
+            model.stem
+            for model in models_path.glob("*.gguf")
+            if model.is_file() and model.stem != "all-minilm-L6-v2-q5_k_m"
+        ]

     async def process_request(self, data: dict, api_key: str, request_url_path: str):
         is_fim_request = self._is_fim_request(request_url_path, data)

@@ -66,5 +76,6 @@ async def create_completion(
     ):
         body = await request.body()
         data = json.loads(body)
+        data["base_url"] = Config.get_config().model_base_path

         return await self.process_request(data, None, request.url.path)
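Together, `create_completion` injecting `base_url` and `models` enumerating the GGUF files are what make the provider muxable: the mux can both list llamacpp models and route requests to them. A quick standalone restatement of the listing rule (directory and file names are hypothetical):

```python
# Standalone version of the listing rule: every *.gguf stem in the model
# directory, except the embedding model used internally.
from pathlib import Path
from typing import List

def list_gguf_models(models_dir: str) -> List[str]:
    models_path = Path(models_dir)
    if not models_path.is_dir():
        raise FileNotFoundError(f"llamacpp model path does not exist: {models_path}")
    return [
        model.stem
        for model in models_path.glob("*.gguf")
        if model.is_file() and model.stem != "all-minilm-L6-v2-q5_k_m"
    ]

# With ./codegate_volume/models containing qwen2.5-coder.gguf and
# all-minilm-L6-v2-q5_k_m.gguf, this would return ["qwen2.5-coder"].
```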
