
Commit 17e949d

rdimitrov authored and Luke Hinds committed
Enable the integration tests for vllm (#806)
* Add integration tests for vllm
  Signed-off-by: Radoslav Dimitrov <[email protected]>
* Try using Qwen/Qwen2.5-Coder-3B-Instruct
  Signed-off-by: Radoslav Dimitrov <[email protected]>
* Go back to using Qwen/Qwen2.5-Coder-0.5B-Instruct
  Signed-off-by: Radoslav Dimitrov <[email protected]>
* Reformat the vllm_fim test
  Signed-off-by: Radoslav Dimitrov <[email protected]>
* Use Qwen/Qwen2.5-Coder-0.5B
  Signed-off-by: Radoslav Dimitrov <[email protected]>
* Revert "Use Qwen/Qwen2.5-Coder-0.5B"
  This reverts commit 32b2d8c.
* Update the expected result for vllm_fim
  Signed-off-by: Radoslav Dimitrov <[email protected]>

---------

Signed-off-by: Radoslav Dimitrov <[email protected]>
1 parent ca82d2f commit 17e949d

File tree

4 files changed: +141 -56 lines changed


.github/workflows/integration-tests.yml

Lines changed: 60 additions & 1 deletion
@@ -62,6 +62,7 @@ jobs:
           -v "$(pwd)"/codegate_volume:/app/codegate_volume \
           -e CODEGATE_APP_LOG_LEVEL=DEBUG \
           -e CODEGATE_OLLAMA_URL=http://localhost:11434 \
+          -e CODEGATE_VLLM_URL=http://localhost:8000 \
           --restart unless-stopped $DOCKER_IMAGE
 
           # Confirm the container started
@@ -181,7 +182,60 @@ jobs:
         run: |
           docker logs ollama
 
-      - name: Print the container logs (useful for debugging)
+      - name: Build and run the vllm container
+        run: |
+          git clone https://github.com/vllm-project/vllm.git
+          cd vllm
+          docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
+          docker run -d --name vllm \
+            --network="host" \
+            vllm-cpu-env --model Qwen/Qwen2.5-Coder-0.5B-Instruct
+
+      - name: Verify the vllm container is running
+        run: |
+          echo -e "\nVerify the vllm container is serving\n"
+          docker ps -f name=vllm
+
+          echo "Loop until the endpoint responds successfully"
+          while ! curl --silent --fail --get "http://localhost:8000/ping" >/dev/null; do
+            echo "Ping not available yet. Retrying in 2 seconds..."
+            sleep 2
+          done
+          echo -e "\nPing is now available!\n"
+
+          echo -e "\nVerify the completions endpoint works\n"
+          curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{
+            "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
+            "prompt": ["How to make pizza"],
+            "max_tokens": 100,
+            "temperature": 0
+          }'
+
+          echo -e "\nVerify the chat/completions endpoint works\n"
+          curl -X POST http://localhost:8000/v1/chat/completions \
+            -H "Content-Type: application/json" \
+            -d '{
+              "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
+              "messages": [
+                {"role": "system", "content": "You are a coding assistant."},
+                {"role": "user", "content": "Hello"}
+              ],
+              "temperature": 0,
+              "max_tokens": 4096,
+              "extra_body": {}
+            }'
+
+          # Print a new line and then the message in a single echo
+          echo -e "\nPrint the vllm container logs\n"
+          docker logs vllm
+
+      - name: Run integration tests - vllm
+        env:
+          CODEGATE_PROVIDERS: "vllm"
+        run: |
+          poetry run python tests/integration/integration_tests.py
+
+      - name: Print the CodeGate container logs (useful for debugging)
         if: always()
         run: |
           docker logs $CONTAINER_NAME
@@ -194,3 +248,8 @@ jobs:
           echo "DB contents:"
           ls -la codegate_volume/db
           docker exec $CONTAINER_NAME ls -la /app/codegate_volume/db
+
+      - name: Print the vllm container logs (useful for debugging)
+        if: always()
+        run: |
+          docker logs vllm
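For debugging these new steps outside CI, the readiness check and the completions smoke test that the workflow performs with curl can be reproduced from Python. This is a minimal sketch, not part of the commit; it assumes vLLM is serving Qwen/Qwen2.5-Coder-0.5B-Instruct on http://localhost:8000 as configured above, and that the requests library is available locally.

```python
import time

import requests  # assumed available in the local environment

VLLM_URL = "http://localhost:8000"  # matches CODEGATE_VLLM_URL in the workflow
MODEL = "Qwen/Qwen2.5-Coder-0.5B-Instruct"


def wait_for_vllm(timeout: int = 300) -> None:
    """Poll the /ping endpoint until vLLM responds, mirroring the curl loop above."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(f"{VLLM_URL}/ping", timeout=2).ok:
                return
        except requests.ConnectionError:
            pass
        time.sleep(2)
    raise RuntimeError("vLLM did not become ready in time")


def smoke_test_completions() -> str:
    """Send the same completions request the workflow issues with curl."""
    resp = requests.post(
        f"{VLLM_URL}/v1/completions",
        json={"model": MODEL, "prompt": ["How to make pizza"], "max_tokens": 100, "temperature": 0},
        timeout=60,
    )
    resp.raise_for_status()
    # OpenAI-style completions response: first choice carries the generated text.
    return resp.json()["choices"][0]["text"]


if __name__ == "__main__":
    wait_for_vllm()
    print(smoke_test_completions())
```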

src/codegate/providers/vllm/adapter.py

Lines changed: 1 addition & 6 deletions
@@ -102,6 +102,7 @@ def _has_chat_ml_format(data: Dict) -> bool:
         content = input_chat_request["messages"][0]["content"]
         if isinstance(content, str) and "<|im_start|>" in content:
             return True
+        return False
 
     def normalize(self, data: Dict) -> ChatCompletionRequest:
         """
@@ -117,12 +118,6 @@ def normalize(self, data: Dict) -> ChatCompletionRequest:
         if not model_name.startswith("hosted_vllm/"):
             normalized_data["model"] = f"hosted_vllm/{model_name}"
 
-        # Ensure the base_url ends with /v1 if provided
-        if "base_url" in normalized_data:
-            base_url = normalized_data["base_url"].rstrip("/")
-            if not base_url.endswith("/v1"):
-                normalized_data["base_url"] = f"{base_url}/v1"
-
         ret_data = normalized_data
         if self._has_chat_ml_format(normalized_data):
             ret_data = self._chat_ml_normalizer.normalize(normalized_data)
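The adapter change is small: `_has_chat_ml_format` now returns False explicitly instead of falling through with None, and the base_url rewriting is dropped here because the provider (next diff) normalizes the URL itself. Below is a standalone sketch of the same detection logic; the sample payloads are hypothetical and used purely for illustration.

```python
from typing import Dict


def has_chat_ml_format(data: Dict) -> bool:
    """Return True when the first message embeds ChatML markers, else False."""
    messages = data.get("messages", [])
    if not messages:
        return False
    content = messages[0].get("content")
    if isinstance(content, str) and "<|im_start|>" in content:
        return True
    return False


# Hypothetical payloads for illustration only.
chatml_request = {"messages": [{"role": "user", "content": "<|im_start|>user\nHello<|im_end|>"}]}
plain_request = {"messages": [{"role": "user", "content": "Hello"}]}

assert has_chat_ml_format(chatml_request) is True
assert has_chat_ml_format(plain_request) is False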
Lines changed: 55 additions & 32 deletions
@@ -1,4 +1,5 @@
 import json
+from urllib.parse import urljoin
 
 import httpx
 import structlog
@@ -31,6 +32,19 @@ def __init__(
     def provider_route_name(self) -> str:
         return "vllm"
 
+    def _get_base_url(self) -> str:
+        """
+        Get the base URL from config with proper formatting
+        """
+        config = Config.get_config()
+        base_url = config.provider_urls.get("vllm") if config else ""
+        if base_url:
+            base_url = base_url.rstrip("/")
+            # Add /v1 if not present
+            if not base_url.endswith("/v1"):
+                base_url = f"{base_url}/v1"
+        return base_url
+
     def models(self):
         resp = httpx.get(f"{self.base_url}/v1/models")
         jsonresp = resp.json()
@@ -40,60 +54,69 @@ def models(self):
     def _setup_routes(self):
         """
         Sets up the /chat/completions route for the provider as expected by the
-        OpenAI API. Extracts the API key from the "Authorization" header and
-        passes it to the completion handler.
+        OpenAI API. Makes the API key optional in the "Authorization" header.
         """
 
         @self.router.get(f"/{self.provider_route_name}/models")
-        async def get_models(authorization: str = Header(..., description="Bearer token")):
-            if not authorization.startswith("Bearer "):
-                raise HTTPException(status_code=401, detail="Invalid authorization header")
-
-            token = authorization.split(" ")[1]
-            config = Config.get_config()
-            if config:
-                base_url = config.provider_urls.get("vllm")
-            else:
-                base_url = ""
-
-            async with httpx.AsyncClient() as client:
-                response = await client.get(
-                    f"{base_url}/v1/models", headers={"Authorization": f"Bearer {token}"}
+        async def get_models(
+            authorization: str | None = Header(None, description="Optional Bearer token")
+        ):
+            base_url = self._get_base_url()
+            headers = {}
+
+            if authorization:
+                if not authorization.startswith("Bearer "):
+                    raise HTTPException(
+                        status_code=401, detail="Invalid authorization header format"
+                    )
+                token = authorization.split(" ")[1]
+                headers["Authorization"] = f"Bearer {token}"
+
+            try:
+                models_url = urljoin(base_url, "v1/models")
+                async with httpx.AsyncClient() as client:
+                    response = await client.get(models_url, headers=headers)
+                    response.raise_for_status()
+                    return response.json()
+            except httpx.HTTPError as e:
+                logger = structlog.get_logger("codegate")
+                logger.error("Error fetching vLLM models", error=str(e))
+                raise HTTPException(
+                    status_code=e.response.status_code if hasattr(e, "response") else 500,
+                    detail=str(e),
                 )
-            response.raise_for_status()
-            return response.json()
 
         @self.router.post(f"/{self.provider_route_name}/chat/completions")
         @self.router.post(f"/{self.provider_route_name}/completions")
         async def create_completion(
             request: Request,
-            authorization: str = Header(..., description="Bearer token"),
+            authorization: str | None = Header(None, description="Optional Bearer token"),
         ):
-            if not authorization.startswith("Bearer "):
-                raise HTTPException(status_code=401, detail="Invalid authorization header")
+            api_key = None
+            if authorization:
+                if not authorization.startswith("Bearer "):
+                    raise HTTPException(
+                        status_code=401, detail="Invalid authorization header format"
+                    )
+                api_key = authorization.split(" ")[1]
 
-            api_key = authorization.split(" ")[1]
             body = await request.body()
             data = json.loads(body)
 
             # Add the vLLM base URL to the request
-            config = Config.get_config()
-            if config:
-                data["base_url"] = config.provider_urls.get("vllm")
-            else:
-                data["base_url"] = ""
+            base_url = self._get_base_url()
+            data["base_url"] = base_url
 
             is_fim_request = self._is_fim_request(request, data)
             try:
+                # Pass the potentially None api_key to complete
                 stream = await self.complete(data, api_key, is_fim_request=is_fim_request)
             except Exception as e:
-                #  check if we have an status code there
+                # Check if we have a status code there
                 if hasattr(e, "status_code"):
                     logger = structlog.get_logger("codegate")
                     logger.error("Error in VLLMProvider completion", error=str(e))
+                    raise HTTPException(status_code=e.status_code, detail=str(e))
+                raise e
 
-                    raise HTTPException(status_code=e.status_code, detail=str(e))  # type: ignore
-                else:
-                    # just continue raising the exception
-                    raise e
             return self._completion_handler.create_response(stream)
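The new `_get_base_url` helper centralizes the URL handling that both routes need: read the configured vLLM URL, strip any trailing slash, and append `/v1` if it is missing. Below is a standalone sketch of that normalization; the function name is illustrative (not from the commit) and the example URL matches the workflow's http://localhost:8000.

```python
from urllib.parse import urljoin


def normalize_vllm_base_url(base_url: str) -> str:
    """Mirror the provider's _get_base_url: trim a trailing slash, ensure a /v1 suffix."""
    if not base_url:
        return ""
    base_url = base_url.rstrip("/")
    if not base_url.endswith("/v1"):
        base_url = f"{base_url}/v1"
    return base_url


assert normalize_vllm_base_url("http://localhost:8000") == "http://localhost:8000/v1"
assert normalize_vllm_base_url("http://localhost:8000/") == "http://localhost:8000/v1"
assert normalize_vllm_base_url("http://localhost:8000/v1") == "http://localhost:8000/v1"

# urljoin against a base that already ends in "/v1" (no trailing slash) replaces the
# last path segment, so the models route still resolves to a single /v1 prefix:
assert urljoin("http://localhost:8000/v1", "v1/models") == "http://localhost:8000/v1/models"
```

This is why `get_models` can build its URL with urljoin while `create_completion` simply places the same normalized value into data["base_url"].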

tests/integration/testcases.yaml

Lines changed: 25 additions & 17 deletions
@@ -1,6 +1,6 @@
 headers:
   vllm:
-    Authorization: Bearer ENV_VLLM_KEY
+    Content-Type: application/json
   openai:
     Authorization: Bearer ENV_OPENAI_KEY
   ollama:
@@ -161,40 +161,48 @@ testcases:
            "role":"user"
          }
        ],
-        "model":"Qwen/Qwen2.5-Coder-14B-Instruct",
+        "model":"Qwen/Qwen2.5-Coder-0.5B-Instruct",
        "stream":true,
        "temperature":0
      }
    likes: |
-      Hello! How can I assist you today? If you have any questions about software security, package analysis, or need guidance on secure coding practices, feel free to ask.
+      Hello! How can I assist you today?
 
  vllm_fim:
    name: VLLM FIM
    provider: vllm
    url: http://127.0.0.1:8989/vllm/completions
    data: |
      {
-        "model": "Qwen/Qwen2.5-Coder-14B",
+        "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
        "max_tokens": 4096,
        "temperature": 0,
        "stream": true,
-        "stop": ["<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|fim_pad|>", "<|repo_name|>", "<|file_sep|>", "<|im_start|>", "<|im_end|>", "/src/", "#- coding: utf-8", "```"],
+        "stop": [
+          "<|endoftext|>",
+          "<|fim_prefix|>",
+          "<|fim_middle|>",
+          "<|fim_suffix|>",
+          "<|fim_pad|>",
+          "<|repo_name|>",
+          "<|file_sep|>",
+          "<|im_start|>",
+          "<|im_end|>",
+          "/src/",
+          "#- coding: utf-8",
+          "```"
+        ],
        "prompt":"<|fim_prefix|>\n# codegate/test.py\nimport invokehttp\nimport requests\n\nkey = \"mysecret-key\"\n\ndef call_api():\n <|fim_suffix|>\n\n\ndata = {'key1': 'test1', 'key2': 'test2'}\nresponse = call_api('http://localhost:8080', method='post', data='data')\n<|fim_middle|>"
      }
    likes: |
-      # Create an instance of the InvokeHTTP class
-      invoke = invokehttp.InvokeHTTP(key)
+      return response.json()
 
-      # Call the API using the invoke_http method
-      response = invoke.invoke_http(url, method='get', data=data)
+      def test_call_api():
+          response = call_api('http://localhost:8080', method='post', data='data')
+          assert response['key1'] == 'test1' and response['key2'] == 'test2', "Test failed"
 
-      # Check the response status code
-      if response.status_code == 200:
-          # The API call was successful
-          print(response.json())
-      else:
-          # The API call failed
-          print('Error:', response.status_code)
+      if __name__ == '__main__':
+          test_call_api()
 
  anthropic_chat:
    name: Anthropic Chat
@@ -333,4 +341,4 @@ testcases:
 
      print(response.status_code)
      print(response.json())
-      ```
+      ```
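With the header change above, the vllm testcases now send unauthenticated requests carrying only a Content-Type header, and they target the smaller Qwen/Qwen2.5-Coder-0.5B-Instruct model. Below is a minimal sketch of replaying the vllm_fim testcase by hand against a locally running CodeGate, assumed to listen on 127.0.0.1:8989 as in the testcase URL; the stop list and prompt are abbreviated here, and the harness itself compares the streamed completion against the `likes` block (the expected result this commit updates).

```python
import requests  # assumed available locally

URL = "http://127.0.0.1:8989/vllm/completions"   # from the vllm_fim testcase
HEADERS = {"Content-Type": "application/json"}   # the only vllm header after this change

payload = {
    "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
    "max_tokens": 4096,
    "temperature": 0,
    "stream": True,
    # Abbreviated stop list; the testcase carries the full set of FIM/ChatML markers.
    "stop": ["<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>"],
    # Abbreviated prompt; the testcase embeds a longer FIM snippet.
    "prompt": "<|fim_prefix|>\n# codegate/test.py\ndef call_api():\n    <|fim_suffix|>\n<|fim_middle|>",
}

with requests.post(URL, headers=HEADERS, json=payload, stream=True, timeout=120) as resp:
    resp.raise_for_status()
    # Print the streamed chunks; the integration harness instead checks the
    # completion against the expected text in the "likes" block.
    for line in resp.iter_lines():
        if line:
            print(line.decode())
```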
