15 changes: 1 addition & 14 deletions helm/Chart.yaml
@@ -15,20 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0

keywords:
  - vllm
  - lora
  - adapter
home: https://github.com/vllm-project/production-stack
sources:
  - https://github.com/vllm-project/production-stack
version: 0.1.1

maintainers:
  - name: apostac

# Specifies that CRDs should be created/updated first
annotations:
  "helm.sh/hook": pre-install,pre-upgrade
  "helm.sh/hook-weight": "-5"
2 changes: 1 addition & 1 deletion helm/templates/deployment-lora-controller.yaml
@@ -1,4 +1,4 @@
{{- if .Values.loraController.enabled -}}
{{- if and .Values.loraController .Values.loraController.enabled -}}
apiVersion: apps/v1
kind: Deployment
metadata:
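A note on the guard change in this and the templates below: Helm aborts rendering when a template dereferences a field on a values key that does not exist, which matters here because the loraController defaults are removed from values.yaml later in this PR. A minimal sketch of the failure mode (values content illustrative):

# values.yaml that omits the loraController block entirely
servingEngineSpec:
  modelSpec: []
# Old guard: {{- if .Values.loraController.enabled -}}
#   -> render error ("nil pointer evaluating interface {}.enabled")
# New guard: {{- if and .Values.loraController .Values.loraController.enabled -}}
#   -> `and` short-circuits on the missing map, so the manifest is simply skipped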
39 changes: 22 additions & 17 deletions helm/templates/deployment-vllm-multi.yaml
@@ -54,6 +54,7 @@ spec:
containers:
- name: "vllm"
image: "{{ required "Required value 'modelSpec.repository' must be defined !" $modelSpec.repository }}:{{ required "Required value 'modelSpec.tag' must be defined !" $modelSpec.tag }}"

command:
- "vllm"
- "serve"
@@ -65,6 +66,13 @@ spec:
{{- if $modelSpec.enableLoRA }}
- "--enable-lora"
{{- end }}
{{- if $modelSpec.enableTool }}
- "--enable-auto-tool-choice"
{{- end }}
{{- if $modelSpec.toolCallParser }}
- "--tool-call-parser"
- {{ $modelSpec.toolCallParser | quote }}
{{- end }}
{{- with $modelSpec.vllmConfig }}
{{- if hasKey . "enableChunkedPrefill" }}
- "--enable-chunked-prefill"
@@ -99,7 +107,7 @@ spec:
{{- end }}
{{- if $modelSpec.chatTemplate }}
- "--chat-template"
- "/chat_templates/chat-template.jinga"
- {{ $modelSpec.chatTemplate | quote }}
{{- end }}
{{- if .Values.servingEngineSpec.containerSecurityContext }}
securityContext:
@@ -194,23 +202,19 @@ spec:
- name: {{ .Release.Name }}-storage
mountPath: /data
{{- end }}
{{- if .Values.loraController.enabled }}
- name: shared-model-storage
mountPath: /models
{{- end }}
{{- with $modelSpec.vllmConfig }}
{{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}}
- name: shm
mountPath: /dev/shm
{{- end}}
{{- end}}
{{- if $modelSpec.chatTemplate }}
- name: {{ .Release.Name }}-chat-templates
mountPath: /chat_templates
{{- end}}
- name: vllm-templates
mountPath: /templates
{{- end }}
{{- if hasKey $modelSpec "extraVolumeMounts" }}
{{- toYaml $modelSpec.extraVolumeMounts | nindent 10 }}
{{- end}}
{{- end }}
{{- if $modelSpec.imagePullSecret }}
imagePullSecrets:
- name: {{ $modelSpec.imagePullSecret }}
@@ -223,11 +227,6 @@ spec:
persistentVolumeClaim:
claimName: "{{ .Release.Name }}-{{$modelSpec.name}}-storage-claim"
{{- end }}
{{- if .Values.loraController.enabled }}
- name: shared-model-storage
persistentVolumeClaim:
claimName: {{ .Release.Name }}-shared-storage-claim
{{- end }}
{{- with $modelSpec.vllmConfig }}
{{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}}
- name: shm
@@ -237,9 +236,15 @@ spec:
{{- end}}
{{- end}}
{{- if $modelSpec.chatTemplate}}
{{- if hasKey $modelSpec "chatTemplateConfigMap" }}
- name: {{ .Release.Name }}-chat-templates
configMap:
name: "{{ .Release.Name }}-{{$modelSpec.name}}-chat-templates"
{{- else }}
- name: vllm-templates
persistentVolumeClaim:
claimName: vllm-templates-pvc
{{- end }}
{{- end}}
{{- if hasKey $modelSpec "extraVolumes" }}
{{- toYaml $modelSpec.extraVolumes | nindent 8 }}
@@ -266,16 +271,16 @@ spec:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- end }}
{{- if $modelSpec.chatTemplate }}
{{- if and $modelSpec.chatTemplate (hasKey $modelSpec "chatTemplateConfigMap") }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: "{{ .Release.Name }}-{{$modelSpec.name}}-chat-templates"
namespace: "{{ .Release.Namespace }}"
data:
chat-template.jinga: |-
{{ $modelSpec.chatTemplate}}
{{ $modelSpec.chatTemplate }}: |-
{{ $modelSpec.chatTemplateConfigMap }}
{{- end }}
{{- end }}
---
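Taken together, these hunks leave two ways to supply a chat template: inline through a per-model ConfigMap (rendered only when `chatTemplateConfigMap` holds the template body, keyed by the `chatTemplate` value), or from the shared `vllm-templates-pvc` populated by `scripts/setup_vllm_templates.sh` and mounted at /templates. A hedged values sketch of both modes (names and template body illustrative):

servingEngineSpec:
  modelSpec:
  # Mode 1: template body shipped inline via ConfigMap
  - name: "llama3-inline"
    chatTemplate: "my-template.jinja"  # becomes the ConfigMap data key and the --chat-template argument;
                                       # must be a bare file name here (ConfigMap keys cannot contain '/')
    chatTemplateConfigMap: |
      {% for message in messages %}{{ message.content }}{% endfor %}
  # Mode 2: template pre-loaded into the shared PVC by the setup script
  - name: "llama3-pvc"
    chatTemplate: "/templates/tool_chat_template_llama3.1_json.jinja"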
2 changes: 1 addition & 1 deletion helm/templates/loraadapter-crd.yaml
@@ -1,4 +1,4 @@
{{- if .Values.loraController.enabled -}}
{{- if and .Values.loraController .Values.loraController.enabled -}}
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
2 changes: 1 addition & 1 deletion helm/templates/rbac-lora-controller.yaml
@@ -1,4 +1,4 @@
{{- if .Values.loraController.enabled -}}
{{- if and .Values.loraController .Values.loraController.enabled -}}
---
apiVersion: v1
kind: ServiceAccount
2 changes: 1 addition & 1 deletion helm/templates/shared-storage.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{{- if or .Values.sharedStorage.enabled .Values.loraController.enabled }}
{{- if or (and .Values.loraController .Values.loraController.enabled) (and .Values.sharedStorage .Values.sharedStorage.enabled) }}
---
apiVersion: v1
kind: PersistentVolume
38 changes: 0 additions & 38 deletions helm/values.yaml
@@ -329,41 +329,3 @@ routerSpec:
# httpGet:
# # -- Path to access on the HTTP server
#

loraController:
  enabled: false
  replicaCount: 1
  image:
    repository: lmcache/lora-controller
    tag: latest
    pullPolicy: Always
  resources:
    limits:
      cpu: 500m
      memory: 512Mi
    requests:
      cpu: 100m
      memory: 128Mi
  podAnnotations: {}
  podSecurityContext: {}
  securityContext: {}
  nodeSelector: {}
  tolerations: []
  affinity: {}
  extraEnv: []
  extraVolumes: []
  extraVolumeMounts: []
  imagePullSecrets: []

# Shared storage configuration for LoRA adapters
sharedStorage:
  enabled: false
  storageClass: "standard"
  size: "100Gi"
  accessModes:
    - ReadWriteMany
  # Use either hostPath or nfs configuration
  hostPath: ""
  nfs:
    server: ""
    path: ""
113 changes: 113 additions & 0 deletions scripts/setup_vllm_templates.sh
@@ -0,0 +1,113 @@
#!/bin/bash

# Exit on error
set -e

# Configuration
PV_NAME="vllm-templates-pv" # Fixed name for template storage
PVC_NAME="vllm-templates-pvc" # Fixed name for template storage
STORAGE_SIZE="1Gi"
STORAGE_CLASS="standard"
HOST_TEMPLATES_DIR="/mnt/templates" # This is where the PV will be mounted on the host
TEMP_POD_NAME="vllm-templates-setup"

echo "Setting up vLLM templates..."
echo "Using PV: $PV_NAME"
echo "Using PVC: $PVC_NAME"

# Check if host directory exists and create if needed
echo "Checking host directory at $HOST_TEMPLATES_DIR..."
if [ ! -d "$HOST_TEMPLATES_DIR" ]; then
  echo "Creating host directory at $HOST_TEMPLATES_DIR..."
  sudo mkdir -p "$HOST_TEMPLATES_DIR"
  sudo chmod 777 "$HOST_TEMPLATES_DIR" # Allow read/write access
fi

# Verify directory permissions
if [ ! -w "$HOST_TEMPLATES_DIR" ]; then
  echo "Error: No write permission to $HOST_TEMPLATES_DIR"
  echo "Please ensure you have sudo access or the directory has proper permissions"
  exit 1
fi

# Create PersistentVolume with ReadWriteMany
echo "Creating PersistentVolume..."
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolume
metadata:
  name: $PV_NAME
spec:
  capacity:
    storage: $STORAGE_SIZE
  accessModes:
    - ReadWriteMany
  hostPath:
    path: $HOST_TEMPLATES_DIR  # Must match the host directory prepared above
    type: DirectoryOrCreate
  storageClassName: $STORAGE_CLASS
EOF

# Create PersistentVolumeClaim with ReadWriteMany
echo "Creating PersistentVolumeClaim..."
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: $PVC_NAME
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: $STORAGE_SIZE
  storageClassName: $STORAGE_CLASS
EOF

# Create temporary pod to clone and extract templates
echo "Creating temporary pod to download templates..."
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: $TEMP_POD_NAME
spec:
  containers:
  - name: template-setup
    image: ubuntu:latest
    command: ["/bin/bash", "-c"]
    args:
    - |
      apt-get update && apt-get install -y git
      git clone https://github.com/vllm-project/vllm.git /tmp/vllm
      cp /tmp/vllm/examples/*.jinja /templates/
      rm -rf /tmp/vllm
    volumeMounts:
    - name: templates-volume
      mountPath: /templates
  volumes:
  - name: templates-volume
    persistentVolumeClaim:
      claimName: $PVC_NAME
  restartPolicy: Never
EOF

# Wait for pod to complete
echo "Waiting for template download to complete..."
kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/$TEMP_POD_NAME --timeout=300s
# Delete the temporary pod
echo "Cleaning up temporary pod..."
kubectl delete pod $TEMP_POD_NAME

# Verify the setup
echo "Verifying setup..."
kubectl get pv $PV_NAME
kubectl get pvc $PVC_NAME

# List downloaded templates
echo "Downloaded templates:"
ls -l "$HOST_TEMPLATES_DIR"

echo "Setup complete! The templates are now available in the PersistentVolume."
echo "Host path: $HOST_TEMPLATES_DIR"
echo "You can now deploy the vLLM stack with tool calling support."
63 changes: 63 additions & 0 deletions src/examples/tool_calling_example.py
@@ -0,0 +1,63 @@
import json

import openai


def get_weather(location: str, unit: str):
    """Mock weather function for demonstration."""
    return f"Getting the weather for {location} in {unit}..."


def main():
    # Configure OpenAI (uses the legacy openai<1.0 module-level client)
    openai.api_base = "http://localhost:8000/v1"
    openai.api_key = "dummy"  # Not needed for local vLLM server

    # Define the tools that the model can use
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City and state, e.g., 'San Francisco, CA'",
                        },
                        "unit": {
                            "type": "string",
                            "enum": ["celsius", "fahrenheit"],
                            "description": "The unit of temperature to use",
                        },
                    },
                    "required": ["location", "unit"],
                },
            },
        }
    ]

    # Make a request to the model
    response = openai.ChatCompletion.create(
        model="meta-llama/Llama-3.1-8B-Instruct",  # Use the model we deployed
        messages=[
            {"role": "user", "content": "What's the weather like in San Francisco?"}
        ],
        tools=tools,
        tool_choice="auto",
    )

    # Extract and process the tool call
    tool_call = response.choices[0].message.tool_calls[0].function
    print(f"Function called: {tool_call.name}")
    print(f"Arguments: {tool_call.arguments}")

    # Execute the tool with the provided arguments
    result = get_weather(**json.loads(tool_call.arguments))
    print(f"Result: {result}")


if __name__ == "__main__":
    main()
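Assuming the serving engine is reachable at localhost:8000 (for example via kubectl port-forward) and a tool-calling-enabled model is deployed, the example runs directly with python. Note that it targets the legacy pre-1.0 `openai` client interface (`openai.api_base` / `openai.ChatCompletion`), so it requires an `openai<1.0` install.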