diff --git a/helm/Chart.yaml b/helm/Chart.yaml
index fb905ebc5..9616a22c2 100644
--- a/helm/Chart.yaml
+++ b/helm/Chart.yaml
@@ -15,20 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.1.0
-
-keywords:
-  - vllm
-  - lora
-  - adapter
-home: https://github.com/vllm-project/production-stack
-sources:
-  - https://github.com/vllm-project/production-stack
+version: 0.1.1

 maintainers:
   - name: apostac
-
-# Specifies that CRDs should be created/updated first
-annotations:
-  "helm.sh/hook": pre-install,pre-upgrade
-  "helm.sh/hook-weight": "-5"
diff --git a/helm/templates/deployment-lora-controller.yaml b/helm/templates/deployment-lora-controller.yaml
index c89d3143d..c01b4b742 100644
--- a/helm/templates/deployment-lora-controller.yaml
+++ b/helm/templates/deployment-lora-controller.yaml
@@ -1,4 +1,4 @@
-{{- if .Values.loraController.enabled -}}
+{{- if and .Values.loraController .Values.loraController.enabled -}}
 apiVersion: apps/v1
 kind: Deployment
 metadata:
diff --git a/helm/templates/deployment-vllm-multi.yaml b/helm/templates/deployment-vllm-multi.yaml
index 7ca02007b..1aa831c9b 100644
--- a/helm/templates/deployment-vllm-multi.yaml
+++ b/helm/templates/deployment-vllm-multi.yaml
@@ -54,6 +54,7 @@ spec:
       containers:
       - name: "vllm"
         image: "{{ required "Required value 'modelSpec.repository' must be defined !" $modelSpec.repository }}:{{ required "Required value 'modelSpec.tag' must be defined !" $modelSpec.tag }}"
+
         command:
         - "vllm"
         - "serve"
@@ -65,6 +66,13 @@ spec:
         {{- if $modelSpec.enableLoRA }}
         - "--enable-lora"
         {{- end }}
+        {{- if $modelSpec.enableTool }}
+        - "--enable-auto-tool-choice"
+        {{- end }}
+        {{- if $modelSpec.toolCallParser }}
+        - "--tool-call-parser"
+        - {{ $modelSpec.toolCallParser | quote }}
+        {{- end }}
         {{- with $modelSpec.vllmConfig }}
         {{- if hasKey . "enableChunkedPrefill" }}
         - "--enable-chunked-prefill"
@@ -99,7 +107,7 @@ spec:
         {{- end }}
         {{- if $modelSpec.chatTemplate }}
         - "--chat-template"
-        - "/chat_templates/chat-template.jinga"
+        - {{ $modelSpec.chatTemplate | quote }}
         {{- end }}
         {{- if .Values.servingEngineSpec.containerSecurityContext }}
         securityContext:
@@ -194,10 +202,6 @@ spec:
          - name: {{ .Release.Name }}-storage
            mountPath: /data
          {{- end }}
-         {{- if .Values.loraController.enabled }}
-         - name: shared-model-storage
-           mountPath: /models
-         {{- end }}
          {{- with $modelSpec.vllmConfig }}
          {{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}}
          - name: shm
@@ -205,12 +209,12 @@
          {{- end}}
          {{- end}}
          {{- if $modelSpec.chatTemplate }}
-         - name: {{ .Release.Name }}-chat-templates
-           mountPath: /chat_templates
-         {{- end}}
+         - name: vllm-templates
+           mountPath: /templates
+         {{- end }}
          {{- if hasKey $modelSpec "extraVolumeMounts" }}
          {{- toYaml $modelSpec.extraVolumeMounts | nindent 10 }}
-         {{- end}}
+         {{- end }}
      {{- if $modelSpec.imagePullSecret }}
      imagePullSecrets:
      - name: {{ $modelSpec.imagePullSecret }}
@@ -223,11 +227,6 @@ spec:
          persistentVolumeClaim:
            claimName: "{{ .Release.Name }}-{{$modelSpec.name}}-storage-claim"
        {{- end }}
-       {{- if .Values.loraController.enabled }}
-       - name: shared-model-storage
-         persistentVolumeClaim:
-           claimName: {{ .Release.Name }}-shared-storage-claim
-       {{- end }}
        {{- with $modelSpec.vllmConfig }}
        {{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}}
        - name: shm
@@ -237,9 +236,15 @@
        {{- end}}
        {{- end}}
        {{- if $modelSpec.chatTemplate}}
+       {{- if hasKey $modelSpec "chatTemplateConfigMap" }}
        - name: {{ .Release.Name }}-chat-templates
          configMap:
            name: "{{ .Release.Name }}-{{$modelSpec.name}}-chat-templates"
+       {{- else }}
+       - name: vllm-templates
+         persistentVolumeClaim:
+           claimName: vllm-templates-pvc
+       {{- end }}
        {{- end}}
        {{- if hasKey $modelSpec "extraVolumes" }}
        {{- toYaml $modelSpec.extraVolumes | nindent 8 }}
@@ -266,7 +271,7 @@ spec:
        {{- toYaml . | nindent 12 }}
        {{- end }}
        {{- end }}
-{{- if $modelSpec.chatTemplate }}
+{{- if and $modelSpec.chatTemplate (hasKey $modelSpec "chatTemplateConfigMap") }}
 ---
 apiVersion: v1
 kind: ConfigMap
@@ -274,8 +279,8 @@ metadata:
   name: "{{ .Release.Name }}-{{$modelSpec.name}}-chat-templates"
   namespace: "{{ .Release.Namespace }}"
 data:
-  chat-template.jinga: |-
-    {{ $modelSpec.chatTemplate}}
+  {{ $modelSpec.chatTemplate }}: |-
+    {{ $modelSpec.chatTemplateConfigMap }}
 {{- end }}
 {{- end }}
 ---
diff --git a/helm/templates/loraadapter-crd.yaml b/helm/templates/loraadapter-crd.yaml
index c08d10dd6..172d985a5 100644
--- a/helm/templates/loraadapter-crd.yaml
+++ b/helm/templates/loraadapter-crd.yaml
@@ -1,4 +1,4 @@
-{{- if .Values.loraController.enabled -}}
+{{- if and .Values.loraController .Values.loraController.enabled -}}
 ---
 apiVersion: apiextensions.k8s.io/v1
 kind: CustomResourceDefinition
diff --git a/helm/templates/rbac-lora-controller.yaml b/helm/templates/rbac-lora-controller.yaml
index 77c2a6e2a..0f4bf27d4 100644
--- a/helm/templates/rbac-lora-controller.yaml
+++ b/helm/templates/rbac-lora-controller.yaml
@@ -1,4 +1,4 @@
-{{- if .Values.loraController.enabled -}}
+{{- if and .Values.loraController .Values.loraController.enabled -}}
 ---
 apiVersion: v1
 kind: ServiceAccount
diff --git a/helm/templates/shared-storage.yaml b/helm/templates/shared-storage.yaml
index 688cfc637..0e769f110 100644
--- a/helm/templates/shared-storage.yaml
+++ b/helm/templates/shared-storage.yaml
@@ -1,4 +1,4 @@
-{{- if or .Values.sharedStorage.enabled .Values.loraController.enabled }}
+{{- if or (and .Values.loraController .Values.loraController.enabled) (and .Values.sharedStorage .Values.sharedStorage.enabled) }}
 ---
 apiVersion: v1
 kind: PersistentVolume
diff --git a/helm/values.yaml b/helm/values.yaml
index ab727ca6e..a8b23b979 100644
--- a/helm/values.yaml
+++ b/helm/values.yaml
@@ -329,41 +329,3 @@ routerSpec:
 #   httpGet:
 #     # -- Path to access on the HTTP server
 #
-
-loraController:
-  enabled: false
-  replicaCount: 1
-  image:
-    repository: lmcache/lora-controller
-    tag: latest
-    pullPolicy: Always
-  resources:
-    limits:
-      cpu: 500m
-      memory: 512Mi
-    requests:
-      cpu: 100m
-      memory: 128Mi
-  podAnnotations: {}
-  podSecurityContext: {}
-  securityContext: {}
-  nodeSelector: {}
-  tolerations: []
-  affinity: {}
-  extraEnv: []
-  extraVolumes: []
-  extraVolumeMounts: []
-  imagePullSecrets: []
-
-# Shared storage configuration for LoRA adapters
-sharedStorage:
-  enabled: false
-  storageClass: "standard"
-  size: "100Gi"
-  accessModes:
-    - ReadWriteMany
-  # Use either hostPath or nfs configuration
-  hostPath: ""
-  nfs:
-    server: ""
-    path: ""
diff --git a/scripts/setup_vllm_templates.sh b/scripts/setup_vllm_templates.sh
new file mode 100755
index 000000000..4c72d4146
--- /dev/null
+++ b/scripts/setup_vllm_templates.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+
+# Exit on error
+set -e
+
+# Configuration
+PV_NAME="vllm-templates-pv"          # Fixed name for template storage
+PVC_NAME="vllm-templates-pvc"        # Fixed name for template storage
+STORAGE_SIZE="1Gi"
+STORAGE_CLASS="standard"
+HOST_TEMPLATES_DIR="/mnt/templates"  # This is where the PV will be mounted on the host
+TEMP_POD_NAME="vllm-templates-setup"
+
+echo "Setting up vLLM templates..."
+echo "Using PV: $PV_NAME"
+echo "Using PVC: $PVC_NAME"
+
+# Check if host directory exists and create if needed
+echo "Checking host directory at $HOST_TEMPLATES_DIR..."
+if [ ! -d "$HOST_TEMPLATES_DIR" ]; then
+    echo "Creating host directory at $HOST_TEMPLATES_DIR..."
+    sudo mkdir -p "$HOST_TEMPLATES_DIR"
+    sudo chmod 777 "$HOST_TEMPLATES_DIR"  # Allow read/write access
+fi
+
+# Verify directory permissions
+if [ ! -w "$HOST_TEMPLATES_DIR" ]; then
+    echo "Error: No write permission to $HOST_TEMPLATES_DIR"
+    echo "Please ensure you have sudo access or the directory has proper permissions"
+    exit 1
+fi
+
+# Create PersistentVolume with ReadWriteMany
+echo "Creating PersistentVolume..."
+cat <

> **Note**: The tool calling configuration is now simplified:
>
> - `enableTool: true` enables the feature
> - `toolCallParser` specifies how the model's tool calls are parsed (`llama3_json` for Llama-3 models)
> - `chatTemplate` specifies the full path to the template file (templates are mounted at `/templates`)
>
> The chat templates are managed through the PersistentVolume created in step 1, which provides several benefits:
>
> - Templates are downloaded once and stored persistently
> - Templates can be shared across multiple deployments
> - Templates can be updated by editing the files in the PersistentVolume
> - Templates are version-controlled in the vLLM repository

#### 3.2: Deploy the Helm Chart

```bash
# Add the vLLM Helm repository if you haven't already
helm repo add vllm https://vllm-project.github.io/production-stack

# Deploy the vLLM stack with tool calling support using the example configuration
helm install vllm-tool vllm/vllm-stack -f tutorials/assets/values-09-tool-enabled.yaml
```

The deployment will:

1. Use the PersistentVolume created in step 1 to access the templates
2. Mount the templates at `/templates` in the container
3. Configure the model to use the specified template for tool calling

You can verify the deployment with:

```bash
# Check the deployment status
kubectl get deployments

# Check the pods
kubectl get pods

# Check the logs
kubectl logs -f deployment/vllm-tool-llama3-8b-deployment-vllm
```

### 4. Test Tool Calling Setup

Now that the deployment is running, let's test the tool calling functionality using the example script.

#### 4.1: Port-Forward the Router Service

First, set up port forwarding to access the router service:

```bash
# Get the service name
kubectl get svc

# Set up port forwarding to the router service
kubectl port-forward svc/vllm-tool-router-service 8000:80
```

#### 4.2: Run the Example Script

In a new terminal, run the example script to test tool calling:

```bash
# Navigate to the examples directory
cd src/examples

# Run the example script
python tool_calling_example.py
```

The script will:

1. Connect to the vLLM service through the port-forwarded endpoint
2. Send a test query asking about the weather
3. Demonstrate the model's ability to:
   - Understand the available tools
   - Make appropriate tool calls
   - Process the tool responses

Expected output should look something like:

```text
Function called: get_weather
Arguments: {"location": "San Francisco, CA", "unit": "celsius"}
Result: Getting the weather for San Francisco, CA in celsius...
```

This confirms that:

1. The vLLM service is running correctly
2. Tool calling is properly enabled
3. The model can understand and use the defined tools
4. The template system is working as expected

> **Note**: The example uses a mock weather function for demonstration. In a real application, you would replace this with actual API calls to a weather service.
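As an additional check that does not depend on the example script, you can call the OpenAI-compatible endpoint directly with `curl`. This is a minimal sketch: the `get_weather` tool definition here is illustrative, and it assumes the port-forward from step 4.1 is active and that the served model name matches `modelURL` from the values file:

```bash
# Minimal tool-calling request against the port-forwarded router.
# Assumes the served model name matches modelURL in values-09-tool-enabled.yaml;
# the get_weather tool schema below is a hypothetical example.
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "messages": [
      {"role": "user", "content": "What is the weather in San Francisco, CA in celsius?"}
    ],
    "tools": [
      {
        "type": "function",
        "function": {
          "name": "get_weather",
          "description": "Get the current weather for a location",
          "parameters": {
            "type": "object",
            "properties": {
              "location": {"type": "string", "description": "City and state, e.g. San Francisco, CA"},
              "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
            },
            "required": ["location"]
          }
        }
      }
    ],
    "tool_choice": "auto"
  }'
```

If tool calling is working, the assistant message in the response carries a `tool_calls` array (function name plus JSON arguments) instead of plain text content.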
diff --git a/tutorials/assets/values-09-tool-enabled.yaml b/tutorials/assets/values-09-tool-enabled.yaml
new file mode 100644
index 000000000..e8390fedf
--- /dev/null
+++ b/tutorials/assets/values-09-tool-enabled.yaml
@@ -0,0 +1,48 @@
+servingEngineSpec:
+  runtimeClassName: ""
+  modelSpec:
+    - name: "llama3-8b"
+      repository: "vllm/vllm-openai"
+      tag: "latest"
+      modelURL: "meta-llama/Llama-3.1-8B-Instruct"
+
+      # Tool calling configuration
+      enableTool: true
+      toolCallParser: "llama3_json"  # Parser to use for tool calls (e.g., "llama3_json" for Llama models)
+      chatTemplate: "/templates/tool_chat_template_llama3.1_json.jinja"  # Full path to template file
+
+      # Mount Hugging Face credentials and vLLM configuration
+      env:
+        - name: HUGGING_FACE_HUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: huggingface-credentials
+              key: HUGGING_FACE_HUB_TOKEN
+        - name: VLLM_TEMPLATE_DIR
+          value: "/templates"
+
+      replicaCount: 1
+
+      # Resource requirements for Llama-3.1-8B-Instruct
+      # resources:
+      #   requests:
+      #     cpu: 8
+      #     memory: "32Gi"
+      #     nvidia.com/gpu: 1
+      #   limits:
+      #     cpu: 8
+      #     memory: "32Gi"
+      #     nvidia.com/gpu: 1
+      requestCPU: 8
+      requestMemory: "32Gi"
+      requestGPU: 1
+
+      # vLLM configuration
+      vllmConfig:
+        maxModelLen: 4096
+        dtype: "bfloat16"
+        tensorParallelSize: 1
+
+# Disable shared storage
+# sharedStorage:
+#   enabled: false
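Before installing, the values file can be sanity-checked by rendering the chart locally. A small sketch, assuming a local checkout of production-stack with the chart under `helm/`:

```bash
# Render the chart with the tool-calling values and confirm the new
# flags appear in the generated container args (the ./helm chart path
# assumes a local checkout rather than the packaged repo chart).
helm template vllm-tool ./helm -f tutorials/assets/values-09-tool-enabled.yaml \
  | grep -E -A 1 "enable-auto-tool-choice|tool-call-parser"
```

The output should show `--enable-auto-tool-choice` and `--tool-call-parser` followed by `llama3_json`, confirming that `enableTool` and `toolCallParser` are picked up by `deployment-vllm-multi.yaml`.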