15 changes: 1 addition & 14 deletions helm/Chart.yaml
@@ -15,20 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0

keywords:
  - vllm
  - lora
  - adapter
home: https://github.com/vllm-project/production-stack
sources:
  - https://github.com/vllm-project/production-stack
version: 0.1.1

maintainers:
  - name: apostac

# Specifies that CRDs should be created/updated first
annotations:
  "helm.sh/hook": pre-install,pre-upgrade
  "helm.sh/hook-weight": "-5"
2 changes: 1 addition & 1 deletion helm/templates/deployment-lora-controller.yaml
@@ -1,4 +1,4 @@
{{- if .Values.loraController.enabled -}}
{{- if and .Values.loraController .Values.loraController.enabled -}}
apiVersion: apps/v1
kind: Deployment
metadata:
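A note on the guard change in this and the templates below: Helm aborts rendering when a template dereferences a field on a values key that does not exist, which matters here because the loraController defaults are removed from values.yaml later in this PR. A minimal sketch of the failure mode (values content illustrative):

# values.yaml that omits the loraController block entirely
servingEngineSpec:
  modelSpec: []
# Old guard: {{- if .Values.loraController.enabled -}}
#   -> render error ("nil pointer evaluating interface {}.enabled")
# New guard: {{- if and .Values.loraController .Values.loraController.enabled -}}
#   -> `and` short-circuits on the missing map, so the manifest is simply skipped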
39 changes: 22 additions & 17 deletions helm/templates/deployment-vllm-multi.yaml
@@ -54,6 +54,7 @@ spec:
containers:
- name: "vllm"
image: "{{ required "Required value 'modelSpec.repository' must be defined !" $modelSpec.repository }}:{{ required "Required value 'modelSpec.tag' must be defined !" $modelSpec.tag }}"

command:
- "vllm"
- "serve"
@@ -65,6 +66,13 @@ spec:
{{- if $modelSpec.enableLoRA }}
- "--enable-lora"
{{- end }}
{{- if $modelSpec.enableTool }}
- "--enable-auto-tool-choice"
{{- end }}
{{- if $modelSpec.toolCallParser }}
- "--tool-call-parser"
- {{ $modelSpec.toolCallParser | quote }}
{{- end }}
{{- with $modelSpec.vllmConfig }}
{{- if hasKey . "enableChunkedPrefill" }}
- "--enable-chunked-prefill"
@@ -99,7 +107,7 @@ spec:
{{- end }}
{{- if $modelSpec.chatTemplate }}
- "--chat-template"
- "/chat_templates/chat-template.jinga"
- {{ $modelSpec.chatTemplate | quote }}
{{- end }}
{{- if .Values.servingEngineSpec.containerSecurityContext }}
securityContext:
@@ -194,23 +202,19 @@ spec:
- name: {{ .Release.Name }}-storage
mountPath: /data
{{- end }}
{{- if .Values.loraController.enabled }}
- name: shared-model-storage
mountPath: /models
{{- end }}
{{- with $modelSpec.vllmConfig }}
{{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}}
- name: shm
mountPath: /dev/shm
{{- end}}
{{- end}}
{{- if $modelSpec.chatTemplate }}
- name: {{ .Release.Name }}-chat-templates
mountPath: /chat_templates
{{- end}}
- name: vllm-templates
mountPath: /templates
{{- end }}
{{- if hasKey $modelSpec "extraVolumeMounts" }}
{{- toYaml $modelSpec.extraVolumeMounts | nindent 10 }}
{{- end}}
{{- end }}
{{- if $modelSpec.imagePullSecret }}
imagePullSecrets:
- name: {{ $modelSpec.imagePullSecret }}
@@ -223,11 +227,6 @@ spec:
persistentVolumeClaim:
claimName: "{{ .Release.Name }}-{{$modelSpec.name}}-storage-claim"
{{- end }}
{{- if .Values.loraController.enabled }}
- name: shared-model-storage
persistentVolumeClaim:
claimName: {{ .Release.Name }}-shared-storage-claim
{{- end }}
{{- with $modelSpec.vllmConfig }}
{{- if hasKey $modelSpec.vllmConfig "tensorParallelSize"}}
- name: shm
@@ -237,9 +236,15 @@ spec:
{{- end}}
{{- end}}
{{- if $modelSpec.chatTemplate}}
{{- if hasKey $modelSpec "chatTemplateConfigMap" }}
- name: {{ .Release.Name }}-chat-templates
configMap:
name: "{{ .Release.Name }}-{{$modelSpec.name}}-chat-templates"
{{- else }}
- name: vllm-templates
persistentVolumeClaim:
claimName: vllm-templates-pvc
{{- end }}
{{- end}}
{{- if hasKey $modelSpec "extraVolumes" }}
{{- toYaml $modelSpec.extraVolumes | nindent 8 }}
@@ -266,16 +271,16 @@ spec:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- end }}
{{- if $modelSpec.chatTemplate }}
{{- if and $modelSpec.chatTemplate (hasKey $modelSpec "chatTemplateConfigMap") }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: "{{ .Release.Name }}-{{$modelSpec.name}}-chat-templates"
namespace: "{{ .Release.Namespace }}"
data:
chat-template.jinga: |-
{{ $modelSpec.chatTemplate}}
{{ $modelSpec.chatTemplate }}: |-
{{ $modelSpec.chatTemplateConfigMap }}
{{- end }}
{{- end }}
---
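Taken together, these hunks leave two ways to supply a chat template: inline through a per-model ConfigMap (rendered only when `chatTemplateConfigMap` holds the template body, keyed by the `chatTemplate` value), or from the shared `vllm-templates-pvc` populated by `scripts/setup_vllm_templates.sh` and mounted at /templates. A hedged values sketch of both modes (names and template body illustrative):

servingEngineSpec:
  modelSpec:
  # Mode 1: template body shipped inline via ConfigMap
  - name: "llama3-inline"
    chatTemplate: "my-template.jinja"  # becomes the ConfigMap data key and the --chat-template argument;
                                       # must be a bare file name here (ConfigMap keys cannot contain '/')
    chatTemplateConfigMap: |
      {% for message in messages %}{{ message.content }}{% endfor %}
  # Mode 2: template pre-loaded into the shared PVC by the setup script
  - name: "llama3-pvc"
    chatTemplate: "/templates/tool_chat_template_llama3.1_json.jinja"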
2 changes: 1 addition & 1 deletion helm/templates/loraadapter-crd.yaml
@@ -1,4 +1,4 @@
{{- if .Values.loraController.enabled -}}
{{- if and .Values.loraController .Values.loraController.enabled -}}
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
2 changes: 1 addition & 1 deletion helm/templates/rbac-lora-controller.yaml
@@ -1,4 +1,4 @@
{{- if .Values.loraController.enabled -}}
{{- if and .Values.loraController .Values.loraController.enabled -}}
---
apiVersion: v1
kind: ServiceAccount
2 changes: 1 addition & 1 deletion helm/templates/shared-storage.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{{- if or .Values.sharedStorage.enabled .Values.loraController.enabled }}
{{- if or (and .Values.loraController .Values.loraController.enabled) (and .Values.sharedStorage .Values.sharedStorage.enabled) }}
---
apiVersion: v1
kind: PersistentVolume
38 changes: 0 additions & 38 deletions helm/values.yaml
@@ -329,41 +329,3 @@ routerSpec:
# httpGet:
# # -- Path to access on the HTTP server
#

loraController:
  enabled: false
  replicaCount: 1
  image:
    repository: lmcache/lora-controller
    tag: latest
    pullPolicy: Always
  resources:
    limits:
      cpu: 500m
      memory: 512Mi
    requests:
      cpu: 100m
      memory: 128Mi
  podAnnotations: {}
  podSecurityContext: {}
  securityContext: {}
  nodeSelector: {}
  tolerations: []
  affinity: {}
  extraEnv: []
  extraVolumes: []
  extraVolumeMounts: []
  imagePullSecrets: []

# Shared storage configuration for LoRA adapters
sharedStorage:
  enabled: false
  storageClass: "standard"
  size: "100Gi"
  accessModes:
    - ReadWriteMany
  # Use either hostPath or nfs configuration
  hostPath: ""
  nfs:
    server: ""
    path: ""
113 changes: 113 additions & 0 deletions scripts/setup_vllm_templates.sh
@@ -0,0 +1,113 @@
#!/bin/bash

# Exit on error
set -e

# Configuration
PV_NAME="vllm-templates-pv" # Fixed name for template storage
PVC_NAME="vllm-templates-pvc" # Fixed name for template storage
STORAGE_SIZE="1Gi"
STORAGE_CLASS="standard"
HOST_TEMPLATES_DIR="/mnt/templates" # This is where the PV will be mounted on the host
TEMP_POD_NAME="vllm-templates-setup"

echo "Setting up vLLM templates..."
echo "Using PV: $PV_NAME"
echo "Using PVC: $PVC_NAME"

# Check if host directory exists and create if needed
echo "Checking host directory at $HOST_TEMPLATES_DIR..."
if [ ! -d "$HOST_TEMPLATES_DIR" ]; then
  echo "Creating host directory at $HOST_TEMPLATES_DIR..."
  sudo mkdir -p "$HOST_TEMPLATES_DIR"
  sudo chmod 777 "$HOST_TEMPLATES_DIR" # Allow read/write access
fi

# Verify directory permissions
if [ ! -w "$HOST_TEMPLATES_DIR" ]; then
  echo "Error: No write permission to $HOST_TEMPLATES_DIR"
  echo "Please ensure you have sudo access or the directory has proper permissions"
  exit 1
fi

# Create PersistentVolume with ReadWriteMany
echo "Creating PersistentVolume..."
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolume
metadata:
  name: $PV_NAME
spec:
  capacity:
    storage: $STORAGE_SIZE
  accessModes:
    - ReadWriteMany
  hostPath:
    path: $HOST_TEMPLATES_DIR  # Must match the host directory prepared above
    type: DirectoryOrCreate
  storageClassName: $STORAGE_CLASS
EOF

# Create PersistentVolumeClaim with ReadWriteMany
echo "Creating PersistentVolumeClaim..."
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: $PVC_NAME
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: $STORAGE_SIZE
  storageClassName: $STORAGE_CLASS
EOF

# Create temporary pod to clone and extract templates
echo "Creating temporary pod to download templates..."
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: $TEMP_POD_NAME
spec:
  containers:
  - name: template-setup
    image: ubuntu:latest
    command: ["/bin/bash", "-c"]
    args:
    - |
      apt-get update && apt-get install -y git
      git clone https://github.com/vllm-project/vllm.git /tmp/vllm
      cp /tmp/vllm/examples/*.jinja /templates/
      rm -rf /tmp/vllm
    volumeMounts:
    - name: templates-volume
      mountPath: /templates
  volumes:
  - name: templates-volume
    persistentVolumeClaim:
      claimName: $PVC_NAME
  restartPolicy: Never
EOF

# Wait for pod to complete
echo "Waiting for template download to complete..."
kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/$TEMP_POD_NAME --timeout=300s
# Delete the temporary pod
echo "Cleaning up temporary pod..."
kubectl delete pod $TEMP_POD_NAME

# Verify the setup
echo "Verifying setup..."
kubectl get pv $PV_NAME
kubectl get pvc $PVC_NAME

# List downloaded templates
echo "Downloaded templates:"
ls -l "$HOST_TEMPLATES_DIR"

echo "Setup complete! The templates are now available in the PersistentVolume."
echo "Host path: $HOST_TEMPLATES_DIR"
echo "You can now deploy the vLLM stack with tool calling support."
63 changes: 63 additions & 0 deletions src/examples/tool_calling_example.py
@@ -0,0 +1,63 @@
import json

import openai


def get_weather(location: str, unit: str):
    """Mock weather function for demonstration."""
    return f"Getting the weather for {location} in {unit}..."


def main():
    # Configure OpenAI (uses the legacy openai<1.0 module-level client)
    openai.api_base = "http://localhost:8000/v1"
    openai.api_key = "dummy"  # Not needed for local vLLM server

    # Define the tools that the model can use
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City and state, e.g., 'San Francisco, CA'",
                        },
                        "unit": {
                            "type": "string",
                            "enum": ["celsius", "fahrenheit"],
                            "description": "The unit of temperature to use",
                        },
                    },
                    "required": ["location", "unit"],
                },
            },
        }
    ]

    # Make a request to the model
    response = openai.ChatCompletion.create(
        model="meta-llama/Llama-3.1-8B-Instruct",  # Use the model we deployed
        messages=[
            {"role": "user", "content": "What's the weather like in San Francisco?"}
        ],
        tools=tools,
        tool_choice="auto",
    )

    # Extract and process the tool call
    tool_call = response.choices[0].message.tool_calls[0].function
    print(f"Function called: {tool_call.name}")
    print(f"Arguments: {tool_call.arguments}")

    # Execute the tool with the provided arguments
    result = get_weather(**json.loads(tool_call.arguments))
    print(f"Result: {result}")


if __name__ == "__main__":
    main()
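Assuming the serving engine is reachable at localhost:8000 (for example via kubectl port-forward) and a tool-calling-enabled model is deployed, the example runs directly with python. Note that it targets the legacy pre-1.0 `openai` client interface (`openai.api_base` / `openai.ChatCompletion`), so it requires an `openai<1.0` install.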