diff --git a/deploy/components/inference-gateway/inferencemodel.yaml b/deploy/components/inference-gateway/inferencemodel.yaml
new file mode 100644
index 000000000..f729407e9
--- /dev/null
+++ b/deploy/components/inference-gateway/inferencemodel.yaml
@@ -0,0 +1,9 @@
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: food-review
+spec:
+  modelName: food-review
+  criticality: Critical
+  poolRef:
+    name: vllm-llama3-8b-instruct
diff --git a/deploy/components/inference-gateway/inferencepool.yaml b/deploy/components/inference-gateway/inferencepool.yaml
new file mode 100644
index 000000000..76e19fb4d
--- /dev/null
+++ b/deploy/components/inference-gateway/inferencepool.yaml
@@ -0,0 +1,140 @@
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferencePool
+metadata:
+  name: vllm-llama3-8b-instruct
+spec:
+  targetPortNumber: 8000
+  selector:
+    app: vllm-llama3-8b-instruct
+  extensionRef:
+    name: vllm-llama3-8b-instruct-epp
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-llama3-8b-instruct-epp
+spec:
+  selector:
+    app: vllm-llama3-8b-instruct-epp
+  ports:
+    - protocol: TCP
+      port: 9002
+      targetPort: 9002
+      appProtocol: http2
+  type: ClusterIP
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-llama3-8b-instruct-epp
+  labels:
+    app: vllm-llama3-8b-instruct-epp
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-llama3-8b-instruct-epp
+  template:
+    metadata:
+      labels:
+        app: vllm-llama3-8b-instruct-epp
+    spec:
+      # Conservatively, this timeout should mirror the longest grace period of the pods within the pool
+      terminationGracePeriodSeconds: 130
+      containers:
+      - name: epp
+# image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
+        image: gateway-api-inference-extension/epp:latest
+        imagePullPolicy: IfNotPresent
+        args:
+        - -refreshMetricsInterval
+        - "500ms"
+        - -poolName
+        - "vllm-llama3-8b-instruct"
+        - -v
+        - "4"
+        - --zap-encoder
+        - "json"
+        - -grpcPort
+        - "9002"
+        - -grpcHealthPort
+        - "9003"
+        ports:
+        - containerPort: 9002
+        - containerPort: 9003
+        - name: metrics
+          containerPort: 9090
+        livenessProbe:
+          grpc:
+            port: 9003
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        readinessProbe:
+          grpc:
+            port: 9003
+            service: inference-extension
+          initialDelaySeconds: 5
+          periodSeconds: 10
+---
+kind: Role
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: pod-read
+rules:
+- apiGroups:
+  - "inference.networking.x-k8s.io"
+  resources:
+  - "inferencemodels"
+  verbs:
+  - "get"
+  - "watch"
+  - "list"
+- apiGroups:
+  - ""
+  resources:
+  - "pods"
+  verbs:
+  - "get"
+  - "watch"
+  - "list"
+- apiGroups:
+  - "inference.networking.x-k8s.io"
+  resources:
+  - "inferencepools"
+  verbs:
+  - "get"
+  - "watch"
+  - "list"
+- apiGroups:
+  - "discovery.k8s.io"
+  resources:
+  - "endpointslices"
+  verbs:
+  - "get"
+  - "watch"
+  - "list"
+- apiGroups:
+  - "authentication.k8s.io"
+  resources:
+  - "tokenreviews"
+  verbs:
+  - "create"
+- apiGroups:
+  - "authorization.k8s.io"
+  resources:
+  - "subjectaccessreviews"
+  verbs:
+  - "create"
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: pod-read-binding
+subjects:
+- kind: ServiceAccount
+  name: default
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: pod-read
diff --git a/deploy/components/vllm-sim/deployments.yaml b/deploy/components/vllm-sim/deployments.yaml
index e7c981cfa..16a299f6e 100644
--- a/deploy/components/vllm-sim/deployments.yaml
+++ b/deploy/components/vllm-sim/deployments.yaml
@@ -1,86 +1,29 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm-30801
-  labels:
-    app: vllm-30801
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vllm-30801
-  template:
-    metadata:
-      labels:
-        app: vllm-30801
-        ai-aware-router-pod: "true"
-      annotations:
-        ai-aware-router-address: 127.0.0.1:30801
-    spec:
-      containers:
-      - name: vllm
-        image: vllm-sim/vllm-sim:latest
-        args:
-        - "--port=30801"
-        - "--model=model1"
-        - "--lora=lora1,lora2"
-        ports:
-        - containerPort: 30801
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm-30802
-  labels:
-    app: vllm-30802
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: vllm-30802
-  template:
-    metadata:
-      labels:
-        app: vllm-30802
-        ai-aware-router-pod: "true"
-      annotations:
-        ai-aware-router-address: 127.0.0.1:30802
-    spec:
-      containers:
-      - name: vllm
-        image: vllm-sim/vllm-sim:latest
-        args:
-        - "--port=30802"
-        - "--model=model1"
-        - "--lora=lora1,lora2"
-        ports:
-        - containerPort: 30802
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm-30803
+  name: vllm-sim
   labels:
-    app: vllm-30803
+    app: vllm-llama3-8b-instruct
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: vllm-30803
+      app: vllm-llama3-8b-instruct
   template:
     metadata:
       labels:
-        app: vllm-30803
+        app: vllm-llama3-8b-instruct
         ai-aware-router-pod: "true"
-      annotations:
-        ai-aware-router-address: 127.0.0.1:30803
     spec:
       containers:
       - name: vllm
-        image: vllm-sim/vllm-sim:latest
+        image: quay.io/vllm-d/vllm-sim:0.0.1
+        imagePullPolicy: IfNotPresent
         args:
-        - "--port=30803"
-        - "--model=model2"
-        - "--lora=lora3"
+        - "--port=8000"
+        - "--model=food-review"
+        # - "--lora=lora10,lora20,lora30"
+        # - "--time-to-first-token=500"
         ports:
-        - containerPort: 30803
+        - containerPort: 8000