Add an A100 test path to the Buildkite CI configuration.

* benchmark-pipeline.yaml: the benchmark pod gets priorityClassName
  `perf-benchmark`.
* test-pipeline.yaml: new "A100 status" step (`gpu: a100`) that runs
  `nvidia-smi`.
* test-template-aws.j2: a step whose `gpu` is "a100" is rendered as a
  Kubernetes pod on `a100-queue` (priorityClassName `ci`, GPU limit 8, node
  selector NVIDIA-A100-SXM4-80GB, memory-backed volume mounted at /dev/shm);
  every other step falls through to the pre-existing branch.

In the pod's `bash -c` argument the `safe` filter is applied to the whole
`step.command or (...)` expression. Jinja filters bind tighter than `or`, so
without the outer parentheses only the joined `step.commands` fallback would
be marked safe, unlike the working-directory expression on the same line.

NOTE(review): the inner single quotes in that argument are part of the value
handed to the container; presumably the Kubernetes plugin expects this form.
Confirm before changing the quoting.

diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
index 8f12748b68f3..2b25c954b5c5 100644
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -17,6 +17,7 @@ steps:
     plugins:
     - kubernetes:
         podSpec:
+          priorityClassName: perf-benchmark
           containers:
           - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
             command:
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 6439a315e327..203bb9c6f53e 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -172,3 +172,8 @@ steps:
   commands:
   - pip install -r requirements-docs.txt
   - SPHINXOPTS=\"-W\" make html
+
+- label: A100 status
+  gpu: a100
+  commands:
+  - nvidia-smi
diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2
index 01f7ff1e0e2b..08146bf4454c 100644
--- a/.buildkite/test-template-aws.j2
+++ b/.buildkite/test-template-aws.j2
@@ -49,6 +49,51 @@ steps:
     command: bash .buildkite/run-cpu-test.sh
 
   {% for step in steps %}
+  {% if step.gpu == "a100" %}
+  - label: "{{ step.label }}"
+    agents:
+      queue: a100-queue
+    soft_fail: {{ step.soft_fail or false }}
+    {% if step.parallelism %}
+    parallelism: {{ step.parallelism }}
+    {% endif %}
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: ci
+          containers:
+          - image: {{ docker_image }}
+            command: ["bash"]
+            args:
+            - '-c'
+            - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ (step.command or (step.commands | join(' && '))) | safe }}'"
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+  {% else %}
   - label: "{{ step.label }}"
     agents:
       {% if step.label == "Documentation Build" %}
@@ -90,4 +135,5 @@ steps:
             {% endif %}
           volumes:
             - /dev/shm:/dev/shm
+  {% endif %}
   {% endfor %}