23 changes: 12 additions & 11 deletions Dockerfile.rocm
@@ -1,15 +1,16 @@
# default base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
ARG BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2"

FROM $BASE_IMAGE

ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
ARG BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2"

RUN echo "Base image is $BASE_IMAGE"

# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"

# Tested and supported base rocm/pytorch images
ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \
ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \
ROCM_6_1_BASE="rocm/pytorch:rocm6.1_ubuntu22.04_py3.10_pytorch_2.1.2"

ARG FA_GFX_ARCHS="gfx90a;gfx942"
RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
@@ -68,15 +69,15 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& export GPU_ARCHS=${FA_GFX_ARCHS} \
&& if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
&& if [ "$BASE_IMAGE" = "$ROCm_5_7_BASE" ]; then \
patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
&& python3 setup.py install \
&& cd ..; \
fi

# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually removed it so that later steps of numpy upgrade can continue
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
RUN if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi

# build triton
@@ -107,11 +108,11 @@ ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
pip install -U -r requirements-rocm.txt \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
&& if [ "$BASE_IMAGE" = "$ROCm_6_0_BASE" ]; then \
patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch; fi \
&& python3 setup.py install \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_C.abi3.so vllm/ \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.abi3.so vllm/ \
&& cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.abi3.so vllm/ \
&& export VLLM_PYTHON_VERSION=$(python -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))") \
&& cp build/lib.linux-x86_64-cpython-${VLLM_PYTHON_VERSION}/vllm/*.so vllm/ \
&& cd ..
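
Note on the new copy step above: instead of hard-coding the `cpython-39` build directory, the image now derives the interpreter tag at build time and globs all extension modules. A minimal sketch of what the exported `VLLM_PYTHON_VERSION` one-liner evaluates to (assuming the Python 3.10 interpreter of the new default base image):

```python
import sys

# Same expression as the VLLM_PYTHON_VERSION one-liner in Dockerfile.rocm:
# concatenate major and minor version, e.g. "310" on Python 3.10.
version_tag = str(sys.version_info.major) + str(sys.version_info.minor)
print(version_tag)  # -> "310"

# Directory the wildcard copy reads the compiled *.so files from:
print(f"build/lib.linux-x86_64-cpython-{version_tag}/vllm/")
```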


1 change: 1 addition & 0 deletions README.md
@@ -27,6 +27,7 @@ Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/

*Latest News* 🔥
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/06] Added ROCm 6.1 support to vLLM.
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2024/01] Added ROCm 6.0 support to vLLM.
5 changes: 4 additions & 1 deletion cmake/utils.cmake
@@ -155,8 +155,11 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
# Find the intersection of the supported + detected architectures to
# set the module architecture flags.
#

set(ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")

set(${GPU_ARCHES})
foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES})
foreach (_ARCH ${ROCM_SUPPORTED_ARCHS})
if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
list(APPEND ${GPU_ARCHES} ${_ARCH})
endif()
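
The new loop above computes an ordered intersection of the hardcoded `ROCM_SUPPORTED_ARCHS` list with the project's supported architectures, instead of relying on whatever `CMAKE_HIP_ARCHITECTURES` detects. As a plain-Python illustration of that logic (the value of `_GPU_SUPPORTED_ARCHES_LIST` below is hypothetical):

```python
# Keep each entry of ROCM_SUPPORTED_ARCHS that also appears in the project's
# supported list, preserving order; this mirrors the foreach/IN_LIST loop.
rocm_supported_archs = ["gfx908", "gfx90a", "gfx942", "gfx1100"]
gpu_supported_arches = {"gfx908", "gfx90a", "gfx942", "gfx1100"}  # hypothetical

gpu_arches = [arch for arch in rocm_supported_archs if arch in gpu_supported_arches]
print(gpu_arches)  # ['gfx908', 'gfx90a', 'gfx942', 'gfx1100']
```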
15 changes: 11 additions & 4 deletions docs/source/getting_started/amd-installation.rst
@@ -11,7 +11,7 @@ Requirements
* OS: Linux
* Python: 3.8 -- 3.11
* GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
* ROCm 6.0 and ROCm 5.7
* ROCm 6.1, 6.0 and ROCm 5.7

Installation options:

@@ -27,7 +27,7 @@ You can build and install vLLM from source.

First, build a docker image from `Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ and launch a docker container from the image.

`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ uses ROCm 6.0 by default, but also supports ROCm 5.7.
`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ uses ROCm 6.1 by default, but also supports ROCm 6.0 and ROCm 5.7.
It provides flexibility to customize the build of docker image using the following arguments:

* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`
@@ -39,18 +39,25 @@ It provides flexibility to customize the build of docker image using the following
Their values can be passed in when running ``docker build`` with ``--build-arg`` options.


To build vllm on ROCm 6.0 for MI200 and MI300 series, you can use the default:
To build vllm on ROCm 6.1 for MI200 and MI300 series, you can use the default:

.. code-block:: console

$ docker build -f Dockerfile.rocm -t vllm-rocm .

To build vllm on ROCm 6.0 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:
To build vllm on ROCm 6.1 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:

.. code-block:: console

$ docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .

To build docker image for vllm on ROCm 6.0, you can specify ``BASE_IMAGE`` as below:

.. code-block:: console

$ docker build --build-arg BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \
-f Dockerfile.rocm -t vllm-rocm .

To build docker image for vllm on ROCm 5.7, you can specify ``BASE_IMAGE`` as below:

.. code-block:: console
5 changes: 4 additions & 1 deletion tests/distributed/test_basic_distributed_correctness.py
@@ -17,6 +17,8 @@
import pytest
import torch

from vllm.utils import is_hip

MODELS = [
os.environ["TEST_DIST_MODEL"],
]
@@ -40,7 +42,8 @@ def test_models(
distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)

backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
enforce_eager = backend_by_env_var == "FLASHINFER"
# TODO revisit after hipgraph is fully supported on multi-gpus
enforce_eager = is_hip() or backend_by_env_var == "FLASHINFER"

with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
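
For context, the `vllm.utils.is_hip` helper imported here (and in the other test changes below) reports whether PyTorch was built for ROCm/HIP. A minimal sketch of such a check, assuming it keys off `torch.version.hip` (not necessarily the exact vLLM implementation):

```python
import torch


def is_hip() -> bool:
    """Return True when PyTorch was built against ROCm/HIP rather than CUDA."""
    # torch.version.hip is a version string on ROCm builds and None otherwise.
    return torch.version.hip is not None
```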
3 changes: 3 additions & 0 deletions tests/distributed/test_chunked_prefill_distributed.py
@@ -16,6 +16,8 @@
import pytest
import torch

from vllm.utils import is_hip

MODELS = [
os.environ["TEST_DIST_MODEL"],
]
@@ -52,6 +54,7 @@ def test_models(
model,
dtype=dtype,
tensor_parallel_size=2,
enforce_eager=is_hip(),
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
21 changes: 21 additions & 0 deletions tests/quantization/test_compressed_tensors.py
@mawong-amd (Contributor) commented on Jun 13, 2024:

The quantization test errors are unrelated to ROCm. We should not unilaterally skip tests that are also broken upstream: when those are fixed, we will lose test coverage for no reason.

The PR author (Collaborator) replied:

(1) I got this error when I ran this test: "ValueError: compressed-tensors quantization is currently not supported in ROCm." ROCm does not support it right now.
(2) We should revisit the test once ROCm supports it. Eventually we will need to borrow PyTorch's practice of bookkeeping all skipped tests and doing periodic parity reviews.

@@ -11,9 +11,14 @@
CompressedTensorsLinearMethod, CompressedTensorsW4A16,
CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8DynamicToken,
CompressedTensorsW8A8StaticTensor)
from vllm.utils import is_hip


def test_compressed_tensors_w8a8_static_setup(vllm_runner):
if is_hip():
pytest.skip(
"compressed-tensors quantization currently not supported in ROCm.")

model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2"
with vllm_runner(model_path, enforce_eager=True) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
@@ -43,6 +48,10 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner):


def test_compressed_tensors_no_enforce_eager(vllm_runner):
if is_hip():
pytest.skip(
"compressed-tensors quantization currently not supported in ROCm.")

model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2"
with vllm_runner(model_path) as llm:
sampling_params = SamplingParams()
@@ -51,6 +60,10 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):


def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
if is_hip():
pytest.skip(
"compressed-tensors quantization currently not supported in ROCm.")

model_path = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
with vllm_runner(model_path, dtype=torch.float16) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
@@ -68,6 +81,10 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128),
])
def test_compressed_tensors_w4a16(vllm_runner, w4a16_args):
if is_hip():
pytest.skip(
"compressed-tensors quantization currently not supported in ROCm.")

model, strategy, group = w4a16_args
with vllm_runner(model) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
@@ -86,6 +103,10 @@ def test_compressed_tensors_w4a16(vllm_runner, w4a16_args):


def test_compressed_tensors_w4a16_marlin24(vllm_runner):
if is_hip():
pytest.skip(
"compressed-tensors quantization currently not supported in ROCm.")

model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
with vllm_runner(model_path) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
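
Following up on the review discussion above about keeping skipped tests easy to audit, a possible alternative to repeating the in-function skip in every test (a sketch only, not what this PR does) is a shared `pytest.mark.skipif` marker:

```python
import pytest

from vllm.utils import is_hip

# A single shared marker keeps the skip condition and reason in one place,
# which makes the skips easy to find and remove once ROCm gains support.
skip_if_rocm = pytest.mark.skipif(
    is_hip(),
    reason="compressed-tensors quantization currently not supported in ROCm.")


@skip_if_rocm
def test_compressed_tensors_w8a8_static_setup(vllm_runner):
    ...
```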
3 changes: 3 additions & 0 deletions tests/test_sharded_state_loader.py
@@ -9,6 +9,7 @@

from vllm import LLM, SamplingParams
from vllm.model_executor.model_loader.loader import ShardedStateLoader
from vllm.utils import is_hip

prompts = [
"Hello, my name is",
@@ -106,6 +107,7 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
enable_lora=enable_lora,
gpu_memory_utilization=gpu_memory_utilization,
tensor_parallel_size=tp_size,
enforce_eager=is_hip(),
))
p.start()
p.join()
@@ -119,6 +121,7 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
gpu_memory_utilization=gpu_memory_utilization,
tensor_parallel_size=tp_size,
load_format="sharded_state",
enforce_eager=is_hip(),
))
p.start()
p.join()