diff --git a/NOTICES.md b/NOTICES.md
deleted file mode 100644
index 9b0d33945..000000000
--- a/NOTICES.md
+++ /dev/null
@@ -1,13 +0,0 @@
-NOTICES
-============
-
-Triton Deja-vu Copyright 2025 IBM Corporation
-
-This product includes software developed at [IBM Corporation](http://www.ibm.com/).
-
-IBM and the IBM logo are trademarks of International Business Machines
-Corporation, registered in many jurisdictions worldwide. Other product and
-service names might be trademarks of IBM or other companies. A current list
-of IBM trademarks is available on [ibm.com/trademark](http://ibm.com/trademark).
-
-
diff --git a/README.md b/README.md
index a2b4ace93..f39a43ad2 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,12 @@
 # vllm-triton-backend
 
+:information_source: This repository was used to develop the now community-maintained [Triton Backend in vLLM V1 (`triton_attn`)](https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/triton_attn.py). We consider the testing and microbenchmark scripts, as well as the development tools (UBI container, proton viewer), still useful (and use them ourselves), but the latest Triton attention kernels are now maintained and developed in vLLM: [`vllm/vllm/attention/ops/`](https://github.com/vllm-project/vllm/tree/main/vllm/attention/ops). The kernels contained in this repository (`vllm-triton-backend/ibm-triton-lib`) are updated only on an irregular basis.
+We may archive this repository in the near future.
+
+
+* * *
+
+
 This repo contains:
 
 - A Triton-only attention backend for vLLM, implemented as [vLLM platform plugin](https://docs.vllm.ai/en/latest/design/plugin_system.html), see [`ibm-triton-lib/ibm_triton_lib/backend`](./ibm-triton-lib/ibm_triton_lib/backend/).
diff --git a/doc/anatomy_of_a_triton_attention_kernel_ibm.pdf b/doc/anatomy_of_a_triton_attention_kernel_ibm.pdf
new file mode 100644
index 000000000..f11f989fc
Binary files /dev/null and b/doc/anatomy_of_a_triton_attention_kernel_ibm.pdf differ
diff --git a/scripts/offline_inference.py b/scripts/offline_inference.py
index d29412d70..7f308039d 100644
--- a/scripts/offline_inference.py
+++ b/scripts/offline_inference.py
@@ -41,8 +41,7 @@
 from vllm.distributed import cleanup_dist_env_and_memory
 
 llm = LLM(
-    # model="/mnt/nvme5n1p1/zrlngl/fmaas/models/llama3.1-8b-instruct/",
-    model="/net/storage149/autofs/css22/nmg/models/hf/meta-llama/Llama-3.1-8B-Instruct/main/",
+    model="./models/hf/meta-llama/Llama-3.1-8B-Instruct/main/",
     # max_model_len=2048,
     # enforce_eager=True,
     enable_prefix_caching=False,
diff --git a/triton-dejavu b/triton-dejavu
index c2555ce1a..814e6bef9 160000
--- a/triton-dejavu
+++ b/triton-dejavu
@@ -1 +1 @@
-Subproject commit c2555ce1a61d2288007366b2dcef1203ed1f26ee
+Subproject commit 814e6bef9ef695e3384d2c9ebaf3babb9f1a4579
diff --git a/vllm b/vllm
index d91278181..8f4b313c3 160000
--- a/vllm
+++ b/vllm
@@ -1 +1 @@
-Subproject commit d91278181d89686b73b2ec88c2db4d55c6c506cb
+Subproject commit 8f4b313c3790844d2d6ec9aeaa6dd0825c94752e
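
The `scripts/offline_inference.py` hunk replaces a cluster-specific model path with the repository-relative `./models/hf/meta-llama/Llama-3.1-8B-Instruct/main/`. A minimal sketch of populating that path before running the script, assuming `huggingface_hub` is installed and your account has been granted access to the gated Llama weights (any other way of placing the model files there works equally well):

```python
# Sketch: download Llama-3.1-8B-Instruct into the relative directory that
# scripts/offline_inference.py now expects. Assumes `pip install huggingface_hub`
# and prior access approval for the gated meta-llama repo (HF token configured).
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="meta-llama/Llama-3.1-8B-Instruct",
    local_dir="./models/hf/meta-llama/Llama-3.1-8B-Instruct/main",
)
```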