13 changes: 0 additions & 13 deletions NOTICES.md

This file was deleted.

7 changes: 7 additions & 0 deletions README.md
@@ -1,5 +1,12 @@
# vllm-triton-backend

:information_source: This repository was used to develop the now community-maintained [Triton Backend in vLLM V1 (`triton_attn`)](https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/triton_attn.py). We still consider the testing and microbenchmark scripts as well as the development tools (UBI container, proton viewer) useful, and we use them ourselves, but the latest Triton attention kernels are now maintained and developed in vLLM under [`vllm/vllm/attention/ops/`](https://github.com/vllm-project/vllm/tree/main/vllm/attention/ops). The kernels contained in this repository (`vllm-triton-backend/ibm-triton-lib`) are only updated on an irregular basis.
We may archive this repository in the near future.
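
For context, a minimal sketch of selecting that Triton attention backend when running vLLM V1. The environment-variable values below, in particular the backend identifier `TRITON_ATTN_VLLM_V1`, are assumptions and may differ between vLLM releases; check the vLLM documentation for the current names.

```python
import os

# Assumption: these are the knobs for forcing the V1 engine and the Triton
# attention backend; verify against your installed vLLM version.
os.environ["VLLM_USE_V1"] = "1"
os.environ["VLLM_ATTENTION_BACKEND"] = "TRITON_ATTN_VLLM_V1"

from vllm import LLM, SamplingParams

# Any HF model identifier works here; this one is illustrative.
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")
outputs = llm.generate(["Triton kernels are"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```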


* * *


This repo contains:

- A Triton-only attention backend for vLLM, implemented as [vLLM platform plugin](https://docs.vllm.ai/en/latest/design/plugin_system.html), see [`ibm-triton-lib/ibm_triton_lib/backend`](./ibm-triton-lib/ibm_triton_lib/backend/).
Binary file added doc/anatomy_of_a_triton_attention_kernel_ibm.pdf
Binary file not shown.
3 changes: 1 addition & 2 deletions scripts/offline_inference.py
@@ -41,8 +41,7 @@
 from vllm.distributed import cleanup_dist_env_and_memory

 llm = LLM(
-    # model="/mnt/nvme5n1p1/zrlngl/fmaas/models/llama3.1-8b-instruct/",
-    model="/net/storage149/autofs/css22/nmg/models/hf/meta-llama/Llama-3.1-8B-Instruct/main/",
+    model="./models/hf/meta-llama/Llama-3.1-8B-Instruct/main/",
     # max_model_len=2048,
     # enforce_eager=True,
     enable_prefix_caching=False,
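A rough sketch of how the script plausibly continues after constructing `llm`; the prompts and sampling settings are illustrative rather than taken from the diff, but the final cleanup matches the `cleanup_dist_env_and_memory` import shown above.

```python
from vllm import SamplingParams

# Illustrative prompt and sampling settings (not part of the actual script).
sampling_params = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(["The capital of France is"], sampling_params)
for out in outputs:
    print(out.outputs[0].text)

# Release distributed state and GPU memory, using the helper imported
# at the top of the script.
del llm
cleanup_dist_env_and_memory()
```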
2 changes: 1 addition & 1 deletion vllm
Submodule vllm updated 2719 files