
Commit e93f4cc

Add support for the Qwen3 Next model (a hybrid attention model) (#24526)
Signed-off-by: Tao He <[email protected]> Co-authored-by: Jee Jee Li <[email protected]>
Parent: 2048c4e

29 files changed: +2476 −61 lines
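For context on what this commit enables, here is a minimal offline-inference sketch (not part of the diff); it assumes a vLLM build that includes this change and a transformers release meeting the 4.56.2 floor used below.

from vllm import LLM, SamplingParams

# Checkpoint name taken from the supported-models table added in this commit.
llm = LLM(model="Qwen/Qwen3-Next-80B-A3B-Instruct")
outputs = llm.generate(["Briefly explain hybrid attention."],
                       SamplingParams(temperature=0.0, max_tokens=64))
print(outputs[0].outputs[0].text)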

.yapfignore

Lines changed: 1 addition & 0 deletions
@@ -1 +1,2 @@
 collect_env.py
+vllm/model_executor/layers/fla/ops/*.py

docs/models/supported_models.md

Lines changed: 1 addition & 0 deletions
@@ -403,6 +403,7 @@ th {
 | `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Qwen3NextForCausalLM` | Qwen3.5MoE | `Qwen/Qwen3-Next-80B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `SeedOssForCausalLM` | SeedOss | `ByteDance-Seed/Seed-OSS-36B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ |
 | `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | ✅︎ |

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -228,6 +228,7 @@ fo = "fo"
 ba = "ba"

 [tool.typos.type.py.extend-words]
+ba = "ba"

 [tool.typos.type.cpp]
 extend-glob = ["*.cu"]

tests/models/registry.py

Lines changed: 5 additions & 1 deletion
@@ -326,6 +326,8 @@ def check_available_online(
     "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
     "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
     "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
+    "Qwen3NextForCausalLM": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct",
+                                            min_transformers_version="4.56.2"),
     "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
     "SeedOssForCausalLM": _HfExamplesInfo("ByteDance-Seed/Seed-OSS-36B-Instruct",  # noqa: E501
                                           trust_remote_code=True,
@@ -640,7 +642,9 @@ def check_available_online(
                                      is_available_online=False),
     "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
                                     trust_remote_code=True,
-                                    speculative_model="XiaomiMiMo/MiMo-7B-RL")
+                                    speculative_model="XiaomiMiMo/MiMo-7B-RL"),
+    "Qwen3NextMTP": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct",
+                                    min_transformers_version="4.56.2"),
 }

 _TRANSFORMERS_BACKEND_MODELS = {
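The new registry entries gate the Qwen3-Next examples on transformers >= 4.56.2. As an aside (illustrative only, not vLLM's registry code), a check like the following is all that is needed to skip such an entry on older installations; the helper name is hypothetical.

from importlib.metadata import version

from packaging.version import Version

MIN_TRANSFORMERS_VERSION = "4.56.2"  # value used in the registry entries above

def transformers_is_new_enough(minimum: str = MIN_TRANSFORMERS_VERSION) -> bool:
    # Hypothetical helper: compare the installed transformers release
    # against the minimum required for Qwen3-Next support.
    return Version(version("transformers")) >= Version(minimum)

if not transformers_is_new_enough():
    print("Skipping Qwen3-Next examples: transformers is too old.")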

vllm/config/__init__.py

Lines changed: 46 additions & 10 deletions
@@ -1508,7 +1508,8 @@ def get_layers_start_end_indices(
         if (self.hf_text_config.model_type == "deepseek_mtp"
                 or self.hf_config.model_type == "mimo_mtp"
                 or self.hf_config.model_type == "glm4_moe_mtp"
-                or self.hf_config.model_type == "ernie_mtp"):
+                or self.hf_config.model_type == "ernie_mtp"
+                or self.hf_config.model_type == "qwen3_next_mtp"):
             total_num_hidden_layers = getattr(self.hf_text_config,
                                               "num_nextn_predict_layers", 0)
         else:
@@ -1571,15 +1572,28 @@ def get_num_layers_by_block_type(
         if attn_type_list:
             return sum(t == 1 for t in attn_type_list[start:end])

-        if layers_block_type_value is None and attn_type_list is None:
+        # Hybrid model Qwen3Next
+        layer_types_value = getattr(self.hf_config, "layer_types", None)
+        if layer_types_value is not None:
+            if getattr(block_type, "value", block_type) == "attention":
+                return sum(t == "full_attention"
+                           for t in layer_types_value[start:end])
+            elif getattr(block_type, "value",
+                         block_type) == "linear_attention":
+                return sum(t == "linear_attention"
+                           for t in layer_types_value[start:end])
+            else:
+                return sum(t == getattr(block_type, "value", block_type)
+                           for t in layer_types_value[start:end])
+
+        if (layers_block_type_value is None and attn_type_list is None
+                and layer_types_value is None):
             raise ValueError(
                 "The model is an hybrid without a"
-                "layers_block_type or an attn_type_list in the hf_config,"
-                "cannot determine the num of "
+                "layers_block_type or an attn_type_list, or a layer_types "
+                "in the hf_config, cannot determine the num of "
                 f"{block_type.value} layers")

-        return sum(t == 1 for t in attn_type_list[start:end])
-
     def get_mamba_chunk_size(self) -> Optional[int]:
         """
         Returns the mamba chunk size if it exists
@@ -1866,7 +1880,7 @@ def __post_init__(self):

 SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa",
                             "mlp_speculator", "draft_model", "deepseek_mtp",
-                            "ernie_mtp"]
+                            "ernie_mtp", "qwen3_next_mtp"]


 @config
@@ -2007,7 +2021,15 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
                 "n_predict": n_predict,
                 "architectures": ["ErnieMTPModel"]
             })
-            return hf_config
+
+        if hf_config.model_type == "qwen3_next":
+            hf_config.model_type = "qwen3_next_mtp"
+        if hf_config.model_type == "qwen3_next_mtp":
+            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
+            hf_config.update({
+                "n_predict": n_predict,
+                "architectures": ["Qwen3NextMTP"]
+            })

         return hf_config

@@ -2028,9 +2050,13 @@ def __post_init__(self):
                 (self.target_model_config.hf_text_config.model_type \
                     == "deepseek_v3" or
                     self.target_model_config.hf_text_config.model_type in
-                    ("mimo","ernie4_5_moe")):
+                    ("mimo","ernie4_5_moe", "qwen3_next")):
                 # use the draft model from the same model:
                 self.model = self.target_model_config.model
+                # Align the quantization of draft model for cases such as
+                # --quantization fp8 with a bf16 checkpoint.
+                if not self.quantization:
+                    self.quantization = self.target_model_config.quantization
             elif self.method in ("ngram", "[ngram]"):
                 self.model = "ngram"
             else:
@@ -2140,6 +2166,15 @@ def __post_init__(self):
                         "one layer. Might need some code changes " \
                         "to support multiple layers."
                     )
+            elif (self.draft_model_config.hf_config.model_type ==
+                  "qwen3_next_mtp"):
+                self.method = "qwen3_next_mtp"
+                if self.num_speculative_tokens > 1:
+                    logger.warning(
+                        "All Qwen3Next MTP models only have " \
+                        "one layer. Might need some code changes " \
+                        "to support multiple layers."
+                    )
             else:
                 self.method = "draft_model"
                 raise NotImplementedError(
@@ -2355,7 +2390,8 @@ def num_lookahead_slots(self) -> int:
         return self.num_speculative_tokens

     def use_eagle(self) -> bool:
-        return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp")
+        return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp",
+                               "qwen3_next_mtp")

     def __repr__(self) -> str:
         method = self.method
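The `get_num_layers_by_block_type` change above reads the per-layer `layer_types` list that hybrid models such as Qwen3-Next publish in their HF config, and counts layers of a given kind within a pipeline-parallel slice. A standalone sketch of the same counting idea (assumed names and an illustrative layer pattern, not the vLLM code itself):

# Example layer_types list for a hybrid model: mostly linear attention with a
# full-attention layer every fourth position (pattern chosen for illustration).
layer_types = [
    "linear_attention", "linear_attention", "linear_attention", "full_attention",
    "linear_attention", "linear_attention", "linear_attention", "full_attention",
]

def count_layers(layer_types: list[str], kind: str, start: int, end: int) -> int:
    # The "attention" block type corresponds to "full_attention" entries.
    target = "full_attention" if kind == "attention" else kind
    return sum(t == target for t in layer_types[start:end])

print(count_layers(layer_types, "attention", 0, 8))         # -> 2
print(count_layers(layer_types, "linear_attention", 0, 8))  # -> 6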

vllm/config/compilation.py

Lines changed: 1 addition & 0 deletions
@@ -341,6 +341,7 @@ class CompilationConfig:
         "vllm.short_conv",
         "vllm.linear_attention",
         "vllm.plamo2_mamba_mixer",
+        "vllm.gdn_attention",
     ]

     def compute_hash(self) -> str:

vllm/model_executor/layers/fla/ops/chunk_delta_h.py

Lines changed: 3 additions & 2 deletions
@@ -14,7 +14,7 @@
 from vllm.triton_utils import tl, triton

 from .index import prepare_chunk_indices, prepare_chunk_offsets
-from .op import exp, safe_exp
+from .op import exp
 from .utils import is_nvidia_hopper, use_cuda_graph

 NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8, 16]
@@ -175,12 +175,13 @@ def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
                              boundary_check=(0, 1))

     if USE_G:
+        m_t = (i_t * BT + tl.arange(0, BT)) < T
         last_idx = min((i_t + 1) * BT, T) - 1
         b_g_last = tl.load(g + bos * H + last_idx * H + i_h)
         p_g = tl.make_block_ptr(g + bos * H + i_h, (T, ), (H, ),
                                 (i_t * BT, ), (BT, ), (0, ))
         b_g = tl.load(p_g, boundary_check=(0, ))
-        b_v_new = b_v_new * safe_exp(b_g_last - b_g)[:, None]
+        b_v_new = b_v_new * tl.where(m_t, exp(b_g_last - b_g), 0)[:, None]
         b_g_last = exp(b_g_last)
         b_h1 = b_h1 * b_g_last
         if K > 64:

vllm/model_executor/layers/fla/ops/chunk_o.py

Lines changed: 5 additions & 4 deletions
@@ -16,7 +16,7 @@
 from vllm.triton_utils import tl, triton

 from .index import prepare_chunk_indices
-from .op import exp, safe_exp
+from .op import exp
 from .utils import FLA_GDN_FIX_BT, check_shared_mem, is_nvidia_hopper

 BKV_LIST = [64, 128] if check_shared_mem() else [32, 64]
@@ -112,10 +112,11 @@ def chunk_fwd_kernel_o(
         p_g = tl.make_block_ptr(g, (T, ), (H, ), (i_t * BT, ), (BT, ), (0, ))
         b_g = tl.load(p_g, boundary_check=(0, ))
         b_o = b_o * exp(b_g)[:, None]
-        b_A = b_A * safe_exp(b_g[:, None] - b_g[None, :])
+        b_A = b_A * exp(b_g[:, None] - b_g[None, :])

-    o_i = tl.arange(0, BT)
-    m_A = o_i[:, None] >= o_i[None, :]
+    o_t = i_t * BT + tl.arange(0, BT)
+    m_t = o_t < T
+    m_A = (o_t[:, None] >= o_t[None, :]) & (m_t[:, None] & m_t)
     b_A = tl.where(m_A, b_A, 0)

     p_v = tl.make_block_ptr(v, (T, V), (H * V, 1), (i_t * BT, i_v * BV),
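The kernel change above switches from tile-local indices to global token positions, so the causal mask also zeroes entries that fall past the true sequence length T in the final, partial tile. A NumPy sketch of the same mask construction (illustrative only, outside the Triton kernel):

import numpy as np

BT, i_t, T = 4, 1, 6                 # tile size, tile index, sequence length
o_t = i_t * BT + np.arange(BT)       # global positions in this tile: [4, 5, 6, 7]
m_t = o_t < T                        # in-bounds mask: [True, True, False, False]
m_A = (o_t[:, None] >= o_t[None, :]) & (m_t[:, None] & m_t[None, :])

b_A = np.ones((BT, BT))
# Causal lower-triangular block with rows/columns for positions 6 and 7 zeroed.
print(np.where(m_A, b_A, 0.0))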

vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py

Lines changed: 6 additions & 4 deletions
@@ -14,7 +14,7 @@
 from vllm.triton_utils import tl, triton

 from .index import prepare_chunk_indices
-from .op import safe_exp
+from .op import exp


 @triton.heuristics({
@@ -56,7 +56,8 @@ def chunk_scaled_dot_kkt_fwd_kernel(
         T = eos - bos
     else:
         bos, eos = i_b * T, i_b * T + T
-    o_t = tl.arange(0, BT)
+    o_t = i_t * BT + tl.arange(0, BT)
+    m_t = o_t < T

     p_beta = tl.make_block_ptr(beta + bos * H + i_h, (T, ), (H, ),
                                (i_t * BT, ), (BT, ), (0, ))
@@ -76,9 +77,10 @@ def chunk_scaled_dot_kkt_fwd_kernel(
                                 (i_t * BT, ), (BT, ), (0, ))
         b_g = tl.load(p_g, boundary_check=(0, ))
         b_g_diff = b_g[:, None] - b_g[None, :]
-        b_A = b_A * safe_exp(b_g_diff)
+        b_A = b_A * exp(b_g_diff)

-    b_A = tl.where(o_t[:, None] > o_t[None, :], b_A, 0)
+    m_A = (o_t[:, None] > o_t[None, :]) & (m_t[:, None] & m_t)
+    b_A = tl.where(m_A, b_A, 0)
     p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (T, BT), (BT * H, 1),
                             (i_t * BT, 0), (BT, BT), (1, 0))
     tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1))

vllm/model_executor/layers/fla/ops/fused_recurrent.py

Lines changed: 2 additions & 2 deletions
@@ -116,8 +116,8 @@ def fused_recurrent_gated_delta_rule_fwd_kernel(
         b_g = tl.load(p_g).to(tl.float32)

         if USE_QK_L2NORM_IN_KERNEL:
-            b_q = b_q / (tl.sqrt(tl.sum(b_q * b_q)) + 1e-6)
-            b_k = b_k / (tl.sqrt(tl.sum(b_k * b_k)) + 1e-6)
+            b_q = b_q / tl.sqrt(tl.sum(b_q * b_q) + 1e-6)
+            b_k = b_k / tl.sqrt(tl.sum(b_k * b_k) + 1e-6)
         b_q = b_q * scale
         # [BK, BV]
         b_h *= exp(b_g)
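The change above moves the epsilon inside the square root of the q/k L2 normalization. A small numeric comparison of the two placements (illustrative only, outside the Triton kernel):

import numpy as np

def l2norm_eps_outside(x, eps=1e-6):
    # Previous form: epsilon added to the norm itself.
    return x / (np.sqrt(np.sum(x * x)) + eps)

def l2norm_eps_inside(x, eps=1e-6):
    # New form: epsilon added under the square root.
    return x / np.sqrt(np.sum(x * x) + eps)

x = np.array([3.0, 4.0])
print(l2norm_eps_outside(x))  # ~[0.6, 0.8]
print(l2norm_eps_inside(x))   # ~[0.6, 0.8]; differs from the above only at ~1e-7 relative scale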
