File tree Expand file tree Collapse file tree 2 files changed +11
-4
lines changed
vllm/model_executor/layers/fused_moe Expand file tree Collapse file tree 2 files changed +11
-4
lines changed Original file line number Diff line number Diff line change @@ -522,6 +522,13 @@ set_gencode_flags_for_srcs(
522
522
CUDA_ARCHS "${CUDA_ARCHS}")
523
523
524
524
if(VLLM_GPU_LANG STREQUAL "CUDA")
525
+ set(VLLM_MOE_WNA16_SRC
526
+ "csrc/moe/moe_wna16.cu")
527
+
528
+ set_gencode_flags_for_srcs(
529
+ SRCS "${VLLM_MOE_WNA16_SRC}"
530
+ CUDA_ARCHS "${CUDA_ARCHS}")
531
+
525
532
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
526
533
if (MARLIN_MOE_ARCHS)
527
534
set(MARLIN_MOE_SRC
Original file line number Diff line number Diff line change @@ -954,11 +954,11 @@ def get_default_config(
954
954
"num_warps": 4,
955
955
"num_stages": 3,
956
956
}
957
- elif dtype in ["int4_w8a16", "int8_w8a16"] and block_shape is not None:
957
+ elif dtype in ["int4_w4a16", "int8_w8a16"] and block_shape is not None:
958
958
# moe wna16 kernels
959
959
# only set BLOCK_SIZE_M
960
960
# BLOCK_SIZE_N and BLOCK_SIZE_K would be set later
961
- bit = 4 if dtype == "int4_w8a16" else 8
961
+ bit = 4 if dtype == "int4_w4a16" else 8
962
962
use_moe_wna16_cuda = should_moe_wna16_use_cuda(M * topk,
963
963
block_shape[1], E, bit)
964
964
if use_moe_wna16_cuda :
@@ -1003,7 +1003,7 @@ def try_get_optimal_moe_config(
1003
1003
else :
1004
1004
# First try to load optimal config from the file
1005
1005
E , _ , N = w2_shape
1006
- if dtype == "int4_w8a16":
1006
+ if dtype == "int4_w4a16":
1007
1007
N = N * 2
1008
1008
block_n = block_shape[0] if block_shape else 0
1009
1009
block_k = block_shape[1] if block_shape else 0
@@ -1125,7 +1125,7 @@ def get_config_dtype_str(dtype: torch.dtype,
1125
1125
elif use_int8_w8a16 :
1126
1126
return "int8_w8a16"
1127
1127
elif use_int4_w4a16 :
1128
- return "int4_w8a16"
1128
+ return "int4_w4a16"
1129
1129
elif dtype == torch.float:
1130
1130
# avoiding cases where kernel fails when float32 MoE
1131
1131
# use fp16/bfloat16 configs
You can’t perform that action at this time.
0 commit comments