From a344e798f6e65efa6a7cba5fd26955ea7d5bf82c Mon Sep 17 00:00:00 2001
From: yingxudeng <dengyingxu1@jd.com>
Date: Mon, 24 Nov 2025 17:30:23 +0800
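Subject: [PATCH 1/3] feat: enhance Qwen3-MoE to support TP sizes beyond 4.

When dp_local_tp_size_ exceeds n_kv_heads, the affected sharded QKV tensors
are split n_kv_heads ways and consecutive TP ranks load the same shard
(tp_rank / repeat_times). numKeyValueHeadsPerRank is clamped to at least 1.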

---
 .../npu/npu_qwen3_moe_decoder_layer_impl.cpp  | 34 ++++++++++++++-----
 .../npu/npu_qwen3_moe_decoder_layer_impl.h    |  1 +
 2 files changed, 27 insertions(+), 8 deletions(-)
diff --git a/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp b/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp
index c2a840f91..5ba88dc32 100755
--- a/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp
+++ b/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp
@@ -233,6 +233,8 @@ NpuQwen3MoeDecoderLayerImpl::NpuQwen3MoeDecoderLayerImpl(
   CHECK_EQ(parallel_args.world_size(), dp_size_ * dp_local_tp_size_);
   dp_local_tp_rank_ = parallel_args.rank() % dp_local_tp_size_;
 
+  n_kv_heads_ = static_cast<int32_t>(model_args.n_kv_heads().value());
+
   param_from_args(prefill_param_, model_args, parallel_args, true);
   param_from_args(decode_param_, model_args, parallel_args, false);
   initialize_tensors(options);
@@ -345,8 +347,8 @@ void NpuQwen3MoeDecoderLayerImpl::initialize_basic_parameters(
   param.rmsnormQKNorm = true;
   param.hiddenSizePerAttentionHead = args.head_dim();
   std::optional<long int> optionalValue = args.n_kv_heads();
-  param.numKeyValueHeadsPerRank =
-      static_cast<int>(optionalValue.value()) / parallel_args.world_size();
+  param.numKeyValueHeadsPerRank = std::max(
+      1, static_cast<int>(optionalValue.value()) / parallel_args.world_size());
   param.numAttentionHeadsPerRank = args.n_heads() / dp_local_tp_size_;
 
   param.attnLinearTransposeType = {1, -1, -1, 1, -1, -1};
@@ -390,8 +392,15 @@ void NpuQwen3MoeDecoderLayerImpl::initialize_mlp_parameters(
 void NpuQwen3MoeDecoderLayerImpl::initialize_parallel_parameters(
     atb_speed::qwen::MoeDecoderLayerParam& param,
     const ParallelArgs& parallel_args) {
-  param.lmHeadLocalTp = 0;
+  param.lmHeadLocalTp = dp_local_tp_size_;
   param.mapping = parallel_args.mapping();
+  param.tensorParallelInfo = {parallel_args.rank(),
+                              parallel_args.world_size(),
+                              FLAGS_communication_backend,
+                              FLAGS_rank_tablefile,
+                              nullptr,
+                              ""};
+
   param.maxDecodeDpTokenSize = 0;  // TODO
 }
 
@@ -543,13 +552,22 @@ void NpuQwen3MoeDecoderLayerImpl::process_general_weights(
   const int index = get_mapped_index(name, weight_mapping);
   const bool is_sharded = shard_map.count(index);
   torch::Tensor tmp_tensor;
+  int32_t tp_rank = dp_local_tp_rank_;
+  int32_t tp_size = dp_local_tp_size_;
+
+  if (index == IN_QKV_WEIGHT_1 || index == IN_QKV_WEIGHT_2 ||
+      index == IN_QKV_BIAS_1 || index == IN_QKV_BIAS_2 ||
+      index == IN_QKV_DESCALE_1 || index == IN_QKV_DESCALE_2) {
+    if (n_kv_heads_ < dp_local_tp_size_) {
+      int32_t repeat_times = (dp_local_tp_size_ / n_kv_heads_);
 
+      tp_rank = tp_rank / repeat_times;
+      tp_size = n_kv_heads_;
+    }
+  }
   if (is_sharded) {
-    tmp_tensor = get_sharded_tensor(state_dict,
-                                    name,
-                                    shard_map.at(index),
-                                    dp_local_tp_rank_,
-                                    dp_local_tp_size_)
+    tmp_tensor = get_sharded_tensor(
+                     state_dict, name, shard_map.at(index), tp_rank, tp_size)
                      .to(device_);
   } else {
     tmp_tensor = tensor.to(device_);
diff --git a/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.h b/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.h
index 5f76cf0ae..45bb21e15 100644
--- a/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.h
+++ b/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.h
@@ -190,6 +190,7 @@ class NpuQwen3MoeDecoderLayerImpl : public NpuBaseLayer {
   int32_t start_expert_id_;
   int32_t end_expert_id_;
   int32_t ep_rank_;
+  int32_t n_kv_heads_;
 
   int32_t dp_size_;
   int32_t dp_local_tp_size_;

From f04b64d88dc34c51627adbf986400b063c8734e0 Mon Sep 17 00:00:00 2001
From: yingxudeng <dengyingxu1@jd.com>
Date: Tue, 25 Nov 2025 18:18:05 +0800
Subject: [PATCH 2/3] bugfix: resolve qwen3-moe quantization inference errors
 with TP > 4.

---
 xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp b/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp
index 5ba88dc32..36577ced6 100755
--- a/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp
+++ b/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp
@@ -557,7 +557,9 @@ void NpuQwen3MoeDecoderLayerImpl::process_general_weights(
 
   if (index == IN_QKV_WEIGHT_1 || index == IN_QKV_WEIGHT_2 ||
       index == IN_QKV_BIAS_1 || index == IN_QKV_BIAS_2 ||
-      index == IN_QKV_DESCALE_1 || index == IN_QKV_DESCALE_2) {
+      index == IN_QKV_DESCALE_1 || index == IN_QKV_DESCALE_2 ||
+      index == IN_QKV_OFFSET_1 || index == IN_QKV_OFFSET_2 ||
+      index == IN_QKV_SCALE_1 || index == IN_QKV_SCALE_2) {
     if (n_kv_heads_ < dp_local_tp_size_) {
       int32_t repeat_times = (dp_local_tp_size_ / n_kv_heads_);
 

From 7a7441e29cc4027d2e46dfe1062130591c6b8be5 Mon Sep 17 00:00:00 2001
From: yingxudeng <dengyingxu1@jd.com>
Date: Tue, 25 Nov 2025 18:47:18 +0800
Subject: [PATCH 3/3] refactor: optimize QKV tensor index lookup using
 std::unordered_set.

---
 .../npu/npu_qwen3_moe_decoder_layer_impl.cpp  | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp b/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp
index 36577ced6..ffdf3792b 100755
--- a/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp
+++ b/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include <gflags/gflags.h>
 
+#include <unordered_set>
+
 #include "common/global_flags.h"
 
 namespace xllm {
@@ -555,11 +557,18 @@ void NpuQwen3MoeDecoderLayerImpl::process_general_weights(
   int32_t tp_rank = dp_local_tp_rank_;
   int32_t tp_size = dp_local_tp_size_;
 
-  if (index == IN_QKV_WEIGHT_1 || index == IN_QKV_WEIGHT_2 ||
-      index == IN_QKV_BIAS_1 || index == IN_QKV_BIAS_2 ||
-      index == IN_QKV_DESCALE_1 || index == IN_QKV_DESCALE_2 ||
-      index == IN_QKV_OFFSET_1 || index == IN_QKV_OFFSET_2 ||
-      index == IN_QKV_SCALE_1 || index == IN_QKV_SCALE_2) {
+  static const std::unordered_set<int> qkv_tensor_indices = {IN_QKV_WEIGHT_1,
+                                                             IN_QKV_WEIGHT_2,
+                                                             IN_QKV_BIAS_1,
+                                                             IN_QKV_BIAS_2,
+                                                             IN_QKV_DESCALE_1,
+                                                             IN_QKV_DESCALE_2,
+                                                             IN_QKV_OFFSET_1,
+                                                             IN_QKV_OFFSET_2,
+                                                             IN_QKV_SCALE_1,
+                                                             IN_QKV_SCALE_2};
+
+  if (qkv_tensor_indices.count(index) > 0) {
     if (n_kv_heads_ < dp_local_tp_size_) {
       int32_t repeat_times = (dp_local_tp_size_ / n_kv_heads_);