From a344e798f6e65efa6a7cba5fd26955ea7d5bf82c Mon Sep 17 00:00:00 2001 From: yingxudeng Date: Mon, 24 Nov 2025 17:30:23 +0800 Subject: [PATCH 1/3] feat: enhance Qwen3-MoE to support TP settings beyond 4. --- .../npu/npu_qwen3_moe_decoder_layer_impl.cpp | 34 ++++++++++++++----- .../npu/npu_qwen3_moe_decoder_layer_impl.h | 1 + 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp b/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp index c2a840f91..5ba88dc32 100755 --- a/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp +++ b/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp @@ -233,6 +233,8 @@ NpuQwen3MoeDecoderLayerImpl::NpuQwen3MoeDecoderLayerImpl( CHECK_EQ(parallel_args.world_size(), dp_size_ * dp_local_tp_size_); dp_local_tp_rank_ = parallel_args.rank() % dp_local_tp_size_; + n_kv_heads_ = static_cast(model_args.n_kv_heads().value()); + param_from_args(prefill_param_, model_args, parallel_args, true); param_from_args(decode_param_, model_args, parallel_args, false); initialize_tensors(options); @@ -345,8 +347,8 @@ void NpuQwen3MoeDecoderLayerImpl::initialize_basic_parameters( param.rmsnormQKNorm = true; param.hiddenSizePerAttentionHead = args.head_dim(); std::optional optionalValue = args.n_kv_heads(); - param.numKeyValueHeadsPerRank = - static_cast(optionalValue.value()) / parallel_args.world_size(); + param.numKeyValueHeadsPerRank = std::max( + 1, static_cast(optionalValue.value()) / parallel_args.world_size()); param.numAttentionHeadsPerRank = args.n_heads() / dp_local_tp_size_; param.attnLinearTransposeType = {1, -1, -1, 1, -1, -1}; @@ -390,8 +392,15 @@ void NpuQwen3MoeDecoderLayerImpl::initialize_mlp_parameters( void NpuQwen3MoeDecoderLayerImpl::initialize_parallel_parameters( atb_speed::qwen::MoeDecoderLayerParam& param, const ParallelArgs& parallel_args) { - param.lmHeadLocalTp = 0; + param.lmHeadLocalTp = dp_local_tp_size_; param.mapping = parallel_args.mapping(); + param.tensorParallelInfo = {parallel_args.rank(), + parallel_args.world_size(), + FLAGS_communication_backend, + FLAGS_rank_tablefile, + nullptr, + ""}; + param.maxDecodeDpTokenSize = 0; // TODO } @@ -543,13 +552,22 @@ void NpuQwen3MoeDecoderLayerImpl::process_general_weights( const int index = get_mapped_index(name, weight_mapping); const bool is_sharded = shard_map.count(index); torch::Tensor tmp_tensor; + int32_t tp_rank = dp_local_tp_rank_; + int32_t tp_size = dp_local_tp_size_; + + if (index == IN_QKV_WEIGHT_1 || index == IN_QKV_WEIGHT_2 || + index == IN_QKV_BIAS_1 || index == IN_QKV_BIAS_2 || + index == IN_QKV_DESCALE_1 || index == IN_QKV_DESCALE_2) { + if (n_kv_heads_ < dp_local_tp_size_) { + int32_t repeat_times = (dp_local_tp_size_ / n_kv_heads_); + tp_rank = tp_rank / repeat_times; + tp_size = n_kv_heads_; + } + } if (is_sharded) { - tmp_tensor = get_sharded_tensor(state_dict, - name, - shard_map.at(index), - dp_local_tp_rank_, - dp_local_tp_size_) + tmp_tensor = get_sharded_tensor( + state_dict, name, shard_map.at(index), tp_rank, tp_size) .to(device_); } else { tmp_tensor = tensor.to(device_); diff --git a/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.h b/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.h index 5f76cf0ae..45bb21e15 100644 --- a/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.h +++ b/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.h @@ -190,6 +190,7 @@ class NpuQwen3MoeDecoderLayerImpl : public NpuBaseLayer { int32_t start_expert_id_; int32_t end_expert_id_; int32_t ep_rank_; + int32_t n_kv_heads_; int32_t dp_size_; int32_t dp_local_tp_size_; From f04b64d88dc34c51627adbf986400b063c8734e0 Mon Sep 17 00:00:00 2001 From: yingxudeng Date: Tue, 25 Nov 2025 18:18:05 +0800 Subject: [PATCH 2/3] bugfix: resolve qwen3-moe quantization inference errors with TP > 4. --- xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp b/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp index 5ba88dc32..36577ced6 100755 --- a/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp +++ b/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp @@ -557,7 +557,9 @@ void NpuQwen3MoeDecoderLayerImpl::process_general_weights( if (index == IN_QKV_WEIGHT_1 || index == IN_QKV_WEIGHT_2 || index == IN_QKV_BIAS_1 || index == IN_QKV_BIAS_2 || - index == IN_QKV_DESCALE_1 || index == IN_QKV_DESCALE_2) { + index == IN_QKV_DESCALE_1 || index == IN_QKV_DESCALE_2 || + index == IN_QKV_OFFSET_1 || index == IN_QKV_OFFSET_2 || + index == IN_QKV_SCALE_1 || index == IN_QKV_SCALE_2) { if (n_kv_heads_ < dp_local_tp_size_) { int32_t repeat_times = (dp_local_tp_size_ / n_kv_heads_); From 7a7441e29cc4027d2e46dfe1062130591c6b8be5 Mon Sep 17 00:00:00 2001 From: yingxudeng Date: Tue, 25 Nov 2025 18:47:18 +0800 Subject: [PATCH 3/3] refactor: optimize QKV tensor index lookup using std::unordered_set. --- .../npu/npu_qwen3_moe_decoder_layer_impl.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp b/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp index 36577ced6..ffdf3792b 100755 --- a/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp +++ b/xllm/core/layers/npu/npu_qwen3_moe_decoder_layer_impl.cpp @@ -17,6 +17,8 @@ limitations under the License. #include +#include + #include "common/global_flags.h" namespace xllm { @@ -555,11 +557,18 @@ void NpuQwen3MoeDecoderLayerImpl::process_general_weights( int32_t tp_rank = dp_local_tp_rank_; int32_t tp_size = dp_local_tp_size_; - if (index == IN_QKV_WEIGHT_1 || index == IN_QKV_WEIGHT_2 || - index == IN_QKV_BIAS_1 || index == IN_QKV_BIAS_2 || - index == IN_QKV_DESCALE_1 || index == IN_QKV_DESCALE_2 || - index == IN_QKV_OFFSET_1 || index == IN_QKV_OFFSET_2 || - index == IN_QKV_SCALE_1 || index == IN_QKV_SCALE_2) { + static const std::unordered_set qkv_tensor_indices = {IN_QKV_WEIGHT_1, + IN_QKV_WEIGHT_2, + IN_QKV_BIAS_1, + IN_QKV_BIAS_2, + IN_QKV_DESCALE_1, + IN_QKV_DESCALE_2, + IN_QKV_OFFSET_1, + IN_QKV_OFFSET_2, + IN_QKV_SCALE_1, + IN_QKV_SCALE_2}; + + if (qkv_tensor_indices.count(index) > 0) { if (n_kv_heads_ < dp_local_tp_size_) { int32_t repeat_times = (dp_local_tp_size_ / n_kv_heads_);