jd-opensource · yingxudeng · Nov 3, 2025 · Nov 6, 2025 · Nov 6, 2025 · Nov 7, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -330,6 +330,7 @@ else()
 endif()
 
 if(USE_NPU)
+  # add_definitions(-DUSE_NPU_TORCH)
   add_definitions(-DUSE_NPU)
   add_definitions(-DBUILD_LIBTORCH)
   add_definitions(-DTORCH_SETCUSTOMHANDLER=ON)
@@ -341,6 +342,7 @@ if(USE_NPU)
       $ENV{PYTORCH_INSTALL_PATH}/include
       $ENV{PYTORCH_INSTALL_PATH}/include/torch/csrc/api/include
       $ENV{PYTORCH_NPU_INSTALL_PATH}/include
+      $ENV{PYTORCH_INSTALL_PATH}/include/torch/csrc/distributed
       $ENV{NPU_HOME_PATH}/include
       $ENV{ATB_HOME_PATH}/include
       $ENV{NPU_HOME_PATH}/opp/vendors/xllm/op_api/include/

diff --git a/cmake/cc_test.cmake b/cmake/cc_test.cmake
@@ -69,6 +69,14 @@ function(cc_test)
     PRIVATE ${CC_TEST_LINKOPTS}
   )
 
+  # When building for NPU, link every test target against the Python runtime
+  # and the torch_npu / torch_python libraries.
+  if(USE_NPU)
+    target_link_libraries(
+      ${CC_TEST_NAME} PRIVATE Python::Python torch_npu torch_python
+    )
+  endif()
+
   add_dependencies(all_tests ${CC_TEST_NAME})
 
   gtest_add_tests(

diff --git a/xllm/CMakeLists.txt b/xllm/CMakeLists.txt
@@ -34,7 +34,7 @@ target_link_libraries(xllm PRIVATE glog::glog brpc leveldb::leveldb ZLIB::ZLIB p
 add_dependencies(xllm brpc-static)
 
 if(USE_NPU)
-  set(COMMON_LIBS Python::Python ascendcl hccl c_sec nnopbase ms_tools_ext)
+  set(COMMON_LIBS Python::Python ascendcl hccl c_sec nnopbase ms_tools_ext torch_npu torch_python)
 elseif(USE_MLU)
   set(COMMON_LIBS Python::Python)
 endif()

diff --git a/xllm/core/common/CMakeLists.txt b/xllm/core/common/CMakeLists.txt
@@ -28,6 +28,7 @@ cc_library(
     absl::random_random
     absl::strings
     torch
+    $<$<BOOL:${USE_NPU}>:torch_python>
     $<$<BOOL:${USE_NPU}>:torch_npu>
     $<$<BOOL:${USE_MSPTI}>:mspti>
     $<$<BOOL:${USE_NPU}>:ms_tools_ext>

diff --git a/xllm/core/common/global_flags.cpp b/xllm/core/common/global_flags.cpp
@@ -410,4 +410,7 @@ DEFINE_bool(
     false,
     "Whether to enable prefetch weight,only applicable to Qwen3-dense model."
     "The default prefetching ratio for gateup weight is 40%."
-    "If adjustments are needed, e.g. export PREFETCH_COEFFOCIENT=0.5");
+    "If adjustments are needed, e.g. export PREFETCH_COEFFOCIENT=0.5");
+
+
+DEFINE_bool(enable_npu_torch, true, "Whether to enable native NPU support.");
diff --git a/xllm/core/common/global_flags.h b/xllm/core/common/global_flags.h
@@ -206,3 +206,6 @@ DECLARE_bool(enable_shm);
 DECLARE_bool(enable_prefetch_weight);
 
 DECLARE_int32(flashinfer_workspace_buffer_size);
+
+// Whether to enable native NPU support; defined in global_flags.cpp.
+DECLARE_bool(enable_npu_torch);
diff --git a/xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt b/xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt
@@ -12,6 +12,7 @@ cc_binary(
     :models
     :model
     :distributed_runtime
+    :parallel_state
     absl::strings
     xllm_kernels
     ascendcl

diff --git a/xllm/core/distributed_runtime/worker_server.cpp b/xllm/core/distributed_runtime/worker_server.cpp
@@ -104,6 +104,12 @@ void WorkerServer::create_server(
   const ParallelArgs* parallel_args = comm.parallel_args();
 #if defined(USE_MLU) || defined(USE_CUDA)
   comm.create_process_groups(master_node_addr, device);
+#elif defined(USE_NPU)
+  // TODO: Refactor to use model_type or other appropriate enumeration for
+  // condition checking
+  if (FLAGS_enable_npu_torch) {
+    comm.create_process_groups(master_node_addr, device);
+  }
 #endif
 
   WorkerType worker_type =

diff --git a/xllm/core/framework/model/CMakeLists.txt b/xllm/core/framework/model/CMakeLists.txt
@@ -18,10 +18,10 @@ set(BASE_DEPS
 if(USE_NPU)
   list(APPEND BASE_DEPS :npu_layers)
   list(APPEND BASE_DEPS :platform_npu)
-else()
-  list(APPEND BASE_DEPS :common_layers)
 endif()
 
+list(APPEND BASE_DEPS :common_layers)
+
 
 # Define the library
 cc_library(

diff --git a/xllm/core/framework/model/causal_lm.h b/xllm/core/framework/model/causal_lm.h
@@ -66,11 +66,13 @@ class CausalLM : public torch::nn::Module {
 
   virtual const torch::TensorOptions& options() const = 0;
 
-  virtual layer::LmHead get_lm_head() = 0;
-  virtual void set_lm_head(layer::LmHead& head) = 0;
-  virtual std::vector<layer::WordEmbedding> get_word_embedding() = 0;
+#if defined(USE_NPU)
+  virtual layer::NpuLmHead get_lm_head() = 0;
+  virtual void set_lm_head(layer::NpuLmHead& head) = 0;
+  virtual std::vector<layer::NpuWordEmbedding> get_word_embedding() = 0;
   virtual void set_word_embedding(
-      std::vector<layer::WordEmbedding>& embedding) = 0;
+      std::vector<layer::NpuWordEmbedding>& embedding) = 0;
+#endif
 };
 
 template <typename Model>
@@ -104,20 +106,22 @@ class CausalLMImpl : public CausalLM {
   virtual void update_expert_weight(int32_t layer_id) {
     return model_->update_expert_weight(layer_id);
   }
+#if defined(USE_NPU)
+  layer::NpuLmHead get_lm_head() override { return model_->get_lm_head(); };
 
-  layer::LmHead get_lm_head() override { return model_->get_lm_head(); };
-
-  void set_lm_head(layer::LmHead& head) override { model_->set_lm_head(head); };
+  void set_lm_head(layer::NpuLmHead& head) override {
+    model_->set_lm_head(head);
+  };
 
-  std::vector<layer::WordEmbedding> get_word_embedding() override {
+  std::vector<layer::NpuWordEmbedding> get_word_embedding() override {
     return model_->get_word_embedding();
   };
 
   void set_word_embedding(
-      std::vector<layer::WordEmbedding>& embedding) override {
+      std::vector<layer::NpuWordEmbedding>& embedding) override {
     model_->set_word_embedding(embedding);
   };
-
+#endif
   torch::Device device() const override { return options_.device(); }
 
   const torch::TensorOptions& options() const override { return options_; }

diff --git a/xllm/core/framework/model/causal_vlm.h b/xllm/core/framework/model/causal_vlm.h
@@ -63,20 +63,22 @@ class CausalVLMImpl : public CausalVLM {
   }
 
   virtual void update_expert_weight(int32_t layer_id) { return; }
+#if defined(USE_NPU)
+  layer::NpuLmHead get_lm_head() override { return model_->get_lm_head(); };
 
-  layer::LmHead get_lm_head() override { return model_->get_lm_head(); };
-
-  void set_lm_head(layer::LmHead& head) override { model_->set_lm_head(head); };
+  void set_lm_head(layer::NpuLmHead& head) override {
+    model_->set_lm_head(head);
+  };
 
-  std::vector<layer::WordEmbedding> get_word_embedding() override {
+  std::vector<layer::NpuWordEmbedding> get_word_embedding() override {
     return model_->get_word_embedding();
   };
 
   void set_word_embedding(
-      std::vector<layer::WordEmbedding>& embedding) override {
+      std::vector<layer::NpuWordEmbedding>& embedding) override {
     model_->set_word_embedding(embedding);
   };
-
+#endif
   torch::Device device() const override { return options_.device(); }
 
   const torch::TensorOptions& options() const override { return options_; }

diff --git a/xllm/core/framework/parallel_state/collective_communicator.cpp b/xllm/core/framework/parallel_state/collective_communicator.cpp
@@ -18,6 +18,7 @@ limitations under the License.
 #include "mapping_npu.h"
 
 #if defined(USE_NPU)
+#include "npu_process_group.h"
 #include "xllm_kernels/core/include/atb_speed/base/external_comm_manager.h"
 #include "xllm_kernels/core/include/atb_speed/utils/singleton.h"
 #include "xllm_kernels/models/base/param/mapping.h"
@@ -30,23 +31,6 @@ limitations under the License.
 #include "parallel_args.h"
 #include "util/net.h"
 
-namespace {
-#if defined(USE_NPU)
-std::unique_ptr<xllm::ProcessGroup> create_process_group(
-    int rank,
-    int world_size,
-    int rank_size,
-    int port,
-    bool trans,
-    const std::string& host,
-    const std::string& group_name,
-    const torch::Device& device) {
-  LOG(FATAL) << "Unsupported device type";
-  return nullptr;
-}
-#endif
-}  // namespace
-
 namespace xllm {
 
 CollectiveCommunicator::CollectiveCommunicator(int global_rank,

diff --git a/xllm/core/framework/parallel_state/npu_process_group.cpp b/xllm/core/framework/parallel_state/npu_process_group.cpp
@@ -14,6 +14,16 @@ limitations under the License.
 ==============================================================================*/
 
 #include "npu_process_group.h"
+#ifdef TORCH_HIGHER_THAN_PTA6
+#include <torch_npu/csrc/framework/OpCommand.h>
+#else
+#include <torch_npu/csrc/aten/NPUNativeFunctions.h>
+#include <torch_npu/csrc/framework/utils/OpPreparation.h>
+#endif
+
+#include <c10d/ProcessGroup.hpp>
+#include <c10d/TCPStore.hpp>
+#include <torch_npu/csrc/distributed/ProcessGroupHCCL.hpp>
 
 namespace {
 
@@ -24,113 +34,65 @@ namespace {
       LOG(FATAL) << "Failed, HCCL error :" << HcclGetErrorString(r); \
     }                                                                \
   } while (0)
+}  // namespace
 
-inline bool is_npu(const at::Tensor& tensor) {
-  if (!tensor.defined()) {
-    return false;
-  }
-  return tensor.device().is_privateuseone();
-}
-
-inline bool is_npu(const at::TensorOptions& options) {
-  return options.device().is_privateuseone();
-}
+namespace xllm {
 
-inline bool is_npu(const at::Device& device) {
-  return device.is_privateuseone();
-}
+// Wraps a c10d_npu::ProcessGroupHCCL of size `rank_size`. When the group does
+// not cover the whole world, its member ranks are handed over through the
+// options and the local rank from get_group_rank() replaces the global rank.
+ProcessGroupHCCL::ProcessGroupHCCL(int global_rank,
+                                   int world_size,
+                                   int rank_size,
+                                   int port,
+                                   bool trans,
+                                   const std::string& host,
+                                   const std::string& group_name,
+                                   const torch::Device& device)
+    : ProcessGroup(device) {
+  c10::intrusive_ptr<c10d_npu::ProcessGroupHCCL::Options> hccl_pg_options =
+      c10d_npu::ProcessGroupHCCL::Options::create();
+  // hccl_pg_options->group_name = group_name;
+  int rank = global_rank;
+  if (world_size != rank_size) {
+    auto [local_rank, group_ranks] =
+        get_group_rank(world_size, global_rank, rank_size, trans);
+    std::vector<uint32_t> uint32_ranks(group_ranks.begin(), group_ranks.end());
+    hccl_pg_options->global_ranks_in_group = uint32_ranks;
+    rank = local_rank;
+  }
 
-at::Tensor flatten_for_scatter_gather(std::vector<at::Tensor>& tensors) {
-  auto& t = tensors[0];
-  std::vector<int64_t> sizes{static_cast<int64_t>(tensors.size())};
-  sizes.insert(sizes.end(), t.sizes().begin(), t.sizes().end());
-  return at::empty(sizes, t.options());
+  auto store = create_tcp_store(host, port, rank);
+  pg_ = std::make_unique<c10d_npu::ProcessGroupHCCL>(
+      store, rank, rank_size, hccl_pg_options);
 }
 
-HcclDataType to_hccl_data_type(const torch::Tensor& input) {
-  const auto type = input.scalar_type();
-  switch (type) {
-    case at::kFloat:
-      return HCCL_DATA_TYPE_FP32;
-    case at::kHalf:
-      return HCCL_DATA_TYPE_FP16;
-    case at::kDouble:
-      return HCCL_DATA_TYPE_FP64;
-    case at::kLong:
-      return HCCL_DATA_TYPE_INT64;
-    case at::kInt:
-      return HCCL_DATA_TYPE_INT32;
-    case at::kChar:
-      return HCCL_DATA_TYPE_INT8;
-    case at::kByte:
-      return HCCL_DATA_TYPE_UINT8;
-    case at::kBool:
-      return HCCL_DATA_TYPE_UINT8;
-    case at::kBFloat16:
-      return HCCL_DATA_TYPE_BFP16;
-    default:
-      TORCH_CHECK(false, "Unconvertible HCCL type ", type);
+// Shuts down pg_ when it is set; otherwise destroys the HCCL comm in comm_.
+ProcessGroupHCCL::~ProcessGroupHCCL() {
+  if (!pg_) {
+    HCCLCHECK(HcclCommDestroy(comm_));
+  } else {
+    pg_->shutdown();
   }
 }
 
-void check_input(torch::Tensor input) {
-  CHECK(is_npu(input)) << "input should be npu tensor";
-  CHECK(input.is_contiguous()) << "input should be contiguous";
-  CHECK(!input.is_sparse()) << "input have to be npu dense tensor";
-}
-
-}  // namespace
-
-namespace xllm {
-
 ProcessGroupHCCL::ProcessGroupHCCL(int rank,
                                    int world_size,
                                    const torch::Device& device,
                                    HcclComm comm)
     : ProcessGroup(device), comm_(comm) {}
-// Destructor.
-ProcessGroupHCCL::~ProcessGroupHCCL() { HCCLCHECK(HcclCommDestroy(comm_)); }
 
-void ProcessGroupHCCL::allreduce(torch::Tensor& input) {
-  DCHECK(input.device() == device())
-      << "input should be on the same device as the process group";
-  check_input(input);
-  // inplace all reduce
-  // const auto count = input.numel();
-  // const auto data_type = to_hccl_data_type(input);
-  // auto stream = c10_npu::getCurrentNPUStream();
-  // torch::DeviceGuard device_guard(device());
-  // HCCLCHECK(HcclAllReduce(
-  //     /*sendbuff=*/input.data_ptr(),
-  //     /*recvbuff=*/input.data_ptr(),
-  //     /*count=*/count,
-  //     /*datatype=*/data_type,
-  //     /*op=*/HCCL_REDUCE_SUM,
-  //     /*comm=*/comm_,
-  //     /*stream=*/stream));
-}
-void ProcessGroupHCCL::allgather(const torch::Tensor& input,
-                                 std::vector<torch::Tensor>& outputs) {
-  check_input(input);
-  // CHECK(outputs.size() == world_size())
-  //     << "outputs should have the same size as world_size";
-  // DCHECK(input.device() == device())
-  //     << "input should be on the same device as the process group";
-  // torch::DeviceGuard device_guard(device());
-  // torch::Tensor flattened_output = flatten_for_scatter_gather(outputs);
-  // const auto count = input.numel();
-  // const auto data_type = to_hccl_data_type(input);
-  // auto stream = c10_npu::getCurrentNPUStream();
-  // HCCLCHECK(HcclAllGather(
-  //     /*sendbuff=*/input.data_ptr(),
-  //     /*recvbuff=*/flattened_output.data_ptr(),
-  //     /*sendcount=*/count,
-  //     /*datatype=*/data_type,
-  //     /*comm=*/comm_,
-  //     /*stream=*/stream));
-  // // copy the flattened output tensors to the outputs.
-  // for (int i = 0; i < outputs.size(); ++i) {
-  //   outputs[i].copy_(flattened_output[i], /*non_blocking=*/true);
-  // }
+std::unique_ptr<xllm::ProcessGroup> create_process_group(
+    int rank,
+    int world_size,
+    int rank_size,
+    int port,
+    bool trans,
+    const std::string& host,
+    const std::string& group_name,
+    const torch::Device& device) {
+  return std::make_unique<ProcessGroupHCCL>(
+      rank, world_size, rank_size, port, trans, host, group_name, device);
 }
+
 }  // namespace xllm