Commit 7e7031c

Wrap kernel calls with a device guard (#19)
1 parent ed71b87 commit 7e7031c

2 files changed: +15 additions, -0 deletions

csrc/all_to_all/all_to_all.h (3 additions, 0 deletions)

@@ -40,6 +40,9 @@ class AllToAll {
 
     virtual ~AllToAll();
 
+    /// @brief Returns the number of experts each token is routed to.
+    size_t getNumExpertsPerToken() const { return expertsPerToken; }
+
 protected:
     /// The maximum number of tokens per DP group.
    const size_t maxNumTokens;
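
The new accessor makes the kernel's experts-per-token setting visible to code outside the class, which is what allows the bindings below to validate the shape of the routing indices before launching the kernel. A minimal sketch of the intended use, assuming an all_to_all handle and an indices tensor are already in scope (the names are illustrative):

// Illustrative only: reject routing indices whose second dimension does not
// match the kernel's experts-per-token configuration. This mirrors the check
// added to csrc/bindings/all_to_all_ops.cpp in this commit.
TORCH_CHECK(
    indices.size(1) == all_to_all->getNumExpertsPerToken(),
    "indices.size(1) must be equal to the experts per token"
);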

csrc/bindings/all_to_all_ops.cpp (12 additions, 0 deletions)

@@ -5,6 +5,7 @@
 #include <ATen/ATen.h>
 #include <ATen/core/Tensor.h>
 #include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <c10/util/Exception.h>
 #include <torch/csrc/distributed/c10d/GroupRegistry.hpp>
 #include <torch/csrc/distributed/c10d/ProcessGroup.hpp>

@@ -152,6 +153,15 @@ void dispatch(
   }
 
   auto *all_to_all = (Kernel *)ptr;
+
+  TORCH_CHECK(indices.size(0) == dpX.size(0), "indices.size(0) must be equal to dpX.size(0)");
+  TORCH_CHECK(
+      indices.size(1) == all_to_all->getNumExpertsPerToken(),
+      "indices.size(1) must be equal to the experts per token"
+  );
+
+  at::cuda::OptionalCUDAGuard const device_guard(device_of(indices));
+
   all_to_all->dispatch(
       Strided1D<int32_t>(
           outExpertNumTokens.data_ptr<int32_t>(), (size_t)outExpertNumTokens.stride(0)

@@ -237,6 +247,8 @@ void combine(
 
   auto *all_to_all = (Kernel *)ptr;
 
+  at::cuda::OptionalCUDAGuard const device_guard(device_of(indices));
+
   switch (expertY.scalar_type()) {
   case at::kFloat: {
     switch (outTokens.scalar_type()) {
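
Why the guard matters: CUDA kernel launches and stream lookups act on the calling thread's current device, which in a multi-GPU process is not necessarily the device that holds the tensors handed over from Python. The OptionalCUDAGuard makes the device owning indices current for the remainder of the enclosing scope and restores the previous device on exit. A minimal sketch of the pattern, using the same ATen/c10 headers as the bindings file; launchOnTensorDevice and the elided kernel call are hypothetical:

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

// Hypothetical wrapper showing the device-guard pattern from this commit.
// The guard makes the device that owns `indices` current for this scope, so
// the stream lookup and any kernels enqueued on it target that device rather
// than whichever device happened to be current on the calling thread.
void launchOnTensorDevice(const at::Tensor &indices) {
  at::cuda::OptionalCUDAGuard const device_guard(device_of(indices));
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // ... enqueue kernels on `stream`; they now run on indices.device() ...
  (void)stream;
}

Because the guard restores the previous device when it goes out of scope, dispatch and combine can be called back-to-back on tensors living on different devices from the same host thread without leaking device state.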
