Commit ee42a99

Yifu Wang authored and pytorchmergebot committed
[SymmetricMemory] introduce a binding for cuMemset32Async (pytorch#138755)
## This Stack

This stack does the following things to support `xformers`-style, comm-aware Triton kernels:

- Exposes `signal_pad`s as tensors in Python
- Adds a binding for `cuMemsetAsync`

In combination, these aim to give users more flexibility to express custom signaling/synchronization patterns.

## This PR

Makes `cuMemset32Async` available via `_SymmetricMemory.memset32`. We chose `cuMemset32Async` over `cudaMemsetAsync` because it allows for `uint32_t`-wise memset, which gives users finer-grained control.

To enable this, we also added the following CUDA driver APIs to `c10::cuda::DriverAPI`:

- `cuDevicePrimaryCtxRetain` - for obtaining the primary context of a device in the form of a `CUcontext`.
- `cuCtxGetCurrent`/`cuCtxSetCurrent` - for setting and restoring the current context around CUDA driver API calls such as `cuMemset32Async`.

Pull Request resolved: pytorch#138755
Approved by: https://github.com/weifengpy, https://github.com/eqy, https://github.com/lw
1 parent 87059d4 commit ee42a99
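
Why a 32-bit memset: a byte-wise `cudaMemsetAsync` can only replicate a single byte (filling with `0x01` yields `0x01010101` per word, not `1`), whereas `cuMemsetD32Async` writes whole `uint32_t` values. Below is a minimal usage sketch of the new binding, mirroring the unit test in this commit; the import path follows the test suite, and a CUDA build with driver API support is assumed.

```python
import torch
from torch._C._distributed_c10d import _SymmetricMemory

# Allocate a flat, contiguous uint32 symmetric-memory buffer and zero it.
t = _SymmetricMemory.empty_strided_p2p(
    (64,),
    (1,),
    dtype=torch.uint32,
    device=torch.device("cuda:0"),
    group_name="0",
).fill_(0)

# uint32-wise memset on the current CUDA stream: set elements [32, 48) to 1.
_SymmetricMemory.memset32(t, offset=32, val=1, count=16)
assert t[32:48].eq(1).all()
```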

6 files changed (+121, -13 lines)

.lintrunner.toml

Lines changed: 0 additions & 2 deletions
```diff
@@ -70,8 +70,6 @@ include_patterns = [
     'aten/src/ATen/native/cudnn/*.cpp',
     'c10/**/*.h',
     'c10/**/*.cpp',
-    'distributed/c10d/*DMAConnectivity.*',
-    'distributed/c10d/*SymmetricMemory.*',
     'torch/csrc/**/*.h',
     'torch/csrc/**/*.hpp',
     'torch/csrc/**/*.cpp',
```

c10/cuda/driver_api.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -30,6 +30,7 @@
   _(cuMemGetAllocationGranularity) \
   _(cuMemExportToShareableHandle) \
   _(cuMemImportFromShareableHandle) \
+  _(cuMemsetD32Async) \
   _(cuStreamWriteValue32) \
   _(cuGetErrorString)

```

caffe2/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -562,6 +562,7 @@ if(USE_CUDA)
       ${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
       ${TORCH_SRC_DIR}/csrc/distributed/c10d/CudaDMAConnectivity.cpp
       ${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemory.cu
+      ${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu
       ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
       PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
   )
```

test/distributed/test_symmetric_memory.py

Lines changed: 29 additions & 0 deletions
```diff
@@ -24,9 +24,11 @@
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     parametrize,
+    requires_cuda,
     run_tests,
     skip_but_pass_in_sandcastle_if,
     skipIfRocm,
+    TestCase,
 )


@@ -849,5 +851,32 @@ def func_3(x):
         self.assertNotIn("return (buf0", code_3)


+class SymmMemSingleProcTest(TestCase):
+    @skipIfRocm
+    @requires_cuda
+    def test_memset32(self):
+        t = _SymmetricMemory.empty_strided_p2p(
+            (64,),
+            (1,),
+            dtype=torch.uint32,
+            device=torch.device("cuda:0"),
+            group_name="0",
+        ).fill_(0)
+
+        _SymmetricMemory.memset32(t, offset=32, val=1, count=16)
+        self.assertTrue(t[:32].eq(0).all())
+        self.assertTrue(t[32:48].eq(1).all())
+        self.assertTrue(t[48:].eq(0).all())
+
+        with self.assertRaises(RuntimeError):
+            _SymmetricMemory.memset32(t, offset=-1, val=1, count=16)
+
+        with self.assertRaises(RuntimeError):
+            _SymmetricMemory.memset32(t, offset=32, val=4294967296, count=16)
+
+        with self.assertRaises(RuntimeError):
+            _SymmetricMemory.memset32(t, offset=32, val=1, count=-1)
+
+
 if __name__ == "__main__":
     run_tests()
```
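
Since `SymmMemSingleProcTest` needs no multi-process rendezvous, the new test can be run directly on a single CUDA device, e.g. `python test/distributed/test_symmetric_memory.py -k test_memset32` (assuming a typical PyTorch development checkout).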

torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu

Lines changed: 73 additions & 10 deletions
```diff
@@ -1,8 +1,14 @@
-#if defined(CUDART_VERSION) && CUDART_VERSION >= 12030
-
 #include <ATen/ATen.h>
 #include <ATen/ceil_div.h>
 #include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/library.h>
+
+#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
+#include <c10/cuda/driver_api.h>
+#endif
+
+#if defined(CUDART_VERSION) && CUDART_VERSION >= 12030

 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@@ -11,8 +17,6 @@
 #include <ATen/ops/empty_like.h>
 #endif

-#include <torch/library.h>
-
 #include <torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h>
 #include <torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp>

@@ -491,7 +495,61 @@ at::Tensor two_shot_all_reduce_(
   return input;
 }

+} // namespace
+#endif // #if defined(CUDART_VERSION) && CUDART_VERSION >= 12030
+
+namespace {
+
+at::Tensor memset32_(
+    at::Tensor& input,
+    int64_t offset,
+    int64_t val,
+    int64_t count) {
+#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
+  TORCH_CHECK(
+      input.dim() == 1 && input.is_contiguous() &&
+          input.scalar_type() == c10::ScalarType::UInt32,
+      "symm_mem::memset32_: input must be a flat, contiguous uint32 tensor.");
+
+  TORCH_CHECK(
+      offset > 0 && count > 0,
+      "symm_mem::memset32_: offset and count must be positive integers.");
+
+  TORCH_CHECK(
+      val >= 0 &&
+          static_cast<size_t>(val) <= std::numeric_limits<uint32_t>::max(),
+      "symm_mem::memset32_: val must be in the range of "
+      "[0, 4294967295] (uint32_t).");
+
+  auto element_size = c10::elementSize(input.scalar_type());
+  TORCH_CHECK(
+      offset + count < input.numel(),
+      "symm_mem::memset32_: offset + count (",
+      offset + count,
+      ") exceeded the numel of the input (",
+      input.numel(),
+      ")");
+
+  auto addr = reinterpret_cast<uint32_t*>(input.data_ptr()) + offset;
+
+  c10::cuda::CUDAGuard guard(input.device());
+  auto driver_api = c10::cuda::DriverAPI::get();
+  C10_CUDA_DRIVER_CHECK(driver_api->cuMemsetD32Async_(
+      reinterpret_cast<CUdeviceptr>(addr),
+      val,
+      count,
+      at::cuda::getCurrentCUDAStream()));
+#else
+  TORCH_CHECK(
+      false, "CUDASymmetricMemory requires PYTORCH_C10_DRIVER_API_SUPPORTED");
+#endif
+  return input;
+}
+
+} // namespace
+
 TORCH_LIBRARY_FRAGMENT(symm_mem, m) {
+#if defined(CUDART_VERSION) && CUDART_VERSION >= 12030
   m.def(
       "multimem_all_reduce_(Tensor(a!) input, str reduce_op, str group_name) -> Tensor(a!)",
       torch::dispatch(c10::DispatchKey::CUDA, ::multimem_all_reduce_),
@@ -519,8 +577,12 @@ TORCH_LIBRARY_FRAGMENT(symm_mem, m) {
       "one_shot_all_reduce(Tensor input, str reduce_op, str group_name) -> Tensor",
       {at::Tag::pt2_compliant_tag});

-  m.impl("one_shot_all_reduce", torch::dispatch(c10::DispatchKey::Meta, ::one_shot_all_reduce_meta));
-  m.impl("one_shot_all_reduce", torch::dispatch(c10::DispatchKey::CUDA, ::one_shot_all_reduce));
+  m.impl(
+      "one_shot_all_reduce",
+      torch::dispatch(c10::DispatchKey::Meta, ::one_shot_all_reduce_meta));
+  m.impl(
+      "one_shot_all_reduce",
+      torch::dispatch(c10::DispatchKey::CUDA, ::one_shot_all_reduce));

   m.def(
       "one_shot_all_reduce_out(Tensor input, str reduce_op, str group_name, Tensor(a!) out) -> Tensor(a!)",
@@ -531,8 +593,9 @@ TORCH_LIBRARY_FRAGMENT(symm_mem, m) {
       "two_shot_all_reduce_(Tensor(a!) input, str reduce_op, str group_name) -> Tensor(a!)",
       torch::dispatch(c10::DispatchKey::CUDA, ::two_shot_all_reduce_),
       {at::Tag::pt2_compliant_tag});
-}
-
-} // namespace
-
 #endif
+  m.def(
+      "memset32_(Tensor(a!) input, int offset, int val, int count) -> Tensor(a!)",
+      torch::dispatch(c10::DispatchKey::CUDA, ::memset32_),
+      {at::Tag::pt2_compliant_tag});
+}
```
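
Because `memset32_` is registered under the `symm_mem` library fragment, it is also reachable at the dispatcher level, independent of the pybind wrapper added in `init.cpp` below. A sketch, assuming (per the binding's own comment) that any flat, contiguous CUDA uint32 tensor is accepted, not only symmetric-memory buffers:

```python
import torch

# Dispatcher-level call into the schema registered above; the
# _SymmetricMemory.memset32 static method wraps this same op.
t = torch.zeros(64, dtype=torch.uint32, device="cuda")
torch.ops.symm_mem.memset32_(t, offset=32, val=1, count=16)
```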

torch/csrc/distributed/c10d/init.cpp

Lines changed: 17 additions & 1 deletion
```diff
@@ -1122,7 +1122,23 @@ This class does not support ``__members__`` property.)");
           "stream_write_value32",
           &SymmetricMemory::stream_write_value32,
           py::arg("addr"),
-          py::arg("val"));
+          py::arg("val"))
+      // Util functions that are often used together with symmetric memory but
+      // not necessarily directly on symmetric memory.
+      .def_static(
+          "memset32",
+          [](at::Tensor& input, int64_t offset, int64_t val, int64_t count) {
+            // The range of `val` is checked inside the op
+            auto op = c10::Dispatcher::singleton()
+                          .findSchemaOrThrow("symm_mem::memset32_", "")
+                          .typed<at::Tensor(
+                              at::Tensor&, int64_t, int64_t, int64_t)>();
+            return op.call(input, offset, val, count);
+          },
+          py::arg("input"),
+          py::arg("offset"),
+          py::arg("val"),
+          py::arg("count") = 1);

   auto store =
       py::class_<::c10d::Store, c10::intrusive_ptr<::c10d::Store>, PythonStore>(
```
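
Note that the binding defaults `count` to 1, so setting a single 32-bit word needs no explicit count; all range checks for `offset`, `val`, and `count` happen inside the op rather than in the lambda. A minimal sketch, reusing `t` from the earlier example:

```python
# count defaults to 1, so this writes exactly one uint32 word at index 1.
_SymmetricMemory.memset32(t, offset=1, val=42)
```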
