
Commit 636ff01

Optimize custom all reduce (#130)
* First version
* Revert error. While there, add missing finalize.
* Use the correct defaults for ROCm. Increase sampling area to capture crossover.
* Scope end_sync as well.
* Guard only volatile keyword for ifndef USE_ROCM
* Document crossover
1 parent 674da1d commit 636ff01
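Note for readers of the diff below: the recurring "#ifndef USE_ROCM / volatile / #endif" triplet only keeps the volatile qualifier on the signal pointers for CUDA builds; ROCm builds drop it and rely on the new __scoped_atomic_* accesses for ordering. Purely as a reading aid, the same guard could be written once as a macro, as in the hypothetical sketch below (the patch repeats the guard inline at each site, and CA_MAYBE_VOLATILE is not a name that appears in the code):

// Hypothetical convenience macro, not part of this patch.
#ifndef USE_ROCM
  // CUDA build: keep the volatile qualifier on signal pointers.
  #define CA_MAYBE_VOLATILE volatile
#else
  // ROCm build: drop volatile; scoped atomics provide the ordering instead.
  #define CA_MAYBE_VOLATILE
#endif

// Usage sketch, assuming the Signal type defined in custom_all_reduce.cuh:
// struct RankSignals { CA_MAYBE_VOLATILE Signal* signals[8]; };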

File tree

3 files changed: +117 -77 lines changed

csrc/custom_all_reduce.cuh
csrc/custom_all_reduce_test.cu
vllm/distributed/parallel_state.py

csrc/custom_all_reduce.cuh

Lines changed: 98 additions & 73 deletions
@@ -43,7 +43,12 @@ struct __align__(16) RankData { const void* ptrs[8]; };
 struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };
 #endif

-struct __align__(16) RankSignals { volatile Signal* signals[8]; };
+struct __align__(16) RankSignals {
+#ifndef USE_ROCM
+  volatile
+#endif
+  Signal* signals[8];
+};

 // like std::array, but aligned
 template <typename T, int sz>
@@ -136,25 +141,29 @@ DINLINE O downcast(array_t<float, O::size> val) {
 // This function is meant to be used as the first synchronization in the all
 // reduce kernel. Thus, it doesn't need to make any visibility guarantees for
 // prior memory accesses. Note: volatile writes will not be reordered against
-// other volatile writes.
+// other volatile writes (CUDA-only).
 template <int ngpus>
-DINLINE void start_sync(const RankSignals& sg, volatile Signal* self_sg,
-                        int rank) {
 #ifdef USE_ROCM
+DINLINE void start_sync(const RankSignals& sg, Signal* self_sg, int rank) {
   uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
   if (threadIdx.x < ngpus) {
+    __scoped_atomic_store_n(&self_sg->end[blockIdx.x][threadIdx.x], 0,
+                            __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
     // simultaneously write to the corresponding flag of all ranks.
     // Latency = 1 p2p write
-    __atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank], flag,
-                     __ATOMIC_RELAXED);
+    __scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank],
+                            1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
+    __atomic_thread_fence(__ATOMIC_ACQ_REL);
     // wait until we got true from all ranks
-    while (__atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x],
-                           __ATOMIC_RELAXED) < flag);
+    while (!__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x],
+                                   __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE));
   }
   __syncthreads();
-  // use one thread to update flag
-  if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
+}
 #else
+DINLINE void start_sync(const RankSignals& sg, volatile Signal* self_sg,
+                        int rank) {
+  uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
   if (threadIdx.x < ngpus) {
     // reset flag for next time
     self_sg->end[blockIdx.x][threadIdx.x] = 0;
@@ -165,36 +174,38 @@ DINLINE void start_sync(const RankSignals& sg, volatile Signal* self_sg,
     while (!self_sg->start[blockIdx.x][threadIdx.x]);
   }
   __syncthreads();
-#endif
 }
+#endif

 // This function is meant to be used as the second or the final synchronization
 // barrier in the all reduce kernel. If it's the final synchronization barrier,
 // we don't need to make any visibility guarantees for prior memory accesses.
 template <int ngpus, bool final_sync = false>
-DINLINE void end_sync(const RankSignals& sg, volatile Signal* self_sg,
-                      int rank) {
 #ifdef USE_ROCM
+DINLINE void end_sync(const RankSignals& sg, Signal* self_sg, int rank) {
   __syncthreads();
   // eliminate the case that prior writes are not visible after signals become
   // visible. Note that I did not managed to make this happen through a lot of
   // testing. Might be the case that hardware provides stronger guarantee than
   // the memory model.
-  uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
   if (threadIdx.x < ngpus) {
+    // reset flag for next time
+    __scoped_atomic_store_n(&self_sg->start[blockIdx.x][threadIdx.x], 0,
+                            __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
     // simultaneously write to the corresponding flag of all ranks.
     // Latency = 1 p2p write
-    __atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank], flag,
-                     final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE);
+    __scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank], 1,
+                            __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
+    __atomic_thread_fence(__ATOMIC_ACQ_REL);
     // wait until we got true from all ranks
-    while (__atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x],
-                           final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE) <
-           flag);
+    while (!__scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x],
+                                   __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE));
   }
-  __syncthreads();
-  // use one thread to update flag
-  if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
+  if constexpr (!final_sync) __syncthreads();
+}
 #else
+DINLINE void end_sync(const RankSignals& sg, volatile Signal* self_sg,
+                      int rank) {
   __syncthreads();
   // eliminate the case that prior writes are not visible after signals become
   // visible. Note that I did not managed to make this happen through a lot of
@@ -211,8 +222,8 @@ DINLINE void end_sync(const RankSignals& sg, volatile Signal* self_sg,
     while (!self_sg->end[blockIdx.x][threadIdx.x]);
   }
   if constexpr (!final_sync) __syncthreads();
-#endif
 }
+#endif

 template <typename P, int ngpus, typename A>
 DINLINE P packed_reduce(const P* ptrs[], int idx) {
@@ -227,8 +238,11 @@ DINLINE P packed_reduce(const P* ptrs[], int idx) {
 template <typename T, int ngpus>
 __global__ void __launch_bounds__(512, 1)
     cross_device_reduce_1stage(RankData* _dp, RankSignals sg,
-                               volatile Signal* self_sg, T* __restrict__ result,
-                               int rank, int size) {
+#ifndef USE_ROCM
+                               volatile
+#endif
+                               Signal* self_sg,
+                               T* __restrict__ result, int rank, int size) {
   using P = typename packed_t<T>::P;
   using A = typename packed_t<T>::A;
   // note: we don't reorder the address so the accumulation order is the same
@@ -244,15 +258,22 @@ __global__ void __launch_bounds__(512, 1)
 }

 template <typename P>
-DINLINE P* get_tmp_buf(volatile Signal* sg) {
+DINLINE P* get_tmp_buf(
+#ifndef USE_ROCM
+    volatile
+#endif
+    Signal* sg) {
   return (P*)(((Signal*)sg) + 1);
 }

 template <typename T, int ngpus>
 __global__ void __launch_bounds__(512, 1)
     cross_device_reduce_2stage(RankData* _dp, RankSignals sg,
-                               volatile Signal* self_sg, T* __restrict__ result,
-                               int rank, int size) {
+#ifndef USE_ROCM
+                               volatile
+#endif
+                               Signal* self_sg,
+                               T* __restrict__ result, int rank, int size) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = gridDim.x * blockDim.x;
   using P = typename packed_t<T>::P;
@@ -455,37 +476,41 @@ class CustomAllreduce {
    */
  template <typename T>
  void allreduce(cudaStream_t stream, T* input, T* output, int size,
+#ifdef USE_ROCM
+                 int threads = 512, int block_limit = 18){
+#else
                  int threads = 512, int block_limit = 36) {
-    auto d = packed_t<T>::P::size;
-    if (size % d != 0)
+#endif
+    auto d = packed_t<T>::P::size;
+    if (size % d != 0)
+      throw std::runtime_error(
+          "custom allreduce currently requires input length to be multiple "
+          "of " +
+          std::to_string(d));
+    if (block_limit > kMaxBlocks)
+      throw std::runtime_error("max supported block limit is " +
+                               std::to_string(kMaxBlocks) + ". Got " +
+                               std::to_string(block_limit));
+
+    RankData* ptrs;
+    cudaStreamCaptureStatus status;
+    CUDACHECK(cudaStreamIsCapturing(stream, &status));
+    if (status == cudaStreamCaptureStatusActive) {
+      ptrs = d_rank_data_base_ + graph_unreg_buffers_.size();
+      graph_unreg_buffers_.push_back(input);
+    } else {
+      auto it = buffers_.find(input);
+      if (it == buffers_.end())
        throw std::runtime_error(
-            "custom allreduce currently requires input length to be multiple "
-            "of " +
-            std::to_string(d));
-    if (block_limit > kMaxBlocks)
-      throw std::runtime_error("max supported block limit is " +
-                               std::to_string(kMaxBlocks) + ". Got " +
-                               std::to_string(block_limit));
-
-    RankData* ptrs;
-    cudaStreamCaptureStatus status;
-    CUDACHECK(cudaStreamIsCapturing(stream, &status));
-    if (status == cudaStreamCaptureStatusActive) {
-      ptrs = d_rank_data_base_ + graph_unreg_buffers_.size();
-      graph_unreg_buffers_.push_back(input);
-    } else {
-      auto it = buffers_.find(input);
-      if (it == buffers_.end())
-        throw std::runtime_error(
-            "buffer address " +
-            std::to_string(reinterpret_cast<uint64_t>(input)) +
-            " is not registered!");
-      ptrs = it->second;
-    }
+            "buffer address " +
+            std::to_string(reinterpret_cast<uint64_t>(input)) +
+            " is not registered!");
+      ptrs = it->second;
+    }

-    size /= d;
-    auto bytes = size * sizeof(typename packed_t<T>::P);
-    int blocks = std::min(block_limit, (size + threads - 1) / threads);
+    size /= d;
+    auto bytes = size * sizeof(typename packed_t<T>::P);
+    int blocks = std::min(block_limit, (size + threads - 1) / threads);
 #define KL(ngpus, name) \
   name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
                                                  rank_, size);
@@ -504,27 +529,27 @@ class CustomAllreduce {
      break; \
    }

-    switch (world_size_) {
-      REDUCE_CASE(2)
-      REDUCE_CASE(4)
-      REDUCE_CASE(6)
-      REDUCE_CASE(8)
-      default:
-        throw std::runtime_error(
-            "custom allreduce only supports num gpus in (2,4,6,8). Actual num "
-            "gpus = " +
-            std::to_string(world_size_));
-    }
+    switch (world_size_) {
+      REDUCE_CASE(2)
+      REDUCE_CASE(4)
+      REDUCE_CASE(6)
+      REDUCE_CASE(8)
+      default:
+        throw std::runtime_error(
+            "custom allreduce only supports num gpus in (2,4,6,8). Actual num "
+            "gpus = " +
+            std::to_string(world_size_));
+    }
 #undef REDUCE_CASE
 #undef KL
-  }
+  }

-  ~CustomAllreduce() {
-    for (auto [_, ptr] : ipc_handles_) {
-      CUDACHECK(cudaIpcCloseMemHandle(ptr));
-    }
+  ~CustomAllreduce() {
+    for (auto [_, ptr] : ipc_handles_) {
+      CUDACHECK(cudaIpcCloseMemHandle(ptr));
    }
-  };
+  }
+}; // namespace vllm
 /**
  * To inspect PTX/SASS, copy paste this header file to compiler explorer and add
  a template instantiation:
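A note on the synchronization hunks above: on ROCm the flag protocol now goes through Clang's scoped atomic builtins instead of volatile accesses. Each of the first ngpus threads posts a flag into the corresponding peer's signal block at system scope (one p2p write that must become visible on the other GPU), issues an acq_rel fence, then spins at device scope on its own copy until every peer has posted, after which the block synchronizes. The sketch below is a distilled illustration of that pattern with simplified types, not the code in the header; it assumes a Clang-based HIP toolchain that provides the __scoped_atomic_* builtins, and MiniSignal is a stand-in for the real Signal struct.

#include <hip/hip_runtime.h>
#include <cstdint>

constexpr int kMaxRanks = 8;

// Stand-in for the per-rank signal block (the real Signal struct is larger).
struct MiniSignal {
  uint32_t start[kMaxRanks];  // one slot per peer rank
};

template <int ngpus>
__device__ void mini_start_sync(MiniSignal* const* peers, MiniSignal* self,
                                int rank) {
  if (threadIdx.x < ngpus) {
    // Tell peer threadIdx.x that this rank has arrived: one p2p write at
    // system scope so the store becomes visible on the other GPU.
    __scoped_atomic_store_n(&peers[threadIdx.x]->start[rank], 1u,
                            __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
    __atomic_thread_fence(__ATOMIC_ACQ_REL);
    // Spin until every peer has signalled us; our own block only needs
    // device-scope loads. (The real code also resets the end flags here.)
    while (!__scoped_atomic_load_n(&self->start[threadIdx.x],
                                   __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE)) {
    }
  }
  __syncthreads();
}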

csrc/custom_all_reduce_test.cu

Lines changed: 7 additions & 0 deletions
@@ -330,10 +330,17 @@ int main(int argc, char** argv) {
  //     run<half>(myRank, nRanks, comm, threads, block_limit, 4096 * 1024);
  //   }
  // }
+#ifdef USE_ROCM
+  for (int sz = 512; sz <= (8 << 22); sz *= 2) {
+    run<half>(myRank, nRanks, comm, 512, 18, sz + 8 * 47, performance_test);
+  }
+#else
  for (int sz = 512; sz <= (8 << 20); sz *= 2) {
    run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test);
  }
+#endif

  cudaProfilerStop();
+  MPICHECK(MPI_Finalize());
  return EXIT_SUCCESS;
 }
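The widened ROCm sweep above is the "Increase sampling area to capture crossover" item from the commit message: the benchmark now runs up to 8 << 22 half elements per rank with block_limit 18, versus 8 << 20 with block_limit 36 on CUDA, so the size at which the custom all-reduce stops being profitable (the crossover) is actually sampled. A quick, illustrative check of what those endpoints mean in bytes of fp16 payload, and how the 16 MiB ROCm max_size chosen in parallel_state.py below (2 * 8192 * 1024 bytes) sits inside the extended sweep:

// Illustrative only: byte sizes of the sweep endpoints above for half
// (2-byte) elements, plus the ROCm max_size picked in parallel_state.py.
#include <cstdio>

int main() {
  const long long cuda_top = (8LL << 20) * 2;          // top of the CUDA sweep
  const long long rocm_top = (8LL << 22) * 2;          // top of the ROCm sweep
  const long long rocm_max_size = 2LL * 8192 * 1024;   // ROCm cutoff
  std::printf("CUDA sweep top: %lld MiB\n", cuda_top >> 20);       // 16 MiB
  std::printf("ROCm sweep top: %lld MiB\n", rocm_top >> 20);       // 64 MiB
  std::printf("ROCm max_size:  %lld MiB\n", rocm_max_size >> 20);  // 16 MiB
  return 0;
}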

vllm/distributed/parallel_state.py

Lines changed: 12 additions & 4 deletions
@@ -199,10 +199,18 @@ def initialize_model_parallel(
     if _ENABLE_CUSTOM_ALL_REDUCE:
         from vllm.distributed.device_communicators.custom_all_reduce import (
             CustomAllreduce)
-        _TP_CA_COMMUNICATOR = CustomAllreduce(
-            group=_TP_CPU_GROUP,
-            device=_LOCAL_RANK,
-        )
+
+        # max size defaults to 8 MiB, increase to 16 MiB on ROCm
+        # due to later crossover
+        if is_hip():
+            _TP_CA_COMMUNICATOR = CustomAllreduce(group=_TP_CPU_GROUP,
+                                                  device=_LOCAL_RANK,
+                                                  max_size=2 * 8192 * 1024)
+        else:
+            _TP_CA_COMMUNICATOR = CustomAllreduce(
+                group=_TP_CPU_GROUP,
+                device=_LOCAL_RANK,
+            )

     # Build the pipeline model-parallel groups.
     global _PP_DEVICE_GROUP, _PP_CPU_GROUP
