
Commit bf48eea

reduce compilation size and time (#886)
Summary: Pull Request resolved: #886

This patch refactors FBGEMM to slightly reduce the compilation size and time associated with `cub`.

1. Moved the inline function `asynchronous_complete_cumsum()` from `embedding_backward_template_helpers.cuh` to `split_embeddings_utils.cu`, the only file that uses it.
2. Instead of calling the template function `cub::DeviceRadixSort::SortPairs` directly, call a non-template FBGEMM wrapper (`radix_sort_pairs`), so the template is no longer instantiated in every generated `gen_embedding_backward_*` file.

Reviewed By: jspark1105

Differential Revision: D33801456

fbshipit-source-id: 92ebb3369c4fea25d7360bbacaf36476cba54136
Parent commit: 2711eca
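The second change is essentially a template firewall: the heavy `cub` headers and the `SortPairs` template are compiled in exactly one translation unit, while every generated file sees only an ordinary function declaration. A minimal sketch of the pattern, assuming a single key/value combination (the real commit declares several overloads via macros, shown in the diffs below; the file names here are illustrative):

```cpp
// sort_wrapper.cuh (illustrative name): what callers include. No cub headers needed.
#include <cuda_runtime.h>
#include <cstdint>

cudaError_t radix_sort_pairs(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    const int64_t* d_keys_in,
    int64_t* d_keys_out,
    const int32_t* d_values_in,
    int32_t* d_values_out,
    int num_items);

// sort_wrapper.cu (illustrative name): the only file that includes cub.
#include <cub/device/device_radix_sort.cuh>

cudaError_t radix_sort_pairs(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    const int64_t* d_keys_in,
    int64_t* d_keys_out,
    const int32_t* d_values_in,
    int32_t* d_values_out,
    int num_items) {
  // SortPairs is instantiated once here and shared by every caller at link time.
  return cub::DeviceRadixSort::SortPairs(
      d_temp_storage,
      temp_storage_bytes,
      d_keys_in,
      d_keys_out,
      d_values_in,
      d_values_out,
      num_items);
}
```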

4 files changed: 105 additions, 50 deletions

fbgemm_gpu/codegen/embedding_backward_split_template.cu

Lines changed: 4 additions & 5 deletions
@@ -797,7 +797,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_
     auto lxu_cache_locations_sorted = at::empty_like(lxu_cache_locations);
     if (lxu_cache_locations.size(0) > 0) {
       size_t temp_storage_bytes = 0;
-      AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs(
+      AT_CUDA_CHECK(radix_sort_pairs(
           nullptr,
           temp_storage_bytes,
           linear_indices.data_ptr<int64_t>(),
@@ -812,7 +812,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_
       auto temp_storage = at::empty(
           {static_cast<int64_t>(temp_storage_bytes)},
           indices.options().dtype(at::kByte));
-      AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs(
+      AT_CUDA_CHECK(radix_sort_pairs(
           temp_storage.data_ptr(),
           temp_storage_bytes,
           linear_indices.data_ptr<int64_t>(),
@@ -838,12 +838,11 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_
       {% endif %}
       "split_embedding_backward_{{ optimizer }}_exact_kernel",
       ([&] {
-
         {% if weighted %}
         auto indice_weights_sorted = at::empty_like(indice_weights);
         {
           size_t temp_storage_bytes = 0;
-          AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs(
+          AT_CUDA_CHECK(radix_sort_pairs(
              nullptr,
              temp_storage_bytes,
              linear_indices.data_ptr<int64_t>(),
@@ -863,7 +862,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_
          auto temp_storage = at::empty(
              {static_cast<int64_t>(temp_storage_bytes)},
              indices.options().dtype(at::kByte));
-          AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs(
+          AT_CUDA_CHECK(radix_sort_pairs(
             temp_storage.data_ptr(),
             temp_storage_bytes,
             linear_indices.data_ptr<int64_t>(),
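As the hunks above show, the call sites keep cub's two-phase convention: the first `radix_sort_pairs` call passes a null temp-storage pointer and only reports the required scratch size; the second call performs the actual sort. A standalone sketch of that pattern under the new wrapper (the `sort_by_key` helper and raw-pointer buffers are illustrative, not part of this commit):

```cpp
#include <cuda_runtime.h>
#include <cstdint>
#include "fbgemm_gpu/split_embeddings_utils.cuh"  // declares radix_sort_pairs

// Illustrative helper: sort (int64_t key, int32_t value) pairs already on the GPU.
cudaError_t sort_by_key(
    const int64_t* d_keys_in, int64_t* d_keys_out,
    const int32_t* d_values_in, int32_t* d_values_out,
    int num_items, cudaStream_t stream) {
  size_t temp_storage_bytes = 0;
  // Phase 1: null temp-storage pointer, so only the scratch size is computed.
  cudaError_t err = radix_sort_pairs(
      nullptr, temp_storage_bytes,
      d_keys_in, d_keys_out, d_values_in, d_values_out,
      num_items, 0, 64 /* full 64-bit keys */, stream);
  if (err != cudaSuccess) return err;

  void* d_temp_storage = nullptr;
  err = cudaMalloc(&d_temp_storage, temp_storage_bytes);
  if (err != cudaSuccess) return err;

  // Phase 2: same arguments, now with real scratch space, performs the sort.
  err = radix_sort_pairs(
      d_temp_storage, temp_storage_bytes,
      d_keys_in, d_keys_out, d_values_in, d_values_out,
      num_items, 0, 64, stream);
  cudaFree(d_temp_storage);
  return err;
}
```

The generated backward code above does the same thing, except that the scratch buffer is an `at::Tensor` of bytes rather than a raw `cudaMalloc` allocation.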

fbgemm_gpu/include/fbgemm_gpu/embedding_backward_template_helpers.cuh

Lines changed: 0 additions & 45 deletions
@@ -5,14 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */

-// clang-format off
-#include "fbgemm_gpu/cub_namespace_prefix.cuh"
-#include <cub/device/device_radix_sort.cuh>
-#include <cub/device/device_run_length_encode.cuh>
-#include <cub/device/device_scan.cuh>
-#include "fbgemm_gpu/cub_namespace_postfix.cuh"
-// clang-format on
-
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/TensorUtils.h>
@@ -33,43 +25,6 @@
 #include "fbgemm_cuda_utils.cuh"
 #include "sparse_ops_utils.h"

-inline at::Tensor asynchronous_complete_cumsum(at::Tensor t_in) {
-  at::cuda::OptionalCUDAGuard device_guard;
-  device_guard.set_index(t_in.get_device());
-  size_t temp_storage_bytes = 0;
-  TORCH_CHECK(t_in.is_contiguous());
-  TORCH_CHECK(t_in.dtype() == at::kInt || t_in.dtype() == at::kLong);
-  // CUB only handles up to INT_MAX elements.
-  TORCH_CHECK(t_in.numel() < std::numeric_limits<int32_t>::max());
-  TORCH_CHECK(t_in.dim() == 1);
-  auto t_out = at::empty({t_in.numel() + 1}, t_in.options());
-  t_out[0].zero_();
-  AT_DISPATCH_INTEGRAL_TYPES(
-      t_in.scalar_type(), "cub_inclusive_sum_wrapper1", ([&] {
-        AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceScan::InclusiveSum(
-            nullptr,
-            temp_storage_bytes,
-            t_in.data_ptr<scalar_t>(),
-            t_out.data_ptr<scalar_t>() + 1,
-            t_in.numel(),
-            at::cuda::getCurrentCUDAStream()));
-      }));
-  auto temp_storage = at::empty(
-      {static_cast<int64_t>(temp_storage_bytes)},
-      t_in.options().dtype(at::kByte));
-  AT_DISPATCH_INTEGRAL_TYPES(
-      t_in.scalar_type(), "cub_inclusive_sum_wrapper2", ([&] {
-        AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceScan::InclusiveSum(
-            temp_storage.data_ptr(),
-            temp_storage_bytes,
-            t_in.data_ptr<scalar_t>(),
-            t_out.data_ptr<scalar_t>() + 1,
-            t_in.numel(),
-            at::cuda::getCurrentCUDAStream()));
-      }));
-  return t_out;
-}
-
 class FixedDivisor {
  public:
   explicit FixedDivisor(const int32_t d) : d_(d) {
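The function removed here is not deleted from the project; it moves verbatim into `fbgemm_gpu/src/split_embeddings_utils.cu` (the last file in this commit). Its contract, as the code above shows: given a 1-D integer tensor of N values, it returns N+1 values, starting with a zero followed by the running totals, so the last element is the grand total. A hedged usage sketch with illustrative values:

```cpp
#include <ATen/ATen.h>

// Same signature as the function in the diff; shown here only to illustrate behavior.
at::Tensor asynchronous_complete_cumsum(at::Tensor t_in);

void offsets_from_lengths_example() {
  // Three segments of length 1 each, as a CUDA long tensor.
  auto lengths = at::ones(
      {3}, at::TensorOptions().dtype(at::kLong).device(at::kCUDA));
  auto offsets = asynchronous_complete_cumsum(lengths);
  // offsets now holds {0, 1, 2, 3}: one extra element, with the total at the end.
}
```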

fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh

Lines changed: 24 additions & 0 deletions
@@ -132,3 +132,27 @@ transpose_embedding_input(
     at::Tensor indices,
     at::Tensor offsets,
     bool nobag = false);
+
+// Use these functions instead of directly calling cub functions
+// to reduce code size and compilation time.
+// Arguments are the same as cub::DeviceRadixSort::SortPairs
+#define DECL_RADIX_SORT_PAIRS_FN(KeyT, ValueT) \
+  cudaError_t radix_sort_pairs(                \
+      void* d_temp_storage,                    \
+      size_t& temp_storage_bytes,              \
+      const KeyT* d_keys_in,                   \
+      KeyT* d_keys_out,                        \
+      const ValueT* d_values_in,               \
+      ValueT* d_values_out,                    \
+      int num_items,                           \
+      int begin_bit = 0,                       \
+      int end_bit = sizeof(KeyT) * 8,          \
+      cudaStream_t stream = 0,                 \
+      bool debug_synchronous = false)
+
+DECL_RADIX_SORT_PAIRS_FN(int64_t, float);
+DECL_RADIX_SORT_PAIRS_FN(int64_t, double);
+DECL_RADIX_SORT_PAIRS_FN(int64_t, int64_t);
+DECL_RADIX_SORT_PAIRS_FN(int64_t, int32_t);
+
+#undef DECL_RADIX_SORT_PAIRS_FN
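Spelled out, `DECL_RADIX_SORT_PAIRS_FN(int64_t, float)` declares the following overload (the other three lines declare the analogous `double`, `int64_t`, and `int32_t` value-type overloads); the default arguments mirror `cub::DeviceRadixSort::SortPairs`:

```cpp
cudaError_t radix_sort_pairs(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    const int64_t* d_keys_in,
    int64_t* d_keys_out,
    const float* d_values_in,
    float* d_values_out,
    int num_items,
    int begin_bit = 0,
    int end_bit = sizeof(int64_t) * 8,
    cudaStream_t stream = 0,
    bool debug_synchronous = false);
```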

fbgemm_gpu/src/split_embeddings_utils.cu

Lines changed: 77 additions & 0 deletions
@@ -10,6 +10,51 @@
 #include <c10/cuda/CUDAStream.h>
 #include "fbgemm_gpu/embedding_backward_template_helpers.cuh"

+// clang-format off
+#include "fbgemm_gpu/cub_namespace_prefix.cuh"
+#include <cub/device/device_radix_sort.cuh>
+#include <cub/device/device_run_length_encode.cuh>
+#include <cub/device/device_scan.cuh>
+#include "fbgemm_gpu/cub_namespace_postfix.cuh"
+// clang-format on
+
+inline at::Tensor asynchronous_complete_cumsum(at::Tensor t_in) {
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(t_in.get_device());
+  size_t temp_storage_bytes = 0;
+  TORCH_CHECK(t_in.is_contiguous());
+  TORCH_CHECK(t_in.dtype() == at::kInt || t_in.dtype() == at::kLong);
+  // CUB only handles up to INT_MAX elements.
+  TORCH_CHECK(t_in.numel() < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(t_in.dim() == 1);
+  auto t_out = at::empty({t_in.numel() + 1}, t_in.options());
+  t_out[0].zero_();
+  AT_DISPATCH_INTEGRAL_TYPES(
+      t_in.scalar_type(), "cub_inclusive_sum_wrapper1", ([&] {
+        AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceScan::InclusiveSum(
+            nullptr,
+            temp_storage_bytes,
+            t_in.data_ptr<scalar_t>(),
+            t_out.data_ptr<scalar_t>() + 1,
+            t_in.numel(),
+            at::cuda::getCurrentCUDAStream()));
+      }));
+  auto temp_storage = at::empty(
+      {static_cast<int64_t>(temp_storage_bytes)},
+      t_in.options().dtype(at::kByte));
+  AT_DISPATCH_INTEGRAL_TYPES(
+      t_in.scalar_type(), "cub_inclusive_sum_wrapper2", ([&] {
+        AT_CUDA_CHECK(FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceScan::InclusiveSum(
+            temp_storage.data_ptr(),
+            temp_storage_bytes,
+            t_in.data_ptr<scalar_t>(),
+            t_out.data_ptr<scalar_t>() + 1,
+            t_in.numel(),
+            at::cuda::getCurrentCUDAStream()));
+      }));
+  return t_out;
+}
+
 using Tensor = at::Tensor;

 using namespace fbgemm_gpu;
@@ -227,3 +272,35 @@ transpose_embedding_input(
       sorted_linear_indices_num_runs,
       sorted_linear_indices_cumulative_run_lengths};
 }
+
+#define DEF_RADIX_SORT_PAIRS_FN(KeyT, ValueT)                        \
+  cudaError_t radix_sort_pairs(                                      \
+      void* d_temp_storage,                                          \
+      size_t& temp_storage_bytes,                                    \
+      const KeyT* d_keys_in,                                         \
+      KeyT* d_keys_out,                                              \
+      const ValueT* d_values_in,                                     \
+      ValueT* d_values_out,                                          \
+      int num_items,                                                 \
+      int begin_bit,                                                 \
+      int end_bit,                                                   \
+      cudaStream_t stream,                                           \
+      bool debug_synchronous) {                                      \
+    return FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( \
+        d_temp_storage,                                              \
+        temp_storage_bytes,                                          \
+        d_keys_in,                                                   \
+        d_keys_out,                                                  \
+        d_values_in,                                                 \
+        d_values_out,                                                \
+        num_items,                                                   \
+        begin_bit,                                                   \
+        end_bit,                                                     \
+        stream,                                                      \
+        debug_synchronous);                                          \
+  }
+
+DEF_RADIX_SORT_PAIRS_FN(int64_t, float);
+DEF_RADIX_SORT_PAIRS_FN(int64_t, double);
+DEF_RADIX_SORT_PAIRS_FN(int64_t, int64_t);
+DEF_RADIX_SORT_PAIRS_FN(int64_t, int32_t);
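Each `DEF_RADIX_SORT_PAIRS_FN(...)` line, in turn, expands to a plain forwarding definition; for the `(int64_t, int32_t)` pair it is equivalent to the function below, so `cub::DeviceRadixSort::SortPairs` is instantiated only in this translation unit rather than in every generated backward file:

```cpp
cudaError_t radix_sort_pairs(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    const int64_t* d_keys_in,
    int64_t* d_keys_out,
    const int32_t* d_values_in,
    int32_t* d_values_out,
    int num_items,
    int begin_bit,
    int end_bit,
    cudaStream_t stream,
    bool debug_synchronous) {
  // Forward directly to cub; callers never need the cub headers.
  return FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs(
      d_temp_storage,
      temp_storage_bytes,
      d_keys_in,
      d_keys_out,
      d_values_in,
      d_values_out,
      num_items,
      begin_bit,
      end_bit,
      stream,
      debug_synchronous);
}
```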
