@@ -28,7 +28,41 @@ __global__ void saturated_cast_kernel_single(
   }
 }

-template <int coarse_factor>
+template <int coarse_factor>
+__global__ void saturated_cast_kernel_double_coalesced_flat(
+    nv_bfloat162 const *__restrict input,
+    __nv_fp8x2_storage_t *__restrict output, const int numels,
+    __nv_fp8_interpretation_t out_dtype, nv_bfloat16 const *scaler) {
+  const int idx = (blockIdx.x * blockDim.x + threadIdx.x) * coarse_factor;
+  const int stride = 1;
+  const nv_bfloat162 scale_2 = {(*scaler), (*scaler)};
+
+  nv_bfloat162 scaled_inputs[coarse_factor];
+#pragma unroll
+  for (int i{0}; i < coarse_factor; ++i) {
+    const int temp_idx = idx + i;
+    if (temp_idx < numels) {
+      scaled_inputs[i] = input[temp_idx * stride];
+    }
+  }
+#pragma unroll
+  for (int i{0}; i < coarse_factor; ++i) {
+    const int temp_idx = idx + i;
+    if (temp_idx < numels) {
+      scaled_inputs[i] = __hmul2(scaled_inputs[i], scale_2);
+    }
+  }
+#pragma unroll
+  for (int i{0}; i < coarse_factor; ++i) {
+    const int temp_idx = idx + i;
+    if (temp_idx < numels) {
+      output[temp_idx * stride] = __nv_cvt_bfloat16raw2_to_fp8x2(
+          scaled_inputs[i], __nv_saturation_t::__NV_SATFINITE, out_dtype);
+    }
+  }
+}
+
+template <int coarse_factor>
 __global__ void saturated_cast_kernel_double_coalesced(
     nv_bfloat162 const *__restrict input,
     __nv_fp8x2_storage_t *__restrict output, int n_rows, int n_cols,
@@ -59,9 +93,8 @@ __global__ void saturated_cast_kernel_double_coalesced(
     const int temp_col = col + i;
     if (row < n_rows && temp_col < n_cols) {
       output[row * row_stride + temp_col * col_stride] =
-          __nv_cvt_bfloat16raw2_to_fp8x2(scaled_inputs[i],
-                                         __nv_saturation_t::__NV_SATFINITE,
-                                         out_dtype);
+          __nv_cvt_bfloat16raw2_to_fp8x2(
+              scaled_inputs[i], __nv_saturation_t::__NV_SATFINITE, out_dtype);
     }
   }
 }
@@ -84,8 +117,26 @@ void dispatch_best_kernel(const Tensor &input, const Tensor &output,
   const int n_cols = input.size(1);
   const int block_size_x = 32;
   const int block_size_y = 32;
-  if (n_cols % 2 == 0) {
-    // We cast to a 2x8 type, so we need to divide the number of columns by 2
+  const auto numel = input.numel();
+  int kernel_choice = 0;
+  if (numel % 2 == 0 && !transpose) {
+    kernel_choice = 2;
+  } else if (n_cols % 2 == 0) {
+    kernel_choice = 1;
+  }
+  switch (kernel_choice) {
+  case 0: {
+    const dim3 block(block_size_x, block_size_y);
+    const dim3 grid(ceil_div(n_cols, block_size_x),
+                    ceil_div(n_rows, block_size_y));
+    saturated_cast_kernel_single<<<grid, block>>>(
+        static_cast<nv_bfloat16 *>(input.data_ptr()),
+        static_cast<__nv_fp8_storage_t *>(output.data_ptr()), n_rows, n_cols,
+        out_dtype, static_cast<nv_bfloat16 *>(scale.data_ptr()));
+    break;
+  }
+  case 1: {
+    // We cast to a 16x2 type, so we need to divide the number of columns by 2
     const auto packed_col_size = n_cols / 2;
     // Found 4 to be the best factor for the coalesced kernel
     const int coarse_factor = 4;
@@ -97,14 +148,20 @@ void dispatch_best_kernel(const Tensor &input, const Tensor &output,
         static_cast<__nv_fp8x2_storage_t *>(output.data_ptr()), n_rows,
         packed_col_size, out_dtype,
         static_cast<nv_bfloat16 *>(scale.data_ptr()));
-  } else {
-    const dim3 block(block_size_x, block_size_y);
-    const dim3 grid(ceil_div(n_cols, block_size_x),
-                    ceil_div(n_rows, block_size_y));
-    saturated_cast_kernel_single<<<grid, block>>>(
-        static_cast<nv_bfloat16 *>(input.data_ptr()),
-        static_cast<__nv_fp8_storage_t *>(output.data_ptr()), n_rows, n_cols,
+    break;
+  }
+  case 2: {
+    const int coarse_factor = 4;
+    const dim3 block(256);
+    const int packed_numel = numel / 2;
+    // We divide numel by 2 because we are casting to a 16x2 type
+    const dim3 grid(ceil_div(packed_numel, block.x * coarse_factor));
+    saturated_cast_kernel_double_coalesced_flat<coarse_factor><<<grid, block>>>(
+        static_cast<nv_bfloat162 *>(input.data_ptr()),
+        static_cast<__nv_fp8x2_storage_t *>(output.data_ptr()), packed_numel,
         out_dtype, static_cast<nv_bfloat16 *>(scale.data_ptr()));
+    break;
+  }
   }
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
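
Not part of the commit, but as a reading aid: the three unrolled loops in saturated_cast_kernel_double_coalesced_flat amount to the per-pair operation sketched below. The helper name is hypothetical, and it assumes the same cuda_bf16.h / cuda_fp8.h headers the kernel file already uses: multiply both bf16 lanes by the broadcast scale, then convert the pair to two fp8 values with saturate-to-finite handling of out-of-range results.

// Hypothetical device helper, not in the commit; assumes cuda_bf16.h and
// cuda_fp8.h are included, as they are for the kernels above.
__device__ __forceinline__ __nv_fp8x2_storage_t
scale_and_cast_pair(nv_bfloat162 v, nv_bfloat162 scale_2,
                    __nv_fp8_interpretation_t out_dtype) {
  // Multiply both bf16 lanes by the scale in one instruction.
  const nv_bfloat162 scaled = __hmul2(v, scale_2);
  // Convert the scaled pair to two packed fp8 values, saturating values that
  // do not fit the target format to its largest finite value.
  return __nv_cvt_bfloat16raw2_to_fp8x2(
      scaled, __nv_saturation_t::__NV_SATFINITE, out_dtype);
}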
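
Also not from the commit: a minimal host-side sketch of the case-2 launch math, assuming the flat kernel above is in scope, a ceil_div helper like the one dispatch_best_kernel uses, a contiguous bf16 buffer with an even element count, and a 2-byte-aligned fp8 output buffer. Each thread handles coarse_factor packed nv_bfloat162 values, so a block of 256 threads covers 256 * 4 = 1024 packed elements and the grid is sized to the round-up of packed_numel / 1024.

// Hypothetical standalone launcher mirroring case 2 of dispatch_best_kernel.
#include <cuda_bf16.h>
#include <cuda_fp8.h>

constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

// Assumes numel is even and output is 2-byte aligned, so the buffers can be
// reinterpreted as nv_bfloat162 / __nv_fp8x2_storage_t pairs.
void launch_flat_cast(const nv_bfloat16 *input, __nv_fp8_storage_t *output,
                      int numel, const nv_bfloat16 *device_scale,
                      __nv_fp8_interpretation_t out_dtype) {
  constexpr int coarse_factor = 4;     // same coarsening factor as the commit
  const dim3 block(256);
  const int packed_numel = numel / 2;  // bf16 elements -> bf16x2 pairs
  const dim3 grid(ceil_div(packed_numel, block.x * coarse_factor));
  saturated_cast_kernel_double_coalesced_flat<coarse_factor><<<grid, block>>>(
      reinterpret_cast<const nv_bfloat162 *>(input),
      reinterpret_cast<__nv_fp8x2_storage_t *>(output), packed_numel,
      out_dtype, device_scale);
}

The temp_idx < numels bounds check inside the kernel covers the final partially filled block, so the rounded-up grid never reads or writes past the packed element count.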