[libc][gpu] Add Atan2 Benchmarks (#104708)

jameshu15869 · web-flow · commit deb6b45c3268 · 2024-08-18T12:50:30.000-05:00
This PR adds benchmarking for `atan2()`, `__nv_atan2()`, and
`__ocml_atan2_f64()` using the same setup as `sin()`. This PR also adds
support for throughout bencmarking for functions with 2 inputs.
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@@ -115,7 +115,7 @@ void print_results(Benchmark *b) {
   cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
 
   LIBC_NAMESPACE::printf(
-      "%-20s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
+      "%-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
       b->get_test_name().data(), result.cycles, result.min, result.max,
       result.total_iterations, result.total_time, time_unit,
       static_cast<uint64_t>(result.standard_deviation), num_threads);
@@ -127,7 +127,7 @@ void print_header() {
                          benchmarks[0]->get_suite_name().data());
   LIBC_NAMESPACE::printf("%s", RESET);
   cpp::string titles =
-      "Benchmark            |  Cycles |     Min |     Max | "
+      "Benchmark                |  Cycles |     Min |     Max | "
       "Iterations | Time / Iteration |   Stddev |  Threads |\n";
   LIBC_NAMESPACE::printf(titles.data());
 
diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
@@ -146,10 +146,8 @@ template <typename T> class MathPerf {
       cpp::numeric_limits<StorageType>::max();
 
 public:
-  typedef T Func(T);
-
   template <size_t N = 1>
-  static uint64_t run_throughput_in_range(Func f, int min_exp, int max_exp) {
+  static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) {
     cpp::array<T, N> inputs;
     for (size_t i = 0; i < N; ++i)
       inputs[i] = get_rand_input<T>(min_exp, max_exp);
@@ -158,6 +156,23 @@ template <typename T> class MathPerf {
 
     return total_time / N;
   }
+
+  // Throughput benchmarking for functions that take 2 inputs.
+  template <size_t N = 1>
+  static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
+                                          int arg1_max_exp, int arg2_min_exp,
+                                          int arg2_max_exp) {
+    cpp::array<T, N> inputs1;
+    cpp::array<T, N> inputs2;
+    for (size_t i = 0; i < N; ++i) {
+      inputs1[i] = get_rand_input<T>(arg1_min_exp, arg1_max_exp);
+      inputs2[i] = get_rand_input<T>(arg2_min_exp, arg2_max_exp);
+    }
+
+    uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);
+
+    return total_time / N;
+  }
 };
 
 } // namespace benchmarks
diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
@@ -43,3 +43,22 @@ add_benchmark(
   LOADER_ARGS
     --threads 64
 )
+
+add_benchmark(
+  atan2_benchmark
+  SUITE
+    libc-gpu-math-benchmarks
+  SRCS
+    atan2_benchmark.cpp
+  DEPENDS
+    libc.src.math.atan2
+    libc.src.stdlib.srand
+    libc.src.stdlib.rand
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.CPP.bit
+    libc.src.__support.CPP.array
+  COMPILE_OPTIONS
+    ${math_benchmark_flags}
+  LOADER_ARGS
+    --threads 64
+)
diff --git a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
@@ -0,0 +1,47 @@
+#include "benchmarks/gpu/LibcGpuBenchmark.h"
+
+#include "src/math/atan2.h"
+#include "src/stdlib/rand.h"
+
+#ifdef NVPTX_MATH_FOUND
+#include "src/math/nvptx/declarations.h"
+#endif
+
+#ifdef AMDGPU_MATH_FOUND
+#include "src/math/amdgpu/declarations.h"
+#endif
+
+#define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N)                      \
+  []() {                                                                       \
+    return LIBC_NAMESPACE::benchmarks::MathPerf<T>::run_throughput_in_range<   \
+        N>(Func, MIN_EXP, MAX_EXP, MIN_EXP, MAX_EXP);                          \
+  }
+
+#define BENCH(T, Name, Func, MIN_EXP, MAX_EXP)                                 \
+  SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1,                   \
+                        BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1));    \
+  SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_128,                 \
+                        BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 128));  \
+  SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1024,                \
+                        BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1024)); \
+  SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_4096,                \
+                        BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 4096))
+
+BENCH(double, Atan2, LIBC_NAMESPACE::atan2, -1023, 1023);
+BENCH(double, Atan2TwoPi, LIBC_NAMESPACE::atan2, -10, 3);
+BENCH(double, Atan2TwoPow30, LIBC_NAMESPACE::atan2, 0, 30);
+BENCH(double, Atan2Large, LIBC_NAMESPACE::atan2, 30, 1000);
+
+#ifdef NVPTX_MATH_FOUND
+BENCH(double, NvAtan2, LIBC_NAMESPACE::__nv_atan2, -1023, 1023);
+BENCH(double, NvAtan2TwoPi, LIBC_NAMESPACE::__nv_atan2, -10, 3);
+BENCH(double, NvAtan2TwoPow30, LIBC_NAMESPACE::__nv_atan2, 0, 30);
+BENCH(double, NvAtan2Large, LIBC_NAMESPACE::__nv_atan2, 30, 1000);
+#endif
+
+#ifdef AMDGPU_MATH_FOUND
+BENCH(double, AmdAtan2, LIBC_NAMESPACE::__ocml_atan2_f64, -1023, 1023);
+BENCH(double, AmdAtan2TwoPi, LIBC_NAMESPACE::__ocml_atan2_f64, -10, 3);
+BENCH(double, AmdAtan2TwoPow30, LIBC_NAMESPACE::__ocml_atan2_f64, 0, 30);
+BENCH(double, AmdAtan2Large, LIBC_NAMESPACE::__ocml_atan2_f64, 30, 1000);
+#endif
diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@@ -130,6 +130,31 @@ throughput(F f, const cpp::array<T, N> &inputs) {
   return stop - start;
 }
 
+// Provides throughput benchmarking for 2 arguments (e.g. atan2())
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
+    F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+  asm("" ::"v"(&inputs1), "v"(&inputs2));
+
+  gpu::memory_fence();
+  uint64_t start = gpu::processor_clock();
+
+  asm("" ::"s"(start));
+
+  for (size_t i = 0; i < inputs1.size(); i++) {
+    auto result = f(inputs1[i], inputs2[i]);
+
+    asm("" ::"v"(result));
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  asm("" ::"s"(stop));
+  gpu::memory_fence();
+
+  // Return the time elapsed.
+  return stop - start;
+}
+
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
@@ -121,6 +121,32 @@ throughput(F f, const cpp::array<T, N> &inputs) {
   // Return the time elapsed.
   return stop - start;
 }
+
+// Provides throughput benchmarking for 2 arguments (e.g. atan2())
+template <typename F, typename T, size_t N>
+[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
+    F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
+  asm("" ::"r"(&inputs1), "r"(&inputs2));
+
+  gpu::memory_fence();
+  uint64_t start = gpu::processor_clock();
+
+  asm("" ::"llr"(start));
+
+  uint64_t result;
+  for (size_t i = 0; i < inputs1.size(); i++) {
+    result = f(inputs1[i], inputs2[i]);
+    asm("" ::"r"(result));
+  }
+
+  uint64_t stop = gpu::processor_clock();
+  gpu::memory_fence();
+  asm("" ::"r"(stop));
+  volatile auto output = result;
+
+  // Return the time elapsed.
+  return stop - start;
+}
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX