Skip to content

Commit deb6b45

Browse files
authored
[libc][gpu] Add Atan2 Benchmarks (#104708)
This PR adds benchmarking for `atan2()`, `__nv_atan2()`, and `__ocml_atan2_f64()` using the same setup as `sin()`. This PR also adds support for throughput benchmarking of functions with 2 inputs.
1 parent 5c13f9a commit deb6b45

File tree

6 files changed

+137
-5
lines changed

6 files changed

+137
-5
lines changed

libc/benchmarks/gpu/LibcGpuBenchmark.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ void print_results(Benchmark *b) {
115115
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
116116

117117
LIBC_NAMESPACE::printf(
118-
"%-20s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
118+
"%-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
119119
b->get_test_name().data(), result.cycles, result.min, result.max,
120120
result.total_iterations, result.total_time, time_unit,
121121
static_cast<uint64_t>(result.standard_deviation), num_threads);
@@ -127,7 +127,7 @@ void print_header() {
127127
benchmarks[0]->get_suite_name().data());
128128
LIBC_NAMESPACE::printf("%s", RESET);
129129
cpp::string titles =
130-
"Benchmark | Cycles | Min | Max | "
130+
"Benchmark | Cycles | Min | Max | "
131131
"Iterations | Time / Iteration | Stddev | Threads |\n";
132132
LIBC_NAMESPACE::printf(titles.data());
133133

libc/benchmarks/gpu/LibcGpuBenchmark.h

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -146,10 +146,8 @@ template <typename T> class MathPerf {
146146
cpp::numeric_limits<StorageType>::max();
147147

148148
public:
149-
typedef T Func(T);
150-
151149
template <size_t N = 1>
152-
static uint64_t run_throughput_in_range(Func f, int min_exp, int max_exp) {
150+
static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) {
153151
cpp::array<T, N> inputs;
154152
for (size_t i = 0; i < N; ++i)
155153
inputs[i] = get_rand_input<T>(min_exp, max_exp);
@@ -158,6 +156,23 @@ template <typename T> class MathPerf {
158156

159157
return total_time / N;
160158
}
159+
160+
// Throughput benchmarking for functions that take 2 inputs.
//
// Builds two arrays of N inputs, one per argument of `f`, by calling
// get_rand_input<T> with that argument's own exponent bounds, then times `f`
// over all N input pairs with LIBC_NAMESPACE::throughput.
//
// Template parameter N: number of input pairs to generate (defaults to 1).
// f:                    the two-argument function under test.
// arg1_min_exp/arg1_max_exp: bounds forwarded to get_rand_input<T> for the
//                       first argument.
// arg2_min_exp/arg2_max_exp: bounds forwarded to get_rand_input<T> for the
//                       second argument.
// Returns the value reported by throughput() divided by N (integer division),
// i.e. an average per input pair.
// NOTE(review): get_rand_input is defined outside this hunk; presumably it
// draws a random T whose exponent lies in [min_exp, max_exp] -- confirm.
161+
template <size_t N = 1>
162+
static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
163+
int arg1_max_exp, int arg2_min_exp,
164+
int arg2_max_exp) {
165+
cpp::array<T, N> inputs1;
166+
cpp::array<T, N> inputs2;
167+
for (size_t i = 0; i < N; ++i) {
168+
inputs1[i] = get_rand_input<T>(arg1_min_exp, arg1_max_exp);
169+
inputs2[i] = get_rand_input<T>(arg2_min_exp, arg2_max_exp);
170+
}
171+
172+
// throughput() returns the elapsed count for the whole batch of N calls.
uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);
173+
174+
return total_time / N;
175+
}
161176
};
162177

163178
} // namespace benchmarks

libc/benchmarks/gpu/src/math/CMakeLists.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,22 @@ add_benchmark(
4343
LOADER_ARGS
4444
--threads 64
4545
)
46+
47+
# Registers the atan2 GPU benchmark: target `atan2_benchmark`, built from
# atan2_benchmark.cpp and added to the libc-gpu-math-benchmarks suite. It is
# compiled with ${math_benchmark_flags} and passes `--threads 64` as loader
# arguments, mirroring the add_benchmark() entry that precedes it in this file.
add_benchmark(
48+
atan2_benchmark
49+
SUITE
50+
libc-gpu-math-benchmarks
51+
SRCS
52+
atan2_benchmark.cpp
53+
DEPENDS
54+
libc.src.math.atan2
55+
libc.src.stdlib.srand
56+
libc.src.stdlib.rand
57+
libc.src.__support.FPUtil.fp_bits
58+
libc.src.__support.CPP.bit
59+
libc.src.__support.CPP.array
60+
COMPILE_OPTIONS
61+
${math_benchmark_flags}
62+
LOADER_ARGS
63+
--threads 64
64+
)
libc/benchmarks/gpu/src/math/atan2_benchmark.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#include "benchmarks/gpu/LibcGpuBenchmark.h"
2+
3+
#include "src/math/atan2.h"
4+
#include "src/stdlib/rand.h"
5+
6+
#ifdef NVPTX_MATH_FOUND
7+
#include "src/math/nvptx/declarations.h"
8+
#endif
9+
10+
#ifdef AMDGPU_MATH_FOUND
11+
#include "src/math/amdgpu/declarations.h"
12+
#endif
13+
14+
// Expands to a captureless lambda that runs the two-input
// MathPerf<T>::run_throughput_in_range<N> on Func and returns its result.
// The same [MIN_EXP, MAX_EXP] pair is passed for both arguments, so the two
// inputs of Func are always generated with identical exponent bounds.
//   T       - floating-point type used to instantiate MathPerf.
//   Func    - two-argument function to benchmark.
//   MIN_EXP - lower exponent bound, used for both arguments.
//   MAX_EXP - upper exponent bound, used for both arguments.
//   N       - number of input pairs per run.
#define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N) \
15+
[]() { \
16+
return LIBC_NAMESPACE::benchmarks::MathPerf<T>::run_throughput_in_range< \
17+
N>(Func, MIN_EXP, MAX_EXP, MIN_EXP, MAX_EXP); \
18+
}
19+
20+
// Declares four SINGLE_WAVE_BENCHMARK entries in the
// LlvmLibcAtan2GpuBenchmark suite for one (Func, exponent range) combination.
// The entries are named Name_1, Name_128, Name_1024 and Name_4096 and differ
// only in the number of input pairs (1, 128, 1024, 4096) handed to
// BM_TWO_RANDOM_INPUT. The expansion has no trailing semicolon, so each use
// of BENCH must supply one.
// SINGLE_WAVE_BENCHMARK itself is defined outside this file.
#define BENCH(T, Name, Func, MIN_EXP, MAX_EXP) \
21+
SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1, \
22+
BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1)); \
23+
SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_128, \
24+
BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 128)); \
25+
SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1024, \
26+
BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1024)); \
27+
SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_4096, \
28+
BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 4096))
29+
30+
// LLVM libc atan2 on double, benchmarked over four exponent ranges
// (MIN_EXP, MAX_EXP): the widest range (-1023, 1023), then (-10, 3),
// (0, 30) and (30, 1000).
BENCH(double, Atan2, LIBC_NAMESPACE::atan2, -1023, 1023);
31+
BENCH(double, Atan2TwoPi, LIBC_NAMESPACE::atan2, -10, 3);
32+
BENCH(double, Atan2TwoPow30, LIBC_NAMESPACE::atan2, 0, 30);
33+
BENCH(double, Atan2Large, LIBC_NAMESPACE::atan2, 30, 1000);
34+
35+
// Same four exponent ranges for __nv_atan2, only when NVPTX_MATH_FOUND is
// defined (the declarations header is included under the same guard above).
#ifdef NVPTX_MATH_FOUND
36+
BENCH(double, NvAtan2, LIBC_NAMESPACE::__nv_atan2, -1023, 1023);
37+
BENCH(double, NvAtan2TwoPi, LIBC_NAMESPACE::__nv_atan2, -10, 3);
38+
BENCH(double, NvAtan2TwoPow30, LIBC_NAMESPACE::__nv_atan2, 0, 30);
39+
BENCH(double, NvAtan2Large, LIBC_NAMESPACE::__nv_atan2, 30, 1000);
40+
#endif
41+
42+
// Same four exponent ranges for __ocml_atan2_f64, only when
// AMDGPU_MATH_FOUND is defined.
#ifdef AMDGPU_MATH_FOUND
43+
BENCH(double, AmdAtan2, LIBC_NAMESPACE::__ocml_atan2_f64, -1023, 1023);
44+
BENCH(double, AmdAtan2TwoPi, LIBC_NAMESPACE::__ocml_atan2_f64, -10, 3);
45+
BENCH(double, AmdAtan2TwoPow30, LIBC_NAMESPACE::__ocml_atan2_f64, 0, 30);
46+
BENCH(double, AmdAtan2Large, LIBC_NAMESPACE::__ocml_atan2_f64, 30, 1000);
47+
#endif

libc/benchmarks/gpu/timing/amdgpu/timing.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,31 @@ throughput(F f, const cpp::array<T, N> &inputs) {
130130
return stop - start;
131131
}
132132

133+
// Provides throughput benchmarking for 2 arguments (e.g. atan2())
//
// Calls f(inputs1[i], inputs2[i]) for every index of inputs1 and returns the
// difference between two gpu::processor_clock() readings taken immediately
// before and after the loop. The value covers the whole batch; callers that
// want a per-call figure must divide by the number of inputs themselves.
// Both arrays have the same compile-time length N.
//
// The empty asm statements take a value as an input operand and emit no
// instructions; presumably they are there so the compiler has to materialize
// that value at that point and cannot drop or reorder the surrounding work
// out of the timed region. NOTE(review): "v" and "s" look like the AMDGPU
// vector/scalar register constraints -- confirm against the LLVM AMDGPU docs.
134+
template <typename F, typename T, size_t N>
135+
[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
136+
F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
137+
// Pass the addresses of both input arrays through an empty asm.
asm("" ::"v"(&inputs1), "v"(&inputs2));
138+
139+
gpu::memory_fence();
140+
uint64_t start = gpu::processor_clock();
141+
142+
asm("" ::"s"(start));
143+
144+
for (size_t i = 0; i < inputs1.size(); i++) {
145+
auto result = f(inputs1[i], inputs2[i]);
146+
147+
// Consume each result so the call to f is not dead code.
asm("" ::"v"(result));
148+
}
149+
150+
uint64_t stop = gpu::processor_clock();
151+
asm("" ::"s"(stop));
152+
gpu::memory_fence();
153+
154+
// Return the time elapsed.
155+
return stop - start;
156+
}
157+
133158
} // namespace LIBC_NAMESPACE_DECL
134159

135160
#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU

libc/benchmarks/gpu/timing/nvptx/timing.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,32 @@ throughput(F f, const cpp::array<T, N> &inputs) {
121121
// Return the time elapsed.
122122
return stop - start;
123123
}
124+
125+
// Provides throughput benchmarking for 2 arguments (e.g. atan2())
//
// Calls f(inputs1[i], inputs2[i]) for every index of inputs1 and returns the
// difference between two gpu::processor_clock() readings taken immediately
// before and after the loop. The value covers the whole batch; callers that
// want a per-call figure must divide by the number of inputs themselves.
// Both arrays have the same compile-time length N.
//
// The empty asm statements take a value as an input operand and emit no
// instructions; presumably they are there so the compiler has to materialize
// that value at that point and cannot drop or reorder the surrounding work
// out of the timed region.
126+
template <typename F, typename T, size_t N>
127+
[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
128+
F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
129+
// Pass the addresses of both input arrays through an empty asm.
asm("" ::"r"(&inputs1), "r"(&inputs2));
130+
131+
gpu::memory_fence();
132+
uint64_t start = gpu::processor_clock();
133+
134+
asm("" ::"llr"(start));
135+
136+
// NOTE(review): `result` is a uint64_t, so whatever f returns is converted
// to an integer inside the timed loop. The atan2 benchmarks in this commit
// instantiate this with double, which means a float-to-integer conversion is
// measured along with f -- confirm this is intended. `result` is also left
// uninitialized and still read below if the input arrays are empty.
uint64_t result;
137+
for (size_t i = 0; i < inputs1.size(); i++) {
138+
result = f(inputs1[i], inputs2[i]);
139+
// Consume each result so the call to f is not dead code.
asm("" ::"r"(result));
140+
}
141+
142+
uint64_t stop = gpu::processor_clock();
143+
// NOTE(review): here the fence comes before the asm on `stop`; the AMDGPU
// version of this routine does them in the opposite order -- confirm the
// difference is deliberate.
gpu::memory_fence();
144+
asm("" ::"r"(stop));
145+
// Store the last result to a volatile local, in addition to the asm uses in
// the loop.
volatile auto output = result;
146+
147+
// Return the time elapsed.
148+
return stop - start;
149+
}
124150
} // namespace LIBC_NAMESPACE_DECL
125151

126152
#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX

0 commit comments

Comments
 (0)