Skip to content

[libc][gpu] Add Atan2 Benchmarks #104708

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Aug 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions libc/benchmarks/gpu/LibcGpuBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ void print_results(Benchmark *b) {
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);

LIBC_NAMESPACE::printf(
"%-20s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
"%-24s |%8ld |%8ld |%8ld |%11d |%14ld %2s |%9ld |%9d |\n",
b->get_test_name().data(), result.cycles, result.min, result.max,
result.total_iterations, result.total_time, time_unit,
static_cast<uint64_t>(result.standard_deviation), num_threads);
Expand All @@ -127,7 +127,7 @@ void print_header() {
benchmarks[0]->get_suite_name().data());
LIBC_NAMESPACE::printf("%s", RESET);
cpp::string titles =
"Benchmark | Cycles | Min | Max | "
"Benchmark | Cycles | Min | Max | "
"Iterations | Time / Iteration | Stddev | Threads |\n";
LIBC_NAMESPACE::printf(titles.data());

Expand Down
21 changes: 18 additions & 3 deletions libc/benchmarks/gpu/LibcGpuBenchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,8 @@ template <typename T> class MathPerf {
cpp::numeric_limits<StorageType>::max();

public:
typedef T Func(T);

template <size_t N = 1>
static uint64_t run_throughput_in_range(Func f, int min_exp, int max_exp) {
static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm guessing you needed the f(T) so overloading worked?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, using the typedef meant it couldn't take in both types of functions

cpp::array<T, N> inputs;
for (size_t i = 0; i < N; ++i)
inputs[i] = get_rand_input<T>(min_exp, max_exp);
Expand All @@ -158,6 +156,23 @@ template <typename T> class MathPerf {

return total_time / N;
}

// Two-argument overload of the throughput benchmark (e.g. for atan2()).
// Fills two N-element arrays via get_rand_input<T>(), using the arg1 exponent
// bounds for the first operand and the arg2 bounds for the second, hands both
// arrays to LIBC_NAMESPACE::throughput(), and returns the reported time
// divided by N.
template <size_t N = 1>
static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
                                        int arg1_max_exp, int arg2_min_exp,
                                        int arg2_max_exp) {
  cpp::array<T, N> lhs_args;
  cpp::array<T, N> rhs_args;

  // Draw the operands pair by pair so the sequence of get_rand_input calls
  // alternates between the first and the second argument.
  size_t idx = 0;
  while (idx < N) {
    lhs_args[idx] = get_rand_input<T>(arg1_min_exp, arg1_max_exp);
    rhs_args[idx] = get_rand_input<T>(arg2_min_exp, arg2_max_exp);
    ++idx;
  }

  // Time reported for all N calls, normalised to a single call.
  const uint64_t elapsed = LIBC_NAMESPACE::throughput(f, lhs_args, rhs_args);
  return elapsed / N;
}
};

} // namespace benchmarks
Expand Down
19 changes: 19 additions & 0 deletions libc/benchmarks/gpu/src/math/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,22 @@ add_benchmark(
LOADER_ARGS
--threads 64
)

# Registers the atan2 benchmark (atan2_benchmark.cpp) in the
# libc-gpu-math-benchmarks suite. It is compiled with ${math_benchmark_flags}
# and "--threads 64" is passed through LOADER_ARGS.
add_benchmark(
atan2_benchmark
SUITE
libc-gpu-math-benchmarks
SRCS
atan2_benchmark.cpp
DEPENDS
libc.src.math.atan2
libc.src.stdlib.srand
libc.src.stdlib.rand
libc.src.__support.FPUtil.fp_bits
libc.src.__support.CPP.bit
libc.src.__support.CPP.array
COMPILE_OPTIONS
${math_benchmark_flags}
LOADER_ARGS
--threads 64
)
47 changes: 47 additions & 0 deletions libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#include "benchmarks/gpu/LibcGpuBenchmark.h"

#include "src/math/atan2.h"
#include "src/stdlib/rand.h"

#ifdef NVPTX_MATH_FOUND
#include "src/math/nvptx/declarations.h"
#endif

#ifdef AMDGPU_MATH_FOUND
#include "src/math/amdgpu/declarations.h"
#endif

// Expands to a capture-less lambda that runs the two-argument
// MathPerf<T>::run_throughput_in_range<N> on Func and returns its result.
// The same [MIN_EXP, MAX_EXP] bounds are forwarded for both arguments.
#define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N) \
[]() { \
return LIBC_NAMESPACE::benchmarks::MathPerf<T>::run_throughput_in_range< \
N>(Func, MIN_EXP, MAX_EXP, MIN_EXP, MAX_EXP); \
}

// Declares four SINGLE_WAVE_BENCHMARK entries in the LlvmLibcAtan2GpuBenchmark
// suite for Func, named Name_1, Name_128, Name_1024 and Name_4096. The suffix
// is the N (number of input pairs) given to BM_TWO_RANDOM_INPUT.
// No trailing semicolon on the last entry: the use site supplies it.
#define BENCH(T, Name, Func, MIN_EXP, MAX_EXP) \
SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1, \
BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1)); \
SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_128, \
BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 128)); \
SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_1024, \
BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1024)); \
SINGLE_WAVE_BENCHMARK(LlvmLibcAtan2GpuBenchmark, Name##_4096, \
BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 4096))

// LIBC_NAMESPACE::atan2 on doubles over four (MIN_EXP, MAX_EXP) ranges. Each
// BENCH line expands to four benchmarks (1, 128, 1024 and 4096 input pairs).
BENCH(double, Atan2, LIBC_NAMESPACE::atan2, -1023, 1023);
BENCH(double, Atan2TwoPi, LIBC_NAMESPACE::atan2, -10, 3);
BENCH(double, Atan2TwoPow30, LIBC_NAMESPACE::atan2, 0, 30);
BENCH(double, Atan2Large, LIBC_NAMESPACE::atan2, 30, 1000);

// Same ranges against __nv_atan2 (presumably the NVIDIA vendor
// implementation); only built when NVPTX_MATH_FOUND is defined.
#ifdef NVPTX_MATH_FOUND
BENCH(double, NvAtan2, LIBC_NAMESPACE::__nv_atan2, -1023, 1023);
BENCH(double, NvAtan2TwoPi, LIBC_NAMESPACE::__nv_atan2, -10, 3);
BENCH(double, NvAtan2TwoPow30, LIBC_NAMESPACE::__nv_atan2, 0, 30);
BENCH(double, NvAtan2Large, LIBC_NAMESPACE::__nv_atan2, 30, 1000);
#endif

// Same ranges against __ocml_atan2_f64 (presumably the AMD vendor
// implementation); only built when AMDGPU_MATH_FOUND is defined.
#ifdef AMDGPU_MATH_FOUND
BENCH(double, AmdAtan2, LIBC_NAMESPACE::__ocml_atan2_f64, -1023, 1023);
BENCH(double, AmdAtan2TwoPi, LIBC_NAMESPACE::__ocml_atan2_f64, -10, 3);
BENCH(double, AmdAtan2TwoPow30, LIBC_NAMESPACE::__ocml_atan2_f64, 0, 30);
BENCH(double, AmdAtan2Large, LIBC_NAMESPACE::__ocml_atan2_f64, 30, 1000);
#endif
25 changes: 25 additions & 0 deletions libc/benchmarks/gpu/timing/amdgpu/timing.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,31 @@ throughput(F f, const cpp::array<T, N> &inputs) {
return stop - start;
}

// Provides throughput benchmarking for 2 arguments (e.g. atan2())
// Calls f(inputs1[i], inputs2[i]) for every index of inputs1 between two
// gpu::processor_clock() reads and returns the difference of the two reads,
// i.e. the total for all calls rather than a per-call value.
template <typename F, typename T, size_t N>
[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
// Empty asm that takes both array addresses as input operands; presumably
// this makes the inputs opaque to the optimizer before timing starts.
asm("" ::"v"(&inputs1), "v"(&inputs2));

gpu::memory_fence();
uint64_t start = gpu::processor_clock();

// Empty asm consuming the start timestamp; presumably pins the clock read
// ahead of the measured loop.
asm("" ::"s"(start));

for (size_t i = 0; i < inputs1.size(); i++) {
auto result = f(inputs1[i], inputs2[i]);

// Consume each result as an asm operand so the call is not discarded as
// dead code (intent inferred from the empty asm body).
asm("" ::"v"(result));
}

uint64_t stop = gpu::processor_clock();
asm("" ::"s"(stop));
gpu::memory_fence();

// Return the time elapsed.
return stop - start;
}

} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
26 changes: 26 additions & 0 deletions libc/benchmarks/gpu/timing/nvptx/timing.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,32 @@ throughput(F f, const cpp::array<T, N> &inputs) {
// Return the time elapsed.
return stop - start;
}

// Provides throughput benchmarking for 2 arguments (e.g. atan2()).
//
// Calls f(inputs1[i], inputs2[i]) for every index of inputs1 between two
// gpu::processor_clock() reads and returns the difference of the two reads,
// i.e. the total for all calls rather than a per-call value.
template <typename F, typename T, size_t N>
[[gnu::noinline]] static LIBC_INLINE uint64_t throughput(
    F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
  // Empty asm that takes both array addresses as input operands; presumably
  // this makes the inputs opaque to the optimizer before timing starts.
  asm("" ::"r"(&inputs1), "r"(&inputs2));

  gpu::memory_fence();
  uint64_t start = gpu::processor_clock();

  asm("" ::"llr"(start));

  // Hold the result in f's own return type. Storing a floating-point result
  // in a uint64_t would add a float-to-integer conversion to the timed loop,
  // and that conversion is undefined for results <= -1 (atan2 yields values
  // in [-pi, pi]). Value-initialized so the volatile copy below never reads an
  // indeterminate value, even if the loop body does not run.
  // NOTE(review): the "r" constraint is kept as-is for the result operand;
  // confirm it is accepted for floating-point operands on the NVPTX toolchain.
  decltype(f(inputs1[0], inputs2[0])) result{};
  for (size_t i = 0; i < inputs1.size(); i++) {
    result = f(inputs1[i], inputs2[i]);
    asm("" ::"r"(result));
  }

  uint64_t stop = gpu::processor_clock();
  gpu::memory_fence();
  asm("" ::"r"(stop));
  // Volatile copy of the last result so the computation has an observable use.
  volatile auto output = result;

  // Return the time elapsed.
  return stop - start;
}
} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
Loading