From 0cdf078dedbf918b54d2087bc763acfe703bd6ce Mon Sep 17 00:00:00 2001 From: antoine moynault Date: Fri, 17 Nov 2023 11:14:18 +0100 Subject: [PATCH] Revert "[MicroBenchmark,LoopInterleaving] Check performance impact of Loop Interleaving Count with varying loop iterations (#26)" This reverts commit eda2d6cf2414bcbde7c77f47a72db2a8a0f06ab6. --- .../LoopVectorization/CMakeLists.txt | 9 - .../LoopVectorization/LoopInterleaving.cpp | 403 ------------------ 2 files changed, 412 deletions(-) delete mode 100644 MicroBenchmarks/LoopVectorization/LoopInterleaving.cpp diff --git a/MicroBenchmarks/LoopVectorization/CMakeLists.txt b/MicroBenchmarks/LoopVectorization/CMakeLists.txt index 1c7a03eada..6bb5a2e3b1 100644 --- a/MicroBenchmarks/LoopVectorization/CMakeLists.txt +++ b/MicroBenchmarks/LoopVectorization/CMakeLists.txt @@ -17,12 +17,3 @@ llvm_test_executable(LoopVectorizationBenchmarks ) target_link_libraries(LoopVectorizationBenchmarks benchmark) - -llvm_test_run() - -llvm_test_executable(LoopInterleavingBenchmarks - main.cpp - LoopInterleaving.cpp -) - -target_link_libraries(LoopInterleavingBenchmarks benchmark) diff --git a/MicroBenchmarks/LoopVectorization/LoopInterleaving.cpp b/MicroBenchmarks/LoopVectorization/LoopInterleaving.cpp deleted file mode 100644 index 2e8111c9a3..0000000000 --- a/MicroBenchmarks/LoopVectorization/LoopInterleaving.cpp +++ /dev/null @@ -1,403 +0,0 @@ -// This program tests performance impact of Interleaving Count with varying loop -// iteration count for different types of loops, such as loops with or -// without reductions inside it, loops with different vectorization widths. -#include -#include -#include - -#include "benchmark/benchmark.h" - -#define ELEMENTS 2048 -#define ALIGNED16 __attribute__((aligned(16))) - -static std::mt19937 rng; -unsigned int g_sum = 0; - -int A[ELEMENTS] ALIGNED16; -int B[ELEMENTS] ALIGNED16; -int C[ELEMENTS] ALIGNED16; -int D[ELEMENTS] ALIGNED16; -int E[ELEMENTS] ALIGNED16; -int F[ELEMENTS] ALIGNED16; - -// Initialize arrays with random numbers. -static void init_data(unsigned N) { - std::uniform_int_distribution distrib(std::numeric_limits::min(), - std::numeric_limits::max()); - for (unsigned I = 0; I < N; I++) { - A[I] = distrib(rng); - B[I] = distrib(rng); - C[I] = distrib(rng); - D[I] = distrib(rng); - E[I] = distrib(rng); - F[I] = distrib(rng); - } -} - -static void __attribute__((always_inline)) -runBenchForLoopInterleaving(benchmark::State &state, int (*Fn)(int), - int Iterations) { - std::uniform_int_distribution distrib(std::numeric_limits::min(), - std::numeric_limits::max()); - init_data(ELEMENTS); - for (auto _ : state) { - benchmark::DoNotOptimize(A); - benchmark::DoNotOptimize(B); - benchmark::DoNotOptimize(C); - benchmark::DoNotOptimize(D); - benchmark::DoNotOptimize(E); - benchmark::DoNotOptimize(F); - benchmark::ClobberMemory(); - g_sum += Fn(Iterations); - } -} - -#define STRINGIFY(a) #a - -// Loops without Reduction with different vectorization configurations - -static int __attribute__((noinline)) loopNoReductionAutoVec(int Iterations) { -#pragma clang loop unroll(disable) - for (int J = 0; J < Iterations; J++) { - A[J] = B[J] + C[J]; - } - return 0; -} - -static int __attribute__((noinline)) bigLoopNoReductionAutoVec(int Iterations) { -#pragma clang loop unroll(disable) - for (int J = 0; J < Iterations; J++) { - A[J] = B[J] + C[J]; - D[J]++; - E[J] *= 2; - F[J] /= 5; - } - return 0; -} - -#define loopNoReductionWithVecHint(vw, ic) \ - static int __attribute__((noinline)) \ - loopWithVW##vw##IC##ic(int Iterations) { \ - _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count( \ - ic))) for (int J = 0; J < Iterations; J++) { \ - A[J] = B[J] + C[J]; \ - } \ - return 0; \ - } - -#define bigLoopNoReductionWithVecHint(vw, ic) \ - static int __attribute__((noinline)) \ - bigLoopWithVW##vw##IC##ic(int Iterations) { \ - _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count( \ - ic))) for (int J = 0; J < Iterations; J++) { \ - A[J] = B[J] + C[J]; \ - D[J]++; \ - E[J] *= 2; \ - F[J] /= 5; \ - } \ - return 0; \ - } - -// Loops with Reduction with different vectorization configurations - -static int __attribute__((noinline)) loopWithReductionAutoVec(int Iterations) { - unsigned sum = 0; -#pragma clang loop unroll(disable) - for (int J = 0; J < Iterations; J++) { - sum += A[J]; - } - return sum; -} - -static int __attribute__((noinline)) -bigLoopWithReductionAutoVec(int Iterations) { - unsigned sum = 0; -#pragma clang loop unroll(disable) - for (int J = 0; J < Iterations; J++) { - sum += A[J]; - D[J]++; - E[J] *= 2; - F[J] /= 5; - } - return sum; -} - -#define loopWithReductionWithVecHint(vw, ic) \ - static int __attribute__((noinline)) \ - loopWithReductionWithVW##vw##IC##ic(int Iterations) { \ - unsigned sum = 0; \ - _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count( \ - ic))) for (int J = 0; J < Iterations; J++) { \ - sum += A[J]; \ - } \ - return sum; \ - } - -#define bigLoopWithReductionWithVecHint(vw, ic) \ - static int __attribute__((noinline)) \ - bigLoopWithReductionWithVW##vw##IC##ic(int Iterations) { \ - unsigned sum = 0; \ - _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count( \ - ic))) for (int J = 0; J < Iterations; J++) { \ - sum += A[J]; \ - D[J]++; \ - E[J] *= 2; \ - F[J] /= 5; \ - } \ - return sum; \ - } - -// We are evaluating 4 types of loops for different vectorization configurations -// 1) Loops without reductions -// 2) Loops with reductions -// 3) Bigger loop bodies without reductions -// 4) Bigger loop bodies with some reductions -// For each, we are evaluating the following vectorization configurations of -// vectorization width (VW), interleaving count (IC): -// 1) automatically selected by the compiler (without vectorization hint) -// 2) VW=4, IC=1 -// 3) VW=4, IC=2 -// 4) VW=4, IC=4 -// 5) VW=1, IC=1 -// 6) VW=1, IC=2 -// 7) VW=1, IC=4 -// Of these, configurations 5-7 are skipped for loop type 1 & 3). -// Creating a function for the above configurations with different Vectorization -// Hints: -loopNoReductionWithVecHint(4, 1); -loopNoReductionWithVecHint(4, 2); -loopNoReductionWithVecHint(4, 4); -loopWithReductionWithVecHint(4, 1); -loopWithReductionWithVecHint(4, 2); -loopWithReductionWithVecHint(4, 4); -loopWithReductionWithVecHint(1, 1); -loopWithReductionWithVecHint(1, 2); -loopWithReductionWithVecHint(1, 4); -bigLoopNoReductionWithVecHint(4, 1); -bigLoopNoReductionWithVecHint(4, 2); -bigLoopNoReductionWithVecHint(4, 4); -bigLoopWithReductionWithVecHint(4, 1); -bigLoopWithReductionWithVecHint(4, 2); -bigLoopWithReductionWithVecHint(4, 4); -bigLoopWithReductionWithVecHint(1, 1); -bigLoopWithReductionWithVecHint(1, 2); -bigLoopWithReductionWithVecHint(1, 4); - -#define ADD_BENCHMARK(Itr) \ - void benchAutoVecForLoopTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &loopNoReductionAutoVec, Itr); \ - } \ - BENCHMARK(benchAutoVecForLoopTC##Itr); \ - void benchForIC1VW4LoopTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &loopWithVW4IC1, Itr); \ - } \ - BENCHMARK(benchForIC1VW4LoopTC##Itr); \ - void benchForIC2VW4LoopTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &loopWithVW4IC2, Itr); \ - } \ - BENCHMARK(benchForIC2VW4LoopTC##Itr); \ - void benchForIC4VW4LoopTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &loopWithVW4IC4, Itr); \ - } \ - BENCHMARK(benchForIC4VW4LoopTC##Itr); \ - void benchForLoopWithReductionAutoVecTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &loopWithReductionAutoVec, Itr); \ - } \ - BENCHMARK(benchForLoopWithReductionAutoVecTC##Itr); \ - void benchForIC1VW4LoopWithReductionTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC1, Itr); \ - } \ - BENCHMARK(benchForIC1VW4LoopWithReductionTC##Itr); \ - void benchForIC2VW4LoopWithReductionTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC2, Itr); \ - } \ - BENCHMARK(benchForIC2VW4LoopWithReductionTC##Itr); \ - void benchForIC4VW4LoopWithReductionTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC4, Itr); \ - } \ - BENCHMARK(benchForIC4VW4LoopWithReductionTC##Itr); \ - void benchForIC1VW1LoopWithReductionTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC1, Itr); \ - } \ - BENCHMARK(benchForIC1VW1LoopWithReductionTC##Itr); \ - void benchForIC2VW1LoopWithReductionTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC2, Itr); \ - } \ - BENCHMARK(benchForIC2VW1LoopWithReductionTC##Itr); \ - void benchForIC4VW1LoopWithReductionTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC4, Itr); \ - } \ - BENCHMARK(benchForIC4VW1LoopWithReductionTC##Itr); \ - void benchAutoVecForBigLoopTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &loopNoReductionAutoVec, Itr); \ - } \ - BENCHMARK(benchAutoVecForBigLoopTC##Itr); \ - void benchForIC1VW4BigLoopTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &bigLoopWithVW4IC1, Itr); \ - } \ - BENCHMARK(benchForIC1VW4BigLoopTC##Itr); \ - void benchForIC2VW4BigLoopTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &bigLoopWithVW4IC2, Itr); \ - } \ - BENCHMARK(benchForIC2VW4BigLoopTC##Itr); \ - void benchForIC4VW4BigLoopTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &bigLoopWithVW4IC4, Itr); \ - } \ - BENCHMARK(benchForIC4VW4BigLoopTC##Itr); \ - void benchForBigLoopWithReductionAutoVecTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &bigLoopWithReductionAutoVec, Itr); \ - } \ - BENCHMARK(benchForBigLoopWithReductionAutoVecTC##Itr); \ - void benchForIC1VW4BigLoopWithReductionTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC1, Itr); \ - } \ - BENCHMARK(benchForIC1VW4BigLoopWithReductionTC##Itr); \ - void benchForIC2VW4BigLoopWithReductionTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC2, Itr); \ - } \ - BENCHMARK(benchForIC2VW4BigLoopWithReductionTC##Itr); \ - void benchForIC4VW4BigLoopWithReductionTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC4, Itr); \ - } \ - BENCHMARK(benchForIC4VW4BigLoopWithReductionTC##Itr); \ - void benchForIC1VW1BigLoopWithReductionTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC1, Itr); \ - } \ - BENCHMARK(benchForIC1VW1BigLoopWithReductionTC##Itr); \ - void benchForIC2VW1BigLoopWithReductionTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC2, Itr); \ - } \ - BENCHMARK(benchForIC2VW1BigLoopWithReductionTC##Itr); \ - void benchForIC4VW1BigLoopWithReductionTC##Itr(benchmark::State &state) { \ - runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC4, Itr); \ - } \ - BENCHMARK(benchForIC4VW1BigLoopWithReductionTC##Itr); - -ADD_BENCHMARK(1) -ADD_BENCHMARK(2) -ADD_BENCHMARK(3) -ADD_BENCHMARK(4) -ADD_BENCHMARK(5) -ADD_BENCHMARK(6) -ADD_BENCHMARK(7) -ADD_BENCHMARK(8) -ADD_BENCHMARK(9) -ADD_BENCHMARK(10) -ADD_BENCHMARK(11) -ADD_BENCHMARK(12) -ADD_BENCHMARK(13) -ADD_BENCHMARK(14) -ADD_BENCHMARK(15) -ADD_BENCHMARK(16) -ADD_BENCHMARK(17) -ADD_BENCHMARK(18) -ADD_BENCHMARK(19) -ADD_BENCHMARK(20) -ADD_BENCHMARK(21) -ADD_BENCHMARK(22) -ADD_BENCHMARK(23) -ADD_BENCHMARK(24) -ADD_BENCHMARK(25) -ADD_BENCHMARK(26) -ADD_BENCHMARK(27) -ADD_BENCHMARK(28) -ADD_BENCHMARK(29) -ADD_BENCHMARK(30) -ADD_BENCHMARK(31) -ADD_BENCHMARK(32) -ADD_BENCHMARK(33) -ADD_BENCHMARK(34) -ADD_BENCHMARK(35) -ADD_BENCHMARK(36) -ADD_BENCHMARK(37) -ADD_BENCHMARK(38) -ADD_BENCHMARK(39) -ADD_BENCHMARK(40) -ADD_BENCHMARK(41) -ADD_BENCHMARK(42) -ADD_BENCHMARK(43) -ADD_BENCHMARK(44) -ADD_BENCHMARK(45) -ADD_BENCHMARK(46) -ADD_BENCHMARK(47) -ADD_BENCHMARK(48) -ADD_BENCHMARK(49) -ADD_BENCHMARK(50) -ADD_BENCHMARK(51) -ADD_BENCHMARK(52) -ADD_BENCHMARK(53) -ADD_BENCHMARK(54) -ADD_BENCHMARK(55) -ADD_BENCHMARK(56) -ADD_BENCHMARK(57) -ADD_BENCHMARK(58) -ADD_BENCHMARK(59) -ADD_BENCHMARK(60) -ADD_BENCHMARK(61) -ADD_BENCHMARK(62) -ADD_BENCHMARK(63) -ADD_BENCHMARK(64) -ADD_BENCHMARK(65) -ADD_BENCHMARK(66) -ADD_BENCHMARK(67) -ADD_BENCHMARK(68) -ADD_BENCHMARK(69) -ADD_BENCHMARK(70) -ADD_BENCHMARK(71) -ADD_BENCHMARK(72) -ADD_BENCHMARK(73) -ADD_BENCHMARK(74) -ADD_BENCHMARK(75) -ADD_BENCHMARK(76) -ADD_BENCHMARK(77) -ADD_BENCHMARK(78) -ADD_BENCHMARK(79) -ADD_BENCHMARK(80) -ADD_BENCHMARK(81) -ADD_BENCHMARK(82) -ADD_BENCHMARK(83) -ADD_BENCHMARK(84) -ADD_BENCHMARK(85) -ADD_BENCHMARK(86) -ADD_BENCHMARK(87) -ADD_BENCHMARK(88) -ADD_BENCHMARK(89) -ADD_BENCHMARK(90) -ADD_BENCHMARK(91) -ADD_BENCHMARK(92) -ADD_BENCHMARK(93) -ADD_BENCHMARK(94) -ADD_BENCHMARK(95) -ADD_BENCHMARK(96) -ADD_BENCHMARK(97) -ADD_BENCHMARK(98) -ADD_BENCHMARK(99) -ADD_BENCHMARK(100) -ADD_BENCHMARK(101) -ADD_BENCHMARK(102) -ADD_BENCHMARK(103) -ADD_BENCHMARK(104) -ADD_BENCHMARK(105) -ADD_BENCHMARK(106) -ADD_BENCHMARK(107) -ADD_BENCHMARK(108) -ADD_BENCHMARK(109) -ADD_BENCHMARK(110) -ADD_BENCHMARK(111) -ADD_BENCHMARK(112) -ADD_BENCHMARK(113) -ADD_BENCHMARK(114) -ADD_BENCHMARK(115) -ADD_BENCHMARK(116) -ADD_BENCHMARK(117) -ADD_BENCHMARK(118) -ADD_BENCHMARK(119) -ADD_BENCHMARK(120) -ADD_BENCHMARK(121) -ADD_BENCHMARK(122) -ADD_BENCHMARK(123) -ADD_BENCHMARK(124) -ADD_BENCHMARK(125) -ADD_BENCHMARK(126) -ADD_BENCHMARK(127) -ADD_BENCHMARK(128)