From 0cdf078dedbf918b54d2087bc763acfe703bd6ce Mon Sep 17 00:00:00 2001
From: antoine moynault <antoine.moynault@gmail.com>
Date: Fri, 17 Nov 2023 11:14:18 +0100
Subject: [PATCH] Revert "[MicroBenchmark,LoopInterleaving] Check performance
 impact of Loop Interleaving Count with varying loop iterations (#26)"

This reverts commit eda2d6cf2414bcbde7c77f47a72db2a8a0f06ab6.
---
 .../LoopVectorization/CMakeLists.txt          |   9 -
 .../LoopVectorization/LoopInterleaving.cpp    | 403 ------------------
 2 files changed, 412 deletions(-)
 delete mode 100644 MicroBenchmarks/LoopVectorization/LoopInterleaving.cpp
diff --git a/MicroBenchmarks/LoopVectorization/CMakeLists.txt b/MicroBenchmarks/LoopVectorization/CMakeLists.txt
index 1c7a03eada..6bb5a2e3b1 100644
--- a/MicroBenchmarks/LoopVectorization/CMakeLists.txt
+++ b/MicroBenchmarks/LoopVectorization/CMakeLists.txt
@@ -17,12 +17,3 @@ llvm_test_executable(LoopVectorizationBenchmarks
 )
 
 target_link_libraries(LoopVectorizationBenchmarks benchmark)
-
-llvm_test_run()
-
-llvm_test_executable(LoopInterleavingBenchmarks
-  main.cpp
-  LoopInterleaving.cpp
-)
-
-target_link_libraries(LoopInterleavingBenchmarks benchmark)
diff --git a/MicroBenchmarks/LoopVectorization/LoopInterleaving.cpp b/MicroBenchmarks/LoopVectorization/LoopInterleaving.cpp
deleted file mode 100644
index 2e8111c9a3..0000000000
--- a/MicroBenchmarks/LoopVectorization/LoopInterleaving.cpp
+++ /dev/null
@@ -1,403 +0,0 @@
-// This program tests performance impact of Interleaving Count with varying loop
-// iteration count for different types of loops, such as loops with or
-// without reductions inside it, loops with different vectorization widths.
-#include <iostream>
-#include <memory>
-#include <random>
-
-#include "benchmark/benchmark.h"
-
-#define ELEMENTS 2048
-#define ALIGNED16 __attribute__((aligned(16)))
-
-static std::mt19937 rng;
-unsigned int g_sum = 0;
-
-int A[ELEMENTS] ALIGNED16;
-int B[ELEMENTS] ALIGNED16;
-int C[ELEMENTS] ALIGNED16;
-int D[ELEMENTS] ALIGNED16;
-int E[ELEMENTS] ALIGNED16;
-int F[ELEMENTS] ALIGNED16;
-
-// Initialize arrays with random numbers.
-static void init_data(unsigned N) {
-  std::uniform_int_distribution<int> distrib(std::numeric_limits<int>::min(),
-                                             std::numeric_limits<int>::max());
-  for (unsigned I = 0; I < N; I++) {
-    A[I] = distrib(rng);
-    B[I] = distrib(rng);
-    C[I] = distrib(rng);
-    D[I] = distrib(rng);
-    E[I] = distrib(rng);
-    F[I] = distrib(rng);
-  }
-}
-
-static void __attribute__((always_inline))
-runBenchForLoopInterleaving(benchmark::State &state, int (*Fn)(int),
-                            int Iterations) {
-  std::uniform_int_distribution<int> distrib(std::numeric_limits<int>::min(),
-                                             std::numeric_limits<int>::max());
-  init_data(ELEMENTS);
-  for (auto _ : state) {
-    benchmark::DoNotOptimize(A);
-    benchmark::DoNotOptimize(B);
-    benchmark::DoNotOptimize(C);
-    benchmark::DoNotOptimize(D);
-    benchmark::DoNotOptimize(E);
-    benchmark::DoNotOptimize(F);
-    benchmark::ClobberMemory();
-    g_sum += Fn(Iterations);
-  }
-}
-
-#define STRINGIFY(a) #a
-
-// Loops without Reduction with different vectorization configurations
-
-static int __attribute__((noinline)) loopNoReductionAutoVec(int Iterations) {
-#pragma clang loop unroll(disable)
-  for (int J = 0; J < Iterations; J++) {
-    A[J] = B[J] + C[J];
-  }
-  return 0;
-}
-
-static int __attribute__((noinline)) bigLoopNoReductionAutoVec(int Iterations) {
-#pragma clang loop unroll(disable)
-  for (int J = 0; J < Iterations; J++) {
-    A[J] = B[J] + C[J];
-    D[J]++;
-    E[J] *= 2;
-    F[J] /= 5;
-  }
-  return 0;
-}
-
-#define loopNoReductionWithVecHint(vw, ic)                                     \
-  static int __attribute__((noinline))                                         \
-  loopWithVW##vw##IC##ic(int Iterations) {                                     \
-    _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count(         \
-        ic))) for (int J = 0; J < Iterations; J++) {                           \
-      A[J] = B[J] + C[J];                                                      \
-    }                                                                          \
-    return 0;                                                                  \
-  }
-
-#define bigLoopNoReductionWithVecHint(vw, ic)                                  \
-  static int __attribute__((noinline))                                         \
-  bigLoopWithVW##vw##IC##ic(int Iterations) {                                  \
-    _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count(         \
-        ic))) for (int J = 0; J < Iterations; J++) {                           \
-      A[J] = B[J] + C[J];                                                      \
-      D[J]++;                                                                  \
-      E[J] *= 2;                                                               \
-      F[J] /= 5;                                                               \
-    }                                                                          \
-    return 0;                                                                  \
-  }
-
-// Loops with Reduction with different vectorization configurations
-
-static int __attribute__((noinline)) loopWithReductionAutoVec(int Iterations) {
-  unsigned sum = 0;
-#pragma clang loop unroll(disable)
-  for (int J = 0; J < Iterations; J++) {
-    sum += A[J];
-  }
-  return sum;
-}
-
-static int __attribute__((noinline))
-bigLoopWithReductionAutoVec(int Iterations) {
-  unsigned sum = 0;
-#pragma clang loop unroll(disable)
-  for (int J = 0; J < Iterations; J++) {
-    sum += A[J];
-    D[J]++;
-    E[J] *= 2;
-    F[J] /= 5;
-  }
-  return sum;
-}
-
-#define loopWithReductionWithVecHint(vw, ic)                                   \
-  static int __attribute__((noinline))                                         \
-  loopWithReductionWithVW##vw##IC##ic(int Iterations) {                        \
-    unsigned sum = 0;                                                          \
-    _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count(         \
-        ic))) for (int J = 0; J < Iterations; J++) {                           \
-      sum += A[J];                                                             \
-    }                                                                          \
-    return sum;                                                                \
-  }
-
-#define bigLoopWithReductionWithVecHint(vw, ic)                                \
-  static int __attribute__((noinline))                                         \
-  bigLoopWithReductionWithVW##vw##IC##ic(int Iterations) {                     \
-    unsigned sum = 0;                                                          \
-    _Pragma(STRINGIFY(clang loop vectorize_width(vw) interleave_count(         \
-        ic))) for (int J = 0; J < Iterations; J++) {                           \
-      sum += A[J];                                                             \
-      D[J]++;                                                                  \
-      E[J] *= 2;                                                               \
-      F[J] /= 5;                                                               \
-    }                                                                          \
-    return sum;                                                                \
-  }
-
-// We are evaluating 4 types of loops for different vectorization configurations
-// 1) Loops without reductions
-// 2) Loops with reductions
-// 3) Bigger loop bodies without reductions
-// 4) Bigger loop bodies with some reductions
-// For each, we are evaluating the following vectorization configurations of
-// vectorization width (VW), interleaving count (IC):
-// 1) automatically selected by the compiler (without vectorization hint)
-// 2) VW=4, IC=1
-// 3) VW=4, IC=2
-// 4) VW=4, IC=4
-// 5) VW=1, IC=1
-// 6) VW=1, IC=2
-// 7) VW=1, IC=4
-// Of these, configurations 5-7 are skipped for loop type 1 & 3).
-// Creating a function for the above configurations with different Vectorization
-// Hints:
-loopNoReductionWithVecHint(4, 1);
-loopNoReductionWithVecHint(4, 2);
-loopNoReductionWithVecHint(4, 4);
-loopWithReductionWithVecHint(4, 1);
-loopWithReductionWithVecHint(4, 2);
-loopWithReductionWithVecHint(4, 4);
-loopWithReductionWithVecHint(1, 1);
-loopWithReductionWithVecHint(1, 2);
-loopWithReductionWithVecHint(1, 4);
-bigLoopNoReductionWithVecHint(4, 1);
-bigLoopNoReductionWithVecHint(4, 2);
-bigLoopNoReductionWithVecHint(4, 4);
-bigLoopWithReductionWithVecHint(4, 1);
-bigLoopWithReductionWithVecHint(4, 2);
-bigLoopWithReductionWithVecHint(4, 4);
-bigLoopWithReductionWithVecHint(1, 1);
-bigLoopWithReductionWithVecHint(1, 2);
-bigLoopWithReductionWithVecHint(1, 4);
-
-#define ADD_BENCHMARK(Itr)                                                     \
-  void benchAutoVecForLoopTC##Itr(benchmark::State &state) {                   \
-    runBenchForLoopInterleaving(state, &loopNoReductionAutoVec, Itr);          \
-  }                                                                            \
-  BENCHMARK(benchAutoVecForLoopTC##Itr);                                       \
-  void benchForIC1VW4LoopTC##Itr(benchmark::State &state) {                    \
-    runBenchForLoopInterleaving(state, &loopWithVW4IC1, Itr);                  \
-  }                                                                            \
-  BENCHMARK(benchForIC1VW4LoopTC##Itr);                                        \
-  void benchForIC2VW4LoopTC##Itr(benchmark::State &state) {                    \
-    runBenchForLoopInterleaving(state, &loopWithVW4IC2, Itr);                  \
-  }                                                                            \
-  BENCHMARK(benchForIC2VW4LoopTC##Itr);                                        \
-  void benchForIC4VW4LoopTC##Itr(benchmark::State &state) {                    \
-    runBenchForLoopInterleaving(state, &loopWithVW4IC4, Itr);                  \
-  }                                                                            \
-  BENCHMARK(benchForIC4VW4LoopTC##Itr);                                        \
-  void benchForLoopWithReductionAutoVecTC##Itr(benchmark::State &state) {      \
-    runBenchForLoopInterleaving(state, &loopWithReductionAutoVec, Itr);        \
-  }                                                                            \
-  BENCHMARK(benchForLoopWithReductionAutoVecTC##Itr);                          \
-  void benchForIC1VW4LoopWithReductionTC##Itr(benchmark::State &state) {       \
-    runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC1, Itr);     \
-  }                                                                            \
-  BENCHMARK(benchForIC1VW4LoopWithReductionTC##Itr);                           \
-  void benchForIC2VW4LoopWithReductionTC##Itr(benchmark::State &state) {       \
-    runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC2, Itr);     \
-  }                                                                            \
-  BENCHMARK(benchForIC2VW4LoopWithReductionTC##Itr);                           \
-  void benchForIC4VW4LoopWithReductionTC##Itr(benchmark::State &state) {       \
-    runBenchForLoopInterleaving(state, &loopWithReductionWithVW4IC4, Itr);     \
-  }                                                                            \
-  BENCHMARK(benchForIC4VW4LoopWithReductionTC##Itr);                           \
-  void benchForIC1VW1LoopWithReductionTC##Itr(benchmark::State &state) {       \
-    runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC1, Itr);     \
-  }                                                                            \
-  BENCHMARK(benchForIC1VW1LoopWithReductionTC##Itr);                           \
-  void benchForIC2VW1LoopWithReductionTC##Itr(benchmark::State &state) {       \
-    runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC2, Itr);     \
-  }                                                                            \
-  BENCHMARK(benchForIC2VW1LoopWithReductionTC##Itr);                           \
-  void benchForIC4VW1LoopWithReductionTC##Itr(benchmark::State &state) {       \
-    runBenchForLoopInterleaving(state, &loopWithReductionWithVW1IC4, Itr);     \
-  }                                                                            \
-  BENCHMARK(benchForIC4VW1LoopWithReductionTC##Itr);                           \
-  void benchAutoVecForBigLoopTC##Itr(benchmark::State &state) {                \
-    runBenchForLoopInterleaving(state, &loopNoReductionAutoVec, Itr);          \
-  }                                                                            \
-  BENCHMARK(benchAutoVecForBigLoopTC##Itr);                                    \
-  void benchForIC1VW4BigLoopTC##Itr(benchmark::State &state) {                 \
-    runBenchForLoopInterleaving(state, &bigLoopWithVW4IC1, Itr);               \
-  }                                                                            \
-  BENCHMARK(benchForIC1VW4BigLoopTC##Itr);                                     \
-  void benchForIC2VW4BigLoopTC##Itr(benchmark::State &state) {                 \
-    runBenchForLoopInterleaving(state, &bigLoopWithVW4IC2, Itr);               \
-  }                                                                            \
-  BENCHMARK(benchForIC2VW4BigLoopTC##Itr);                                     \
-  void benchForIC4VW4BigLoopTC##Itr(benchmark::State &state) {                 \
-    runBenchForLoopInterleaving(state, &bigLoopWithVW4IC4, Itr);               \
-  }                                                                            \
-  BENCHMARK(benchForIC4VW4BigLoopTC##Itr);                                     \
-  void benchForBigLoopWithReductionAutoVecTC##Itr(benchmark::State &state) {   \
-    runBenchForLoopInterleaving(state, &bigLoopWithReductionAutoVec, Itr);     \
-  }                                                                            \
-  BENCHMARK(benchForBigLoopWithReductionAutoVecTC##Itr);                       \
-  void benchForIC1VW4BigLoopWithReductionTC##Itr(benchmark::State &state) {    \
-    runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC1, Itr);  \
-  }                                                                            \
-  BENCHMARK(benchForIC1VW4BigLoopWithReductionTC##Itr);                        \
-  void benchForIC2VW4BigLoopWithReductionTC##Itr(benchmark::State &state) {    \
-    runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC2, Itr);  \
-  }                                                                            \
-  BENCHMARK(benchForIC2VW4BigLoopWithReductionTC##Itr);                        \
-  void benchForIC4VW4BigLoopWithReductionTC##Itr(benchmark::State &state) {    \
-    runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW4IC4, Itr);  \
-  }                                                                            \
-  BENCHMARK(benchForIC4VW4BigLoopWithReductionTC##Itr);                        \
-  void benchForIC1VW1BigLoopWithReductionTC##Itr(benchmark::State &state) {    \
-    runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC1, Itr);  \
-  }                                                                            \
-  BENCHMARK(benchForIC1VW1BigLoopWithReductionTC##Itr);                        \
-  void benchForIC2VW1BigLoopWithReductionTC##Itr(benchmark::State &state) {    \
-    runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC2, Itr);  \
-  }                                                                            \
-  BENCHMARK(benchForIC2VW1BigLoopWithReductionTC##Itr);                        \
-  void benchForIC4VW1BigLoopWithReductionTC##Itr(benchmark::State &state) {    \
-    runBenchForLoopInterleaving(state, &bigLoopWithReductionWithVW1IC4, Itr);  \
-  }                                                                            \
-  BENCHMARK(benchForIC4VW1BigLoopWithReductionTC##Itr);
-
-ADD_BENCHMARK(1)
-ADD_BENCHMARK(2)
-ADD_BENCHMARK(3)
-ADD_BENCHMARK(4)
-ADD_BENCHMARK(5)
-ADD_BENCHMARK(6)
-ADD_BENCHMARK(7)
-ADD_BENCHMARK(8)
-ADD_BENCHMARK(9)
-ADD_BENCHMARK(10)
-ADD_BENCHMARK(11)
-ADD_BENCHMARK(12)
-ADD_BENCHMARK(13)
-ADD_BENCHMARK(14)
-ADD_BENCHMARK(15)
-ADD_BENCHMARK(16)
-ADD_BENCHMARK(17)
-ADD_BENCHMARK(18)
-ADD_BENCHMARK(19)
-ADD_BENCHMARK(20)
-ADD_BENCHMARK(21)
-ADD_BENCHMARK(22)
-ADD_BENCHMARK(23)
-ADD_BENCHMARK(24)
-ADD_BENCHMARK(25)
-ADD_BENCHMARK(26)
-ADD_BENCHMARK(27)
-ADD_BENCHMARK(28)
-ADD_BENCHMARK(29)
-ADD_BENCHMARK(30)
-ADD_BENCHMARK(31)
-ADD_BENCHMARK(32)
-ADD_BENCHMARK(33)
-ADD_BENCHMARK(34)
-ADD_BENCHMARK(35)
-ADD_BENCHMARK(36)
-ADD_BENCHMARK(37)
-ADD_BENCHMARK(38)
-ADD_BENCHMARK(39)
-ADD_BENCHMARK(40)
-ADD_BENCHMARK(41)
-ADD_BENCHMARK(42)
-ADD_BENCHMARK(43)
-ADD_BENCHMARK(44)
-ADD_BENCHMARK(45)
-ADD_BENCHMARK(46)
-ADD_BENCHMARK(47)
-ADD_BENCHMARK(48)
-ADD_BENCHMARK(49)
-ADD_BENCHMARK(50)
-ADD_BENCHMARK(51)
-ADD_BENCHMARK(52)
-ADD_BENCHMARK(53)
-ADD_BENCHMARK(54)
-ADD_BENCHMARK(55)
-ADD_BENCHMARK(56)
-ADD_BENCHMARK(57)
-ADD_BENCHMARK(58)
-ADD_BENCHMARK(59)
-ADD_BENCHMARK(60)
-ADD_BENCHMARK(61)
-ADD_BENCHMARK(62)
-ADD_BENCHMARK(63)
-ADD_BENCHMARK(64)
-ADD_BENCHMARK(65)
-ADD_BENCHMARK(66)
-ADD_BENCHMARK(67)
-ADD_BENCHMARK(68)
-ADD_BENCHMARK(69)
-ADD_BENCHMARK(70)
-ADD_BENCHMARK(71)
-ADD_BENCHMARK(72)
-ADD_BENCHMARK(73)
-ADD_BENCHMARK(74)
-ADD_BENCHMARK(75)
-ADD_BENCHMARK(76)
-ADD_BENCHMARK(77)
-ADD_BENCHMARK(78)
-ADD_BENCHMARK(79)
-ADD_BENCHMARK(80)
-ADD_BENCHMARK(81)
-ADD_BENCHMARK(82)
-ADD_BENCHMARK(83)
-ADD_BENCHMARK(84)
-ADD_BENCHMARK(85)
-ADD_BENCHMARK(86)
-ADD_BENCHMARK(87)
-ADD_BENCHMARK(88)
-ADD_BENCHMARK(89)
-ADD_BENCHMARK(90)
-ADD_BENCHMARK(91)
-ADD_BENCHMARK(92)
-ADD_BENCHMARK(93)
-ADD_BENCHMARK(94)
-ADD_BENCHMARK(95)
-ADD_BENCHMARK(96)
-ADD_BENCHMARK(97)
-ADD_BENCHMARK(98)
-ADD_BENCHMARK(99)
-ADD_BENCHMARK(100)
-ADD_BENCHMARK(101)
-ADD_BENCHMARK(102)
-ADD_BENCHMARK(103)
-ADD_BENCHMARK(104)
-ADD_BENCHMARK(105)
-ADD_BENCHMARK(106)
-ADD_BENCHMARK(107)
-ADD_BENCHMARK(108)
-ADD_BENCHMARK(109)
-ADD_BENCHMARK(110)
-ADD_BENCHMARK(111)
-ADD_BENCHMARK(112)
-ADD_BENCHMARK(113)
-ADD_BENCHMARK(114)
-ADD_BENCHMARK(115)
-ADD_BENCHMARK(116)
-ADD_BENCHMARK(117)
-ADD_BENCHMARK(118)
-ADD_BENCHMARK(119)
-ADD_BENCHMARK(120)
-ADD_BENCHMARK(121)
-ADD_BENCHMARK(122)
-ADD_BENCHMARK(123)
-ADD_BENCHMARK(124)
-ADD_BENCHMARK(125)
-ADD_BENCHMARK(126)
-ADD_BENCHMARK(127)
-ADD_BENCHMARK(128)