-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[LoopInterchange] Add metadata to control loop-interchange #127474
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-transforms Author: Ryotaro Kasuga (kasuga-fj) ChangesThis patch adds metadata to enable/disable the loop-interchange for a loop nest. This is a prelude to introduce a new pragma directive for loop-interchange, like other loop optimizations (unroll, vectorize, distribute, etc.) have. Patch is 32.06 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127474.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index f45d90ff13e14..d2f4ede6176ac 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -67,8 +67,6 @@ static cl::opt<unsigned int> MaxMemInstrCount(
namespace {
-using LoopVector = SmallVector<Loop *, 8>;
-
// TODO: Check if we can use a sparse matrix here.
using CharMatrix = std::vector<std::vector<char>>;
@@ -84,6 +82,14 @@ static cl::opt<unsigned int> MaxLoopNestDepth(
"loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden,
cl::desc("Maximum depth of loop nest considered for the transform"));
+// Whether to apply by default.
+// TODO: Once this pass is enabled by default, remove this option and use the
+// value of PipelineTuningOptions.
+static cl::opt<bool> OnlyWhenForced(
+ "loop-interchange-only-when-forced", cl::init(false), cl::ReallyHidden,
+ cl::desc(
+ "Apply interchanges only when explicitly specified metadata exists"));
+
#ifndef NDEBUG
static void printDepMatrix(CharMatrix &DepMatrix) {
for (auto &Row : DepMatrix) {
@@ -233,7 +239,7 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
return true;
}
-static void populateWorklist(Loop &L, LoopVector &LoopList) {
+static void populateWorklist(Loop &L, SmallVectorImpl<Loop *> &LoopList) {
LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
<< L.getHeader()->getParent()->getName() << " Loop: %"
<< L.getHeader()->getName() << '\n');
@@ -245,7 +251,7 @@ static void populateWorklist(Loop &L, LoopVector &LoopList) {
// nested.
// Discard all loops above it added into Worklist.
if (Vec->size() != 1) {
- LoopList = {};
+ LoopList.clear();
return;
}
@@ -256,27 +262,6 @@ static void populateWorklist(Loop &L, LoopVector &LoopList) {
LoopList.push_back(CurrentLoop);
}
-static bool hasSupportedLoopDepth(SmallVectorImpl<Loop *> &LoopList,
- OptimizationRemarkEmitter &ORE) {
- unsigned LoopNestDepth = LoopList.size();
- if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
- LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
- << ", the supported range is [" << MinLoopNestDepth
- << ", " << MaxLoopNestDepth << "].\n");
- Loop **OuterLoop = LoopList.begin();
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
- (*OuterLoop)->getStartLoc(),
- (*OuterLoop)->getHeader())
- << "Unsupported depth of loop nest, the supported range is ["
- << std::to_string(MinLoopNestDepth) << ", "
- << std::to_string(MaxLoopNestDepth) << "].\n";
- });
- return false;
- }
- return true;
-}
-
static bool isComputableLoopNest(ScalarEvolution *SE,
ArrayRef<Loop *> LoopList) {
for (Loop *L : LoopList) {
@@ -299,6 +284,26 @@ static bool isComputableLoopNest(ScalarEvolution *SE,
namespace {
+/// LoopInterchangeList manages the list of loops and the range to which the
+/// interchange may be applied.
+struct LoopInterchangeList {
+ SmallVector<Loop *, 8> LoopList;
+ unsigned ListBegin = 0;
+ unsigned ListEnd = 0;
+
+ LoopInterchangeList(LoopNest &LN)
+ : LoopList(LN.getLoops()), ListBegin(0), ListEnd(LoopList.size()) {}
+
+ LoopInterchangeList(Loop &L) {
+ populateWorklist(L, LoopList);
+ ListBegin = 0;
+ ListEnd = LoopList.size();
+ }
+
+ void checkMetadata(bool OnlyWhenForced);
+ bool hasSupportedLoopDepth(OptimizationRemarkEmitter &ORE);
+};
+
/// LoopInterchangeLegality checks if it is legal to interchange the loop.
class LoopInterchangeLegality {
public:
@@ -439,39 +444,38 @@ struct LoopInterchange {
bool run(Loop *L) {
if (L->getParentLoop())
return false;
- SmallVector<Loop *, 8> LoopList;
- populateWorklist(*L, LoopList);
- return processLoopList(LoopList);
+ LoopInterchangeList LIL(*L);
+ return processLoopList(LIL);
}
- bool run(LoopNest &LN) {
- SmallVector<Loop *, 8> LoopList(LN.getLoops());
+ bool run(LoopInterchangeList &LIL) {
+ const auto &LoopList = LIL.LoopList;
for (unsigned I = 1; I < LoopList.size(); ++I)
if (LoopList[I]->getParentLoop() != LoopList[I - 1])
return false;
- return processLoopList(LoopList);
+ return processLoopList(LIL);
}
- unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
+ unsigned selectLoopForInterchange(LoopInterchangeList &LIL) {
// TODO: Add a better heuristic to select the loop to be interchanged based
// on the dependence matrix. Currently we select the innermost loop.
- return LoopList.size() - 1;
+ return LIL.ListEnd - 1;
}
- bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
+ bool processLoopList(LoopInterchangeList &LIL) {
bool Changed = false;
// Ensure proper loop nest depth.
- assert(hasSupportedLoopDepth(LoopList, *ORE) &&
+ assert(LIL.hasSupportedLoopDepth(*ORE) &&
"Unsupported depth of loop nest.");
- unsigned LoopNestDepth = LoopList.size();
+ unsigned LoopNestDepth = LIL.LoopList.size();
LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
<< "\n");
CharMatrix DependencyMatrix;
- Loop *OuterMostLoop = *(LoopList.begin());
+ Loop *OuterMostLoop = *(LIL.LoopList.begin());
if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
OuterMostLoop, DI, SE, ORE)) {
LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
@@ -488,7 +492,7 @@ struct LoopInterchange {
return false;
}
- unsigned SelecLoopId = selectLoopForInterchange(LoopList);
+ unsigned SelectLoopId = selectLoopForInterchange(LIL);
// Obtain the loop vector returned from loop cache analysis beforehand,
// and put each <Loop, index> pair into a map for constant time query
// later. Indices in loop vector reprsent the optimal order of the
@@ -504,19 +508,20 @@ struct LoopInterchange {
CostMap[LoopCosts[i].first] = i;
}
}
+
// We try to achieve the globally optimal memory access for the loopnest,
// and do interchange based on a bubble-sort fasion. We start from
// the innermost loop, move it outwards to the best possible position
// and repeat this process.
- for (unsigned j = SelecLoopId; j > 0; j--) {
+ for (unsigned j = LIL.ListEnd - LIL.ListBegin - 1; j > 0; j--) {
bool ChangedPerIter = false;
- for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
- bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1,
- DependencyMatrix, CostMap);
+ for (unsigned i = SelectLoopId; i > SelectLoopId - j; i--) {
+ bool Interchanged = processLoop(LIL.LoopList[i], LIL.LoopList[i - 1], i,
+ i - 1, DependencyMatrix, CostMap);
if (!Interchanged)
continue;
// Loops interchanged, update LoopList accordingly.
- std::swap(LoopList[i - 1], LoopList[i]);
+ std::swap(LIL.LoopList[i - 1], LIL.LoopList[i]);
// Update the DependencyMatrix
interChangeDependencies(DependencyMatrix, i, i - 1);
@@ -526,6 +531,7 @@ struct LoopInterchange {
ChangedPerIter |= Interchanged;
Changed |= Interchanged;
}
+
// Early abort if there was no interchange during an entire round of
// moving loops outwards.
if (!ChangedPerIter)
@@ -572,6 +578,69 @@ struct LoopInterchange {
} // end anonymous namespace
+bool LoopInterchangeList::hasSupportedLoopDepth(
+ OptimizationRemarkEmitter &ORE) {
+ unsigned LoopNestDepth = ListEnd - ListBegin;
+ if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
+ LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
+ << ", the supported range is [" << MinLoopNestDepth
+ << ", " << MaxLoopNestDepth << "].\n");
+ Loop *OuterLoop = LoopList[ListBegin];
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Unsupported depth of loop nest, the supported range is ["
+ << std::to_string(MinLoopNestDepth) << ", "
+ << std::to_string(MaxLoopNestDepth) << "].\n";
+ });
+ return false;
+ }
+ return true;
+}
+
+// Check the metadata for interchange. The outermost one is taken into account
+// and nested ones are ignored. The metadata affects the entire loop nest such
+// that the outermost loop is the loop for which the metadata is specified. For
+// example, in the following example, the loop-interchange will be performed
+// only to the outermost two loops.
+//
+// for (...)
+// for (...)
+// #pragma clang loop interchange(disable)
+// for (...)
+// for (...)
+// for (...)
+// Stmt
+//
+void LoopInterchangeList::checkMetadata(bool OnlyWhenForced) {
+ ListBegin = 0;
+ ListEnd = LoopList.size();
+
+ for (unsigned I = 0; I != LoopList.size(); I++) {
+ Loop *L = LoopList[I];
+ auto Value = findStringMetadataForLoop(L, "llvm.loop.interchange.enable");
+ if (!Value)
+ continue;
+
+ const MDOperand *Op = *Value;
+ assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
+ bool Enabled = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
+ if (Enabled && OnlyWhenForced) {
+ ListBegin = I;
+ } else if (!Enabled && !OnlyWhenForced) {
+ ListEnd = I;
+ } else if (OnlyWhenForced) {
+ ListEnd = 0;
+ }
+ break;
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "LoopInterchange will be applied to the range: [from, to]=["
+ << ListBegin << ", " << ListEnd - 1 << "]\n";);
+}
+
bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
return any_of(*BB, [](const Instruction &I) {
return I.mayHaveSideEffects() || I.mayReadFromMemory();
@@ -1748,7 +1817,7 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
LoopStandardAnalysisResults &AR,
LPMUpdater &U) {
Function &F = *LN.getParent();
- SmallVector<Loop *, 8> LoopList(LN.getLoops());
+ LoopInterchangeList LIL(LN);
if (MaxMemInstrCount < 1) {
LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1");
@@ -1757,14 +1826,19 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
OptimizationRemarkEmitter ORE(&F);
// Ensure minimum depth of the loop nest to do the interchange.
- if (!hasSupportedLoopDepth(LoopList, ORE))
+ if (!LIL.hasSupportedLoopDepth(ORE))
return PreservedAnalyses::all();
// Ensure computable loop nest.
- if (!isComputableLoopNest(&AR.SE, LoopList)) {
+ if (!isComputableLoopNest(&AR.SE, LIL.LoopList)) {
LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
return PreservedAnalyses::all();
}
+ LIL.checkMetadata(OnlyWhenForced);
+ // Ensure the depth again.
+ if (!LIL.hasSupportedLoopDepth(ORE))
+ return PreservedAnalyses::all();
+
ORE.emit([&]() {
return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence",
LN.getOutermostLoop().getStartLoc(),
@@ -1776,7 +1850,7 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
std::unique_ptr<CacheCost> CC =
CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
- if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
+ if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LIL))
return PreservedAnalyses::all();
U.markLoopNestChanged(true);
return getLoopPassPreservedAnalyses();
diff --git a/llvm/test/Transforms/LoopInterchange/metadata.ll b/llvm/test/Transforms/LoopInterchange/metadata.ll
new file mode 100644
index 0000000000000..9838abb905a7e
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/metadata.ll
@@ -0,0 +1,325 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-interchange -loop-interchange-only-when-forced=0 --cache-line-size=64 -S < %s | FileCheck %s --check-prefix=DEFAULT-ON
+; RUN: opt -passes=loop-interchange -loop-interchange-only-when-forced=1 --cache-line-size=64 -S < %s | FileCheck %s --check-prefix=DEFAULT-OFF
+
+; Test if the metadata works correctly. The code is as follows:
+;
+; #define N 4
+; int a[N][N][N][N];
+; int b[N][N][N][N];
+; void f() {
+; for (int i = 0; i < N; i++)
+; for (int j = 0; j < N; j++)
+; #pragma clang loop interchange(enable or disable)
+; for (int k = 0; k < N; k++)
+; for (int l = 0; l < N; l++)
+; a[l][k][j][i] += b[l][k][j][i];
+; }
+;
+; In the functions explicit_on and explicit_off, the values enable and disable
+; are specified in the pragma, respectively. If the
+; `loop-interchange-only-when-forced` is set to 0, the loop-interchange will be
+; performed to the loop nest unless it is explicitly disabled. If the value is
+; set to 1, the loop-interchange will be performed to the loop nest only when
+; it is explicitly enabled.
+
+@a = dso_local local_unnamed_addr global [2 x [2 x [2 x [2 x i32]]]] zeroinitializer, align 4
+@b = dso_local local_unnamed_addr global [2 x [2 x [2 x [2 x i32]]]] zeroinitializer, align 4
+
+define void @explicit_on() {
+; DEFAULT-ON-LABEL: define void @explicit_on() {
+; DEFAULT-ON-NEXT: [[ENTRY:.*:]]
+; DEFAULT-ON-NEXT: br label %[[FOR_BODY12_PREHEADER:.*]]
+; DEFAULT-ON: [[FOR_COND1_PREHEADER_PREHEADER:.*]]:
+; DEFAULT-ON-NEXT: br label %[[FOR_COND1_PREHEADER:.*]]
+; DEFAULT-ON: [[FOR_COND1_PREHEADER]]:
+; DEFAULT-ON-NEXT: [[INDVARS_IV61:%.*]] = phi i64 [ [[INDVARS_IV_NEXT62:%.*]], %[[FOR_COND_CLEANUP3:.*]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; DEFAULT-ON-NEXT: br label %[[FOR_BODY12_SPLIT1:.*]]
+; DEFAULT-ON: [[FOR_COND5_PREHEADER_PREHEADER:.*]]:
+; DEFAULT-ON-NEXT: br label %[[FOR_COND5_PREHEADER:.*]]
+; DEFAULT-ON: [[FOR_COND_CLEANUP3]]:
+; DEFAULT-ON-NEXT: [[INDVARS_IV_NEXT62]] = add nuw nsw i64 [[INDVARS_IV61]], 1
+; DEFAULT-ON-NEXT: [[EXITCOND64:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT62]], 2
+; DEFAULT-ON-NEXT: br i1 [[EXITCOND64]], label %[[FOR_COND1_PREHEADER]], label %[[FOR_COND_CLEANUP7_SPLIT:.*]]
+; DEFAULT-ON: [[FOR_COND_CLEANUP7:.*]]:
+; DEFAULT-ON-NEXT: [[INDVARS_IV_NEXT58:%.*]] = add nuw nsw i64 [[INDVARS_IV57:%.*]], 1
+; DEFAULT-ON-NEXT: [[EXITCOND60:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT58]], 2
+; DEFAULT-ON-NEXT: br label %[[FOR_COND_CLEANUP3]]
+; DEFAULT-ON: [[FOR_COND_CLEANUP7_SPLIT]]:
+; DEFAULT-ON-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[INDVARS_IV57]], 1
+; DEFAULT-ON-NEXT: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 2
+; DEFAULT-ON-NEXT: br i1 [[TMP1]], label %[[FOR_COND5_PREHEADER]], label %[[FOR_COND_CLEANUP11_SPLIT:.*]]
+; DEFAULT-ON: [[FOR_COND_CLEANUP11:.*]]:
+; DEFAULT-ON-NEXT: [[INDVARS_IV_NEXT54:%.*]] = add nuw nsw i64 [[INDVARS_IV53:%.*]], 1
+; DEFAULT-ON-NEXT: [[EXITCOND56:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT54]], 2
+; DEFAULT-ON-NEXT: br label %[[FOR_COND_CLEANUP7]]
+; DEFAULT-ON: [[FOR_COND_CLEANUP11_SPLIT]]:
+; DEFAULT-ON-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[INDVARS_IV53]], 1
+; DEFAULT-ON-NEXT: [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 2
+; DEFAULT-ON-NEXT: br i1 [[TMP3]], label %[[FOR_COND9_PREHEADER:.*]], label %[[FOR_BODY12_SPLIT:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT-ON: [[FOR_BODY12:.*]]:
+; DEFAULT-ON-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP6:%.*]], %[[FOR_BODY12_SPLIT]] ], [ 0, %[[FOR_BODY12_PREHEADER]] ]
+; DEFAULT-ON-NEXT: br label %[[FOR_COND9_PREHEADER_PREHEADER:.*]]
+; DEFAULT-ON: [[FOR_BODY12_SPLIT1]]:
+; DEFAULT-ON-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @b, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
+; DEFAULT-ON-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX18]], align 4
+; DEFAULT-ON-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @a, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
+; DEFAULT-ON-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4
+; DEFAULT-ON-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP4]]
+; DEFAULT-ON-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX26]], align 4
+; DEFAULT-ON-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DEFAULT-ON-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 2
+; DEFAULT-ON-NEXT: br label %[[FOR_COND_CLEANUP11]]
+; DEFAULT-ON: [[FOR_BODY12_SPLIT]]:
+; DEFAULT-ON-NEXT: [[TMP6]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; DEFAULT-ON-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 2
+; DEFAULT-ON-NEXT: br i1 [[TMP7]], label %[[FOR_BODY12]], label %[[FOR_COND_CLEANUP:.*]]
+; DEFAULT-ON: [[FOR_COND9_PREHEADER]]:
+; DEFAULT-ON-NEXT: [[INDVARS_IV53]] = phi i64 [ [[TMP2]], %[[FOR_COND_CLEANUP11_SPLIT]] ], [ 0, %[[FOR_COND9_PREHEADER_PREHEADER]] ]
+; DEFAULT-ON-NEXT: br label %[[FOR_COND5_PREHEADER_PREHEADER]]
+; DEFAULT-ON: [[FOR_BODY12_PREHEADER]]:
+; DEFAULT-ON-NEXT: br label %[[FOR_BODY12]]
+; DEFAULT-ON: [[FOR_COND5_PREHEADER]]:
+; DEFAULT-ON-NEXT: [[INDVARS_IV57]] = phi i64 [ [[TMP0]], %[[FOR_COND_CLEANUP7_SPLIT]] ], [ 0, %[[FOR_COND5_PREHEADER_PREHEADER]] ]
+; DEFAULT-ON-NEXT: br label %[[FOR_COND1_PREHEADER_PREHEADER]]
+; DEFAULT-ON: [[FOR_COND9_PREHEADER_PREHEADER]]:
+; DEFAULT-ON-NEXT: br label %[[FOR_COND9_PREHEADER]]
+; DEFAULT-ON: [[FOR_COND_CLEANUP]]:
+; DEFAULT-ON-NEXT: ret void
+;
+; DEFAULT-OFF-LABEL: define void @explicit_on() {
+; DEFAULT-OFF-NEXT: [[ENTRY:.*]]:
+; DEFAULT-OFF-NEXT: br label %[[FOR_COND1_PREHEADER:.*]]
+; DEFAULT-OFF: [[FOR_COND1_PREHEADER]]:
+; DEFAULT-OFF-NEXT: [[INDVARS_IV61:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT62:%.*]], %[[FOR_COND_CLEANUP3:.*]] ]
+; DEFAULT-OFF-NEXT: br label %[[FOR_COND5_PREHEADER:.*]]
+; DEFAULT-OFF: [[FOR_COND_CLEANUP3]]:
+; DEFAULT-OFF-NEXT: [[INDVARS_IV_NEXT62]] = add nuw nsw i64 [[INDVARS_IV61]], 1
+; DEFAULT-OFF-NEXT: [[EXITCOND64:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT62]], 2
+; DEFAULT-OFF-NEXT: br i1 [[EXITCOND64]], label %[[FOR_COND1_PREHEADER]], label %[[FOR_COND_CLEANUP:.*]]
+; DEFAULT-OFF: [[FOR_COND_CLEANUP7:.*]]:
+; DEFAULT-OFF-NEXT: [[INDVARS_IV_NEXT58:%.*]] = add nuw nsw i64 [[INDVARS_IV57:%.*]], 1
+; DEFAULT-OFF-NEXT: [[EXITCOND60:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT58]], 2
+; DEFAULT-OFF-NEXT: br i1 [[EXITCOND60]], label %[[FOR_COND5_PREHEADER]], label %[[FOR_COND_CLEANUP3]]
+; DEFAULT-OFF: [[FOR_COND_CLEANUP11:.*]]:
+; DEFAULT-OFF-NEXT: [[INDVARS_IV_NEXT54:%.*]] = add nuw nsw i64 [[INDVARS_IV53:%.*]], 1
+; DEFAULT-OFF-NEXT: [[EXITCOND56:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT54]], 2
+; DEFAULT-OFF-NEXT: br i1 [[EXITCOND56]], label %[[FOR_COND9_PREHEADER:.*]], label %[[FOR_BODY12_SPLIT:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT-OFF: [[FOR_BODY12:.*]]:
+; DEFAULT-OFF-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP2:%.*]], %[[FOR_BODY12_SPLIT]] ], [ 0, %[[FOR_BODY12_PREHEADER:.*]] ]
+; DEFAULT-OFF-NEXT: br label %[[FOR_COND9_PREHEADER_PREHEADER:.*]]
+; DEFAULT-OFF: [[FOR_BODY12_SPLIT1:.*]]:
+; DEFAULT-OFF-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @b, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
+; DEFAULT-OFF-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX18]], align 4
+; DEFAULT-OFF-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds nuw [2 x [2 x [2 x [2 x i32]]]], ptr @a, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV53]], i64 [[INDVARS_IV57]], i64 [[INDVARS_IV61]]
+; DEFAULT-OFF-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4
+; DEFAULT-OFF-N...
[truncated]
|
5d41708
to
2236c18
Compare
// Check the metadata for interchange. The outermost one is taken into account | ||
// and nested ones are ignored. The metadata affects the entire loop nest such | ||
// that the outermost loop is the loop for which the metadata is specified. For | ||
// example, in the following example, the loop-interchange will be performed | ||
// only to the outermost two loops, and the second pragma is ignored. | ||
// | ||
// for (...) | ||
// for (...) | ||
// #pragma clang loop interchange(disable) | ||
// for (...) | ||
// #pragma clang loop interchange(enable) | ||
// for (...) | ||
// for (...) | ||
// Stmt | ||
// |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since LoopInterchange processes a loop nest, I think it makes sense that the pragma applies to the entire loop, not just the two loops it's specified for. However, with or without the interchange pragma, I'm not sure how to properly handle any other pragmas. Currently, LoopInterchange doesn't change the metadata of transformed loops. So, if the loops are interchanged in the following case:
#pragma unroll_and_jam
for (int j = 0; j < N; j++)
for (int i = 0; i < N; i++)
...
then it looks like:
for (int i = 0; i < N; i++)
#pragma unroll_and_jam
for (int j = 0; j < N; j++)
...
Is this a desired behavior? Or is there a better way to handle it? This may be relevant to what we should set in the follow-up metadata for the interchange. IMHO the current behavior is reasonable (i.e., in the above case, unroll_and_jam is not performed due to "unsupported transformation ordering"). Would like to hear any opinions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you look into CGLoopInfo.cpp
, the metadata comonly have a followup_*
property. This is so frontends can specify which other metadata/loop transformations to apply on the loops after unroll/vectorize/distribute/... has been applied. E.g. it could specify to apply unroll_and_jam
on the i-loop, which becomes the outer loop after the interchange has been applied. For insterchange, this could be used to specify multiple pairwise exchanges, one following the other.
Unfortunately the system is only implemented incompletely. Only those series of loop transformations can be applied that follow the order of the pipeline. For instance, one cannot apply loop distribution after vectorization, because in LLVM's default pipeline, vectorize comes after loop distribution. This is an implementation detail that should not be visible to the user. The idea was once to apply passes until all such metadata has been consumed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for your comment.
E.g. it could specify to apply unroll_and_jam on the i-loop, which becomes the outer loop after the interchange has been applied.
IIUIC, we can rewrite the metadata and move the metadata for unroll_and_jam
from the j-loop to i-loop. What I'm curious is that, is this desirable behavior for the user? IMO this behavior is a bit confusing for the user. The best solution here would be to apply unroll_and_jam before interchange, but as you said, it's impossible due to the order depends on the default pipeline.
For insterchange, this could be used to specify multiple pairwise exchanges, one following the other.
You meant that, in the code below, the i-loop and j-loop are exchanged first, then the k-loop and l-loop are exchanged. Is my understanding correct?
#pragma clang loop interchange(enable)
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
#pragma clang loop interchange(enable)
for (int k = 0; k < N; k++) {
for (int l = 0; l < N; l++) {
...
}
}
}
}
At first I tried implementing it in this way. But now I feel it would be more useful to affect the entire loop nest rather than against a loop pairwise, since LoopInterchange handles a loop nest. That is, in the code above, loop interchange is applied to all four loops, and the new order of the loop nest is determined by the LoopInterchange pass. Or, if we want to disable the interchange for some reason and specify it as below, it would be better to stop applying the interchange not only to the i-loop and j-loop, but to the entire loop nest (i.e., including the k-loop and l-loop). WDYT?
#pragma clang loop interchange(disable)
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++) {
for (int l = 0; l < N; l++) {
...
}
}
}
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What I'm curious is that, is this desirable behavior for the user? IMO this behavior is a bit confusing for the user. The best solution here would be to apply unroll_and_jam before interchange, but as you said, it's impossible due to the order depends on the default pipeline.
For the existing #pragma clang loop
, the order is encoded in the order of the passes in the default pass pipeline, so to not cause any changed begavior. This means that e.g.
#pragma clang loop unroll_count(4)
#pragma clang loop vectorize(enable)
is the same behaviour as
#pragma clang loop vectorize(enable)
#pragma clang loop unroll_count(4)
or even
#pragma clang loop unroll_count(4) vectorize(enable)
This is not something I could change. I would have tried to fix that at one point, but atm I think the frontend would just warn/error out if something looks like it is applied in a specific order, but currently cannot because of pipeline order.
You meant that, in the code below, the i-loop and j-loop are exchanged first, then the k-loop and l-loop are exchanged. Is my understanding correct?
Yes.
However, if the loops to be interchange were overlapping, the metadata has additional information about the order that you could not represent with pragmas like this. E.g.
#pragma clang loop interchange(enable)
for (int i = 0; i < N; i++) {
#pragma clang loop interchange(enable)
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++) {
Do we apply "j <=> k" first, then "k <=> i"? Or does it apply "i <=> j", then "j <=> k"?
That problem is not solved when using pragmas that specified the entire permutations, they can still overlap:
#pragma clang loop interchange_permutation(j,i,k)
for (int i = 0; i < N; i++) {
#pragma clang loop interchange_permutation(k,j)
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++) {
Although you could just error-out/warn if the user writes such thing. But to repeat: the metadata representation is independent of how you design the #pragma
syntax. It just should be unambiguous. Unlike pragmas, you can use groups to select a set of loops that belong to a loop nest. E.g. see llvm.loop.parallel_accesses
and llvm.access.group
.
At first I tried implementing it in this way. But now I feel it would be more useful to affect the entire loop nest rather than against a loop pairwise, since LoopInterchange handles a loop nest.
That would be fine by me, both representations should be equally powefull.
The idea was instead of having one mighty metadata, to have smaller, composable metadata. Smaller units is what usually an IR is all about,
Or, if we want to disable the interchange for some reason and specify it as below, it would be better to stop applying the interchange not only to the i-loop and j-loop, but to the entire loop nest (i.e., including the k-loop and l-loop). WDYT?
I think #pragma clang loop
only applies on one loop. So in the example, only i
is not allowed to participate in any loop interchange (either as outer or inner loops). Anything else would be unpredictable. What if j
was not cosidered a counted loop by LoopInterchange? Then k
and l
would sill be their own loop nest and not see the interchange.disable
metadata.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for your great advice! It's really helpful.
The idea was instead of having one mighty metadata, to have smaller, composable metadata. Smaller units is what usually an IR is all about,
It makes sense to me, I'm going to try this approach. Please allow me to ask for a review upon completion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Specifying what the new order is (with more than two loops) using e.g.
!{!"llvm.loop.interchange.permutation", !j, !k, !i}
is perfectly reasonable as well. That permutation wouldn't be needed if we are always interchanging just two loops, but encoding everything in a series of followups might be challanging:
!i = !{!i , !interchange_enable, !interchange_ij, !interchange_outer_followup, !interchange_inner_followup }
!interchange_enable = !{"llvm.loop.interchange.enable", !1}
!interchange_ij = !{!"llvm.loop.interchange_with", !j} # Not strictly necessary, there is only one inner loop that can be exchanged with, but useful to check for something fishy going on, such as another pass having modified the loops
!interchange_outer_followup = !{!"llvm.loop.interchange.outer_followup", !j_interchanged}
!interchange_inner_followup = !{!"llvm.loop.interchange.inner_followup", !i_interchanged}
!j_interchanged = !{}
!i_interchanged = !{!interchange_enable, !interchange_ik}
!interchange_ik= !{!"llvm.loop.interchange_with", !k}
Keep in mind that LoopIDs are not identifiers. They change even when just a property is addeded to them, e.g. llvm.loop.isvectorized
for marking a loop as non-vectorizable, so do not try again.
/// LoopInterchangeList manages the list of loops and the range to which the | ||
/// interchange may be applied. | ||
struct LoopInterchangeList { | ||
SmallVector<Loop *, 8> LoopList; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not specifiying the new order of the loops within the nest?
// Check the metadata for interchange. The outermost one is taken into account | ||
// and nested ones are ignored. The metadata affects the entire loop nest such | ||
// that the outermost loop is the loop for which the metadata is specified. For | ||
// example, in the following example, the loop-interchange will be performed | ||
// only to the outermost two loops, and the second pragma is ignored. | ||
// | ||
// for (...) | ||
// for (...) | ||
// #pragma clang loop interchange(disable) | ||
// for (...) | ||
// #pragma clang loop interchange(enable) | ||
// for (...) | ||
// for (...) | ||
// Stmt | ||
// |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you look into CGLoopInfo.cpp
, the metadata comonly have a followup_*
property. This is so frontends can specify which other metadata/loop transformations to apply on the loops after unroll/vectorize/distribute/... has been applied. E.g. it could specify to apply unroll_and_jam
on the i-loop, which becomes the outer loop after the interchange has been applied. For insterchange, this could be used to specify multiple pairwise exchanges, one following the other.
Unfortunately the system is only implemented incompletely. Only those series of loop transformations can be applied that follow the order of the pipeline. For instance, one cannot apply loop distribution after vectorization, because in LLVM's default pipeline, vectorize comes after loop distribution. This is an implementation detail that should not be visible to the user. The idea was once to apply passes until all such metadata has been consumed.
Some post-processing involved in exchanging a pair of loops has been done separately from `processLoop`, which is a main function that does the transformation. It's better to consolidate these processes into the same function. This patch is a preparation of llvm#127474.
) Some post-processing involved in exchanging a pair of loops has been done separately from `processLoop`, which is a main function that does the transformation. It's better to consolidate these processes into the same function. This patch is a preparation of #127474.
This patch adds metadata to enable/disable the loop-interchange for a loop nest. This is a prelude to introduce a new pragma directive for loop-interchange, like other loop optimizations (unroll, vectorize, distribute, etc.) have.
0ef2ccf
to
72ff917
Compare
72ff917
to
2418ad8
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi @Meinersbur .Thank you for all the help and insights on the other day. I fixed the implementation and updated the description of the PR, could you please check again? Thanks!
// Update the metadata. | ||
std::optional<MDNode *> MDNextOuterLoopID = | ||
makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll, | ||
LLVMLoopInterchangeFollowupNextOuter}); | ||
std::optional<MDNode *> MDOuterLoopID = | ||
makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll, | ||
LLVMLoopInterchangeFollowupOuter}); | ||
std::optional<MDNode *> MDInnerLoopID = | ||
makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll, | ||
LLVMLoopInterchangeFollowupInner}); | ||
if (MDNextOuterLoopID) { | ||
if (NextOuterLoop) { | ||
NextOuterLoop->setLoopID(*MDNextOuterLoopID); | ||
} else { | ||
LLVM_DEBUG(dbgs() | ||
<< "New metadata for the next outer loop is ignored.\n"); | ||
} | ||
} | ||
if (MDOuterLoopID) | ||
OuterLoop->setLoopID(*MDOuterLoopID); | ||
if (MDInnerLoopID) | ||
InnerLoop->setLoopID(*MDInnerLoopID); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because this is inside the function processEnabledLoop
, these metadata processes are not performed by default (i.e., OnlyWhenForced
is false). However, if I understand it correctly, follow-up metadata will be generated whenever the pragma enabling interchange is specified, e.g., the following case:
// compilation options: -O3 -floop-interchange
#pragma clang loop interchange(enable) unroll(disable) // Enabling interchange by pragma doesn't make sense, since loop-interchange is enabled by the compilation option.
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
...
So should we also handle the follow-up metadata in processLoopList
? If my understanding is correct, any other loop optimization passes (like unroll, distribute, etc.) don't handle them in the similar situation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Loops should be re-added to the worklist, so any followup loop interchange can be processed.
Other passes only apply just one transformation, usually because doing it multiple times is nonsensical. E.g. wou would not want to vectorize a loop twice.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I misunderstood other passes' behavior.
Loops should be re-added to the worklist, so any followup loop interchange can be processed.
Yes, this is already done.
My concern is how to handle metadata when we use the bubble sort algorithm. Do the changes in 7eff317 make sense?
|
||
// Hold outer loops to be exchanged (i.e., loops that have | ||
// "llvm.loop.interchange.enable" is true), in the current nest order. | ||
SmallVector<Loop *, 4> Worklist; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As for the cases where the loops to be interchanged do not interfere with each other, I plan to handle them independently. That is, the following code
#pragma clang loop interchange(enable)
for (i=0; i<N; i++)
for (j=0; j<N; j++)
#pragma clang loop interchange(enable)
for (k=0; k<N; k++)
for (l=0; l<N; l++)
...
will be translated like as
!interchange_ij = !{!"interchange_ij", !interchange_enable}
!interchange_kl = !{!"interchange_kl", !interchange_enable}
!interchange_enable = !{!"llvm.loop.interchange.enable", i1 true}
not as follows.
!interchange_kl = !{!"interchange_kl", !interchange_enable, !followup_ij}
!interchange_enable = !{!"llvm.loop.interchange.enable", i1 true}
!followup_ij = !{"followup_next_next_outer", ...}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's fine, but also the less interesting case.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Consider adding the loop interchange metadata to https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp, such that for whatever reason there is still interchange instructions on some loops, the user gets a warning.
|
||
// Hold outer loops to be exchanged (i.e., loops that have | ||
// "llvm.loop.interchange.enable" is true), in the current nest order. | ||
SmallVector<Loop *, 4> Worklist; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's fine, but also the less interesting case.
// Set an upper bound of the number of transformations to avoid infinite | ||
// loop. There is no deep meaning behind the current value (square of the | ||
// size of LoopList). | ||
// TODO: Is this really necessary? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the metadata is properly remove after having applied the interchange, then not. Consider an assertion for debug builds when that happens.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed in 0e954b3
// Update the metadata. | ||
std::optional<MDNode *> MDNextOuterLoopID = | ||
makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll, | ||
LLVMLoopInterchangeFollowupNextOuter}); | ||
std::optional<MDNode *> MDOuterLoopID = | ||
makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll, | ||
LLVMLoopInterchangeFollowupOuter}); | ||
std::optional<MDNode *> MDInnerLoopID = | ||
makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll, | ||
LLVMLoopInterchangeFollowupInner}); | ||
if (MDNextOuterLoopID) { | ||
if (NextOuterLoop) { | ||
NextOuterLoop->setLoopID(*MDNextOuterLoopID); | ||
} else { | ||
LLVM_DEBUG(dbgs() | ||
<< "New metadata for the next outer loop is ignored.\n"); | ||
} | ||
} | ||
if (MDOuterLoopID) | ||
OuterLoop->setLoopID(*MDOuterLoopID); | ||
if (MDInnerLoopID) | ||
InnerLoop->setLoopID(*MDInnerLoopID); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Loops should be re-added to the worklist, so any followup loop interchange can be processed.
Other passes only apply just one transformation, usually because doing it multiple times is nonsensical. E.g. wou would not want to vectorize a loop twice.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Consider adding the loop interchange metadata to https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp, such that for whatever reason there is still interchange instructions on some loops, the user gets a warning.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Consider adding the loop interchange metadata to https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp, such that for whatever reason there is still interchange instructions on some loops, the user gets a warning.
if (hasInterchangeTransformation(L) == TM_ForcedByUser) { | ||
LLVM_DEBUG(dbgs() << "Leftover interchange transformation\n"); | ||
ORE->emit( | ||
DiagnosticInfoOptimizationFailure(DEBUG_TYPE, | ||
"FailedRequestedInterchange", | ||
L->getStartLoc(), L->getHeader()) | ||
<< "loop not interchanged: the optimizer was unable to perform the " | ||
"requested transformation; the transformation might be disabled or " | ||
"specified as part of an unsupported transformation ordering"); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Consider adding the loop interchange metadata to https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp, such that for whatever reason there is still interchange instructions on some loops, the user gets a warning.
Is this what you intended?
// Set an upper bound of the number of transformations to avoid infinite | ||
// loop. There is no deep meaning behind the current value (square of the | ||
// size of LoopList). | ||
// TODO: Is this really necessary? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed in 0e954b3
// Update the metadata. | ||
std::optional<MDNode *> MDNextOuterLoopID = | ||
makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll, | ||
LLVMLoopInterchangeFollowupNextOuter}); | ||
std::optional<MDNode *> MDOuterLoopID = | ||
makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll, | ||
LLVMLoopInterchangeFollowupOuter}); | ||
std::optional<MDNode *> MDInnerLoopID = | ||
makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll, | ||
LLVMLoopInterchangeFollowupInner}); | ||
if (MDNextOuterLoopID) { | ||
if (NextOuterLoop) { | ||
NextOuterLoop->setLoopID(*MDNextOuterLoopID); | ||
} else { | ||
LLVM_DEBUG(dbgs() | ||
<< "New metadata for the next outer loop is ignored.\n"); | ||
} | ||
} | ||
if (MDOuterLoopID) | ||
OuterLoop->setLoopID(*MDOuterLoopID); | ||
if (MDInnerLoopID) | ||
InnerLoop->setLoopID(*MDInnerLoopID); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I misunderstood other passes' behavior.
Loops should be re-added to the worklist, so any followup loop interchange can be processed.
Yes, this is already done.
My concern is how to handle metadata when we use the bubble sort algorithm. Do the changes in 7eff317 make sense?
I've been trying to figure out how the follow-up mechanism works, but I'm getting a little confused. It seems to me that void f(float *a, float x) {
#pragma clang loop vectorize(enable) unroll_count(8)
for (int i = 0; i < 1024; i++) {
a[i] *= x;
}
} LLVM IR before and after loop-vectorize are in https://godbolt.org/z/bde8va1TG. For the output IR, the two loops ( https://godbolt.org/z/bzd8d8WxK
|
) When pragma of loop transformations are encoded in LLVM IR, follow-up metadata is used if multiple transformations are specified. They are used to explicitly express the order of the transformations. However, they are not properly processed on each transformation pass, so now only the first one is attempted to be applied. This is a pre-commit to add a test that causes the problem. ref: #127474 (comment)
…(NFC) (#131337) When pragma of loop transformations are encoded in LLVM IR, follow-up metadata is used if multiple transformations are specified. They are used to explicitly express the order of the transformations. However, they are not properly processed on each transformation pass, so now only the first one is attempted to be applied. This is a pre-commit to add a test that causes the problem. ref: llvm/llvm-project#127474 (comment)
) When pragma of loop transformations are encoded in LLVM IR, follow-up metadata is used if multiple transformations are specified. They are used to explicitly express the order of the transformations. However, they are not properly processed on each transformation pass, so now only the first one is attempted to be applied. This is a pre-commit to add a test that causes the problem. ref: #127474 (comment)
) When pragma of loop transformations are encoded in LLVM IR, follow-up metadata is used if multiple transformations are specified. They are used to explicitly express the order of the transformations. However, they are not properly processed on each transformation pass, so now only the first one is attempted to be applied. This is a pre-commit to add a test that causes the problem. ref: #127474 (comment)
…#129514) Some post-processing involved in exchanging a pair of loops has been done separately from `processLoop`, which is a main function that does the transformation. It's better to consolidate these processes into the same function. This patch is a preparation of llvm#127474.
LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n"); | ||
return false; | ||
|
||
// If the interchange is explicitly enabled, skip the profitability check. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe it is reasonable and have changed to skip the profitability check when the loop has metadata that explicitly specifies exchange.
Gentle ping |
This patch adds metadata to enable/disable the loop-interchange for a loop nest. This is a prelude to introduce a new pragma directive for loop-interchange, like other loop optimizations (unroll, vectorize, distribute, etc.) have.
Because LoopInterchange processes a loop-nest, the follow-up metadata is in a similar format to LoopUnrollAndJam. However, in addition to
followup_outer
andfollowup_inner
, also usefollowup_next_outer
, which will be attached to the next outer loop of the exchanged loop. We can express the application order of exchanges with them, by putting the next transformation to be applied in the follow-up metadata.For example, as for the loops like below,
it will be translated as follows.
LoopInterchange will perform the exchange between the j-loop and k-loop due to
!interchange_jk
, then process the follow-up metadata. As a result,!enable
is appended to the next outer loop, which in this case is the i-loop, so the next interchange is applied to the i-loop and the k-loop.The application order is unambiguous unless there are two adjacent loops and both have metadata to enable interchange. If such loops exist, LoopInterchange will stop executing subsequent processes.