diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 439235f47471e..4990fa9f8b5ea 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -67,6 +67,7 @@ STATISTIC(NumCSE, "Number of instructions CSE'd");
 STATISTIC(NumCSECVP, "Number of compare instructions CVP'd");
 STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
 STATISTIC(NumCSECall, "Number of call instructions CSE'd");
+STATISTIC(NumCSEGEP, "Number of GEP instructions CSE'd");
 STATISTIC(NumDSE, "Number of trivial dead stores removed");
 
 DEBUG_COUNTER(CSECounter, "early-cse",
@@ -143,11 +144,11 @@ struct SimpleValue {
              !CI->getFunction()->isPresplitCoroutine();
     }
     return isa<CastInst>(Inst) || isa<UnaryOperator>(Inst) ||
-           isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) ||
-           isa<CmpInst>(Inst) || isa<SelectInst>(Inst) ||
-           isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
-           isa<ShuffleVectorInst>(Inst) || isa<ExtractValueInst>(Inst) ||
-           isa<InsertValueInst>(Inst) || isa<FreezeInst>(Inst);
+           isa<BinaryOperator>(Inst) || isa<CmpInst>(Inst) ||
+           isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
+           isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
+           isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst) ||
+           isa<FreezeInst>(Inst);
   }
 };
 
@@ -307,10 +308,9 @@ static unsigned getHashValueImpl(SimpleValue Val) {
                         IVI->getOperand(1),
                         hash_combine_range(IVI->idx_begin(), IVI->idx_end()));
 
-  assert((isa<CallInst>(Inst) || isa<GetElementPtrInst>(Inst) ||
-          isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
-          isa<ShuffleVectorInst>(Inst) || isa<UnaryOperator>(Inst) ||
-          isa<FreezeInst>(Inst)) &&
+  assert((isa<CallInst>(Inst) || isa<ExtractElementInst>(Inst) ||
+          isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
+          isa<UnaryOperator>(Inst) || isa<FreezeInst>(Inst)) &&
          "Invalid/unknown instruction");
 
   // Handle intrinsics with commutative operands.
@@ -548,11 +548,81 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
   // currently executing, so conservatively return false if they are in
   // different basic blocks.
   if (LHSI->isConvergent() && LHSI->getParent() != RHSI->getParent())
-      return false;
+    return false;
 
   return LHSI->isIdenticalTo(RHSI);
 }
 
+//===----------------------------------------------------------------------===//
+// GEPValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+struct GEPValue {
+  Instruction *Inst;
+  std::optional<int64_t> ConstantOffset;
+
+  GEPValue(Instruction *I) : Inst(I) {
+    assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+  }
+
+  GEPValue(Instruction *I, std::optional<int64_t> ConstantOffset)
+      : Inst(I), ConstantOffset(ConstantOffset) {
+    assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+  }
+
+  bool isSentinel() const {
+    return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+           Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+  }
+
+  static bool canHandle(Instruction *Inst) {
+    return isa<GetElementPtrInst>(Inst);
+  }
+};
+
+} // namespace
+
+namespace llvm {
+
+template <> struct DenseMapInfo<GEPValue> {
+  static inline GEPValue getEmptyKey() {
+    return DenseMapInfo<Instruction *>::getEmptyKey();
+  }
+
+  static inline GEPValue getTombstoneKey() {
+    return DenseMapInfo<Instruction *>::getTombstoneKey();
+  }
+
+  static unsigned getHashValue(const GEPValue &Val);
+  static bool isEqual(const GEPValue &LHS, const GEPValue &RHS);
+};
+
+} // end namespace llvm
+
+unsigned DenseMapInfo<GEPValue>::getHashValue(const GEPValue &Val) {
+  auto *GEP = cast<GetElementPtrInst>(Val.Inst);
+  if (Val.ConstantOffset.has_value())
+    return hash_combine(GEP->getOpcode(), GEP->getPointerOperand(),
+                        Val.ConstantOffset.value());
+  return hash_combine(
+      GEP->getOpcode(),
+      hash_combine_range(GEP->value_op_begin(), GEP->value_op_end()));
+}
+
+bool DenseMapInfo<GEPValue>::isEqual(const GEPValue &LHS, const GEPValue &RHS) {
+  if (LHS.isSentinel() || RHS.isSentinel())
+    return LHS.Inst == RHS.Inst;
+  auto *LGEP = cast<GetElementPtrInst>(LHS.Inst);
+  auto *RGEP = cast<GetElementPtrInst>(RHS.Inst);
+  if (LGEP->getPointerOperand() != RGEP->getPointerOperand())
+    return false;
+  if (LHS.ConstantOffset.has_value() && RHS.ConstantOffset.has_value())
+    return LHS.ConstantOffset.value() == RHS.ConstantOffset.value();
+  return LGEP->isIdenticalToWhenDefined(RGEP);
+}
+
 //===----------------------------------------------------------------------===//
 // EarlyCSE implementation
 //===----------------------------------------------------------------------===//
@@ -647,6 +717,13 @@ class EarlyCSE {
       ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>;
   CallHTType AvailableCalls;
 
+  using GEPMapAllocatorTy =
+      RecyclingAllocator<BumpPtrAllocator,
+                         ScopedHashTableVal<GEPValue, Value *>>;
+  using GEPHTType = ScopedHashTable<GEPValue, Value *, DenseMapInfo<GEPValue>,
+                                    GEPMapAllocatorTy>;
+  GEPHTType AvailableGEPs;
+
   /// This is the current generation of the memory value.
   unsigned CurrentGeneration = 0;
 
@@ -667,9 +744,11 @@ class EarlyCSE {
   class NodeScope {
   public:
     NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
-              InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
-        : Scope(AvailableValues), LoadScope(AvailableLoads),
-          InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
+              InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
+              GEPHTType &AvailableGEPs)
+        : Scope(AvailableValues), LoadScope(AvailableLoads),
+          InvariantScope(AvailableInvariants), CallScope(AvailableCalls),
+          GEPScope(AvailableGEPs) {}
     NodeScope(const NodeScope &) = delete;
     NodeScope &operator=(const NodeScope &) = delete;
 
@@ -678,6 +757,7 @@ class EarlyCSE {
     LoadHTType::ScopeTy LoadScope;
     InvariantHTType::ScopeTy InvariantScope;
     CallHTType::ScopeTy CallScope;
+    GEPHTType::ScopeTy GEPScope;
   };
 
   // Contains all the needed information to create a stack for doing a depth
@@ -688,13 +768,13 @@ class EarlyCSE {
   public:
     StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
               InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
-              unsigned cg, DomTreeNode *n, DomTreeNode::const_iterator child,
+              GEPHTType &AvailableGEPs, unsigned cg, DomTreeNode *n,
+              DomTreeNode::const_iterator child,
               DomTreeNode::const_iterator end)
         : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
           EndIter(end),
           Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
-                 AvailableCalls)
-      {}
+                 AvailableCalls, AvailableGEPs) {}
     StackNode(const StackNode &) = delete;
     StackNode &operator=(const StackNode &) = delete;
 
@@ -1214,6 +1294,20 @@ Value *EarlyCSE::getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst,
   return Result;
 }
 
+static void combineIRFlags(Instruction &From, Value *To) {
+  if (auto *I = dyn_cast<Instruction>(To)) {
+    // If I being poison triggers UB, there is no need to drop those
+    // flags. Otherwise, only retain flags present on both I and Inst.
+    // TODO: Currently some fast-math flags are not treated as
+    // poison-generating even though they should. Until this is fixed,
+    // always retain flags present on both I and Inst for floating point
+    // instructions.
+    if (isa<FPMathOperator>(I) ||
+        (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I)))
+      I->andIRFlags(&From);
+  }
+}
+
 bool EarlyCSE::overridingStores(const ParseMemoryInst &Earlier,
                                 const ParseMemoryInst &Later) {
   // Can we remove Earlier store because of Later store?
@@ -1439,16 +1533,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
           LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
           continue;
         }
-        if (auto *I = dyn_cast<Instruction>(V)) {
-          // If I being poison triggers UB, there is no need to drop those
-          // flags. Otherwise, only retain flags present on both I and Inst.
-          // TODO: Currently some fast-math flags are not treated as
-          // poison-generating even though they should. Until this is fixed,
-          // always retain flags present on both I and Inst for floating point
-          // instructions.
-          if (isa<FPMathOperator>(I) || (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I)))
-            I->andIRFlags(&Inst);
-        }
+        combineIRFlags(Inst, V);
         Inst.replaceAllUsesWith(V);
         salvageKnowledge(&Inst, &AC);
         removeMSSA(Inst);
@@ -1561,6 +1646,31 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
       continue;
     }
 
+    // Compare GEP instructions based on offset.
+    if (GEPValue::canHandle(&Inst)) {
+      auto *GEP = cast<GetElementPtrInst>(&Inst);
+      APInt Offset = APInt(SQ.DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+      GEPValue GEPVal(GEP, GEP->accumulateConstantOffset(SQ.DL, Offset)
+                               ? Offset.trySExtValue()
+                               : std::nullopt);
+      if (Value *V = AvailableGEPs.lookup(GEPVal)) {
+        LLVM_DEBUG(dbgs() << "EarlyCSE CSE GEP: " << Inst << " to: " << *V
+                          << '\n');
+        combineIRFlags(Inst, V);
+        Inst.replaceAllUsesWith(V);
+        salvageKnowledge(&Inst, &AC);
+        removeMSSA(Inst);
+        Inst.eraseFromParent();
+        Changed = true;
+        ++NumCSEGEP;
+        continue;
+      }
+
+      // Otherwise, just remember that we have this GEP.
+      AvailableGEPs.insert(GEPVal, &Inst);
+      continue;
+    }
+
     // A release fence requires that all stores complete before it, but does
     // not prevent the reordering of following loads 'before' the fence. As a
     // result, we don't need to consider it as writing to memory and don't need
@@ -1675,7 +1785,7 @@ bool EarlyCSE::run() {
   // Process the root node.
   nodesToProcess.push_back(new StackNode(
       AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
-      CurrentGeneration, DT.getRootNode(),
+      AvailableGEPs, CurrentGeneration, DT.getRootNode(),
       DT.getRootNode()->begin(), DT.getRootNode()->end()));
 
   assert(!CurrentGeneration && "Create a new EarlyCSE instance to rerun it.");
@@ -1698,10 +1808,10 @@ bool EarlyCSE::run() {
     } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
       // Push the next child onto the stack.
       DomTreeNode *child = NodeToProcess->nextChild();
-      nodesToProcess.push_back(
-          new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
-                        AvailableCalls, NodeToProcess->childGeneration(),
-                        child, child->begin(), child->end()));
+      nodesToProcess.push_back(new StackNode(
+          AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
+          AvailableGEPs, NodeToProcess->childGeneration(), child,
+          child->begin(), child->end()));
     } else {
       // It has been processed, and there are no more children to process,
       // so delete it and pop it off the stack.
diff --git a/llvm/test/Transforms/EarlyCSE/gep.ll b/llvm/test/Transforms/EarlyCSE/gep.ll
new file mode 100644
index 0000000000000..499b5ac8de0af
--- /dev/null
+++ b/llvm/test/Transforms/EarlyCSE/gep.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt < %s -S -passes=early-cse -earlycse-debug-hash | FileCheck %s
+; RUN: opt < %s -S -passes='early-cse<memssa>' | FileCheck %s
+
+%T1 = type { i64, i64, i64 }
+
+declare void @use_vec(<4 x ptr>);
+
+define void @foo(ptr %a, <4 x i64> %b, i64 %i) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ptr [[A:%.*]], <4 x i64> [[B:%.*]], i64 [[I:%.*]]) {
+; CHECK-NEXT:    [[S1A:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[N1D:%.*]] = getelementptr i8, ptr [[A]], i64 7
+; CHECK-NEXT:    [[N1G:%.*]] = getelementptr i32, ptr [[A]], i64 1
+; CHECK-NEXT:    [[N1H:%.*]] = getelementptr i8, ptr [[A]], i64 [[I]]
+; CHECK-NEXT:    [[V:%.*]] = getelementptr i64, ptr [[A]], <4 x i64> <i64 1, i64 2, i64 3, i64 4>
+; CHECK-NEXT:    call void @use_vec(<4 x ptr> [[V]])
+; CHECK-NEXT:    [[V2:%.*]] = getelementptr i64, ptr [[A]], <4 x i64> <i64 5, i64 6, i64 7, i64 8>
+; CHECK-NEXT:    call void @use_vec(<4 x ptr> [[V2]])
+; CHECK-NEXT:    ret void
+;
+  %s1a = getelementptr i8, ptr %a, i64 8
+  %s1av = load i64, ptr %s1a
+  %s1b = getelementptr inbounds i8, ptr %a, i64 8
+  %s1bv = load i64, ptr %s1b
+  %s1c = getelementptr %T1, ptr %a, i64 0, i32 1
+  %s1cv = load i64, ptr %s1c
+  %n1d = getelementptr i8, ptr %a, i64 7
+  %n1dv = load i64, ptr %n1d
+  %s1e = getelementptr i64, ptr %a, i64 1
+  %s1ev = load i64, ptr %s1e
+  %s1f = getelementptr i32, ptr %a, i64 2
+  %s1fv = load i64, ptr %s1f
+  %n1g = getelementptr i32, ptr %a, i64 1
+  %n1gv = load i64, ptr %n1g
+  %n1h = getelementptr i8, ptr %a, i64 %i
+  %n1hv = load i64, ptr %n1h
+
+  %v = getelementptr i64, ptr %a, <4 x i64> <i64 1, i64 2, i64 3, i64 4>
+  call void @use_vec(<4 x ptr> %v)
+  %v2 = getelementptr i64, ptr %a, <4 x i64> <i64 5, i64 6, i64 7, i64 8>
+  call void @use_vec(<4 x ptr> %v2)
+  ret void
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll b/llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll
new file mode 100644
index 0000000000000..1c9e7a771ca19
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt < %s -O3 -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%Zip = type { { ptr, ptr }, { [32 x i8], { i64, i64 } } }
+
+define void @foo(ptr %a, <32 x i8> %_0) #0 {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ptr nocapture writeonly [[A:%.*]], <32 x i8> [[_0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  start:
+; CHECK-NEXT:    store <32 x i8> [[_0]], ptr [[A]], align 1
+; CHECK-NEXT:    ret void
+;
+start:
+  %z = alloca %Zip, align 8
+  %sroa_1 = getelementptr i8, ptr %z, i64 16
+  store <32 x i8> %_0, ptr %sroa_1, align 8
+  %len_ = getelementptr i8, ptr %z, i64 56
+  store i64 32, ptr %len_, align 8
+  %_1 = getelementptr %Zip, ptr %z, i64 0, i32 1, i32 1
+  %_2 = getelementptr %Zip, ptr %z, i64 0, i32 1, i32 1, i32 1
+  %len = load i64, ptr %_2, align 8
+  %_10 = getelementptr %Zip, ptr %z, i64 0, i32 1
+  br label %body
+
+body:                                             ; preds = %body, %start
+  %_34 = phi ptr [ %_34i, %body ], [ %a, %start ]
+  %idx = phi i64 [ %idx_, %body ], [ 0, %start ]
+  %_34i = getelementptr i8, ptr %_34, i64 1
+  %idx_ = add i64 %idx, 1
+  store i64 0, ptr %_1, align 8
+  %_24 = getelementptr i8, ptr %_10, i64 %idx
+  %_18 = load i8, ptr %_24, align 1
+  store i8 %_18, ptr %_34, align 1
+  %_6 = icmp eq i64 %len, %idx_
+  br i1 %_6, label %exit, label %body
+
+exit:                                             ; preds = %body
+  ret void
+}
+
+attributes #0 = { "target-cpu"="znver3" }