Skip to content

Commit 2d1e8a0

Browse files
authored
[EarlyCSE] Compare GEP instructions based on offset (#65875)
Closes #65763. This will provide more opportunities for constant propagation for subsequent optimizations.
1 parent 60a227c commit 2d1e8a0

File tree

3 files changed

+229
-31
lines changed

3 files changed

+229
-31
lines changed

llvm/lib/Transforms/Scalar/EarlyCSE.cpp

+141-31
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ STATISTIC(NumCSE, "Number of instructions CSE'd");
6767
STATISTIC(NumCSECVP, "Number of compare instructions CVP'd");
6868
STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
6969
STATISTIC(NumCSECall, "Number of call instructions CSE'd");
70+
STATISTIC(NumCSEGEP, "Number of GEP instructions CSE'd");
7071
STATISTIC(NumDSE, "Number of trivial dead stores removed");
7172

7273
DEBUG_COUNTER(CSECounter, "early-cse",
@@ -143,11 +144,11 @@ struct SimpleValue {
143144
!CI->getFunction()->isPresplitCoroutine();
144145
}
145146
return isa<CastInst>(Inst) || isa<UnaryOperator>(Inst) ||
146-
isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) ||
147-
isa<CmpInst>(Inst) || isa<SelectInst>(Inst) ||
148-
isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
149-
isa<ShuffleVectorInst>(Inst) || isa<ExtractValueInst>(Inst) ||
150-
isa<InsertValueInst>(Inst) || isa<FreezeInst>(Inst);
147+
isa<BinaryOperator>(Inst) || isa<CmpInst>(Inst) ||
148+
isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
149+
isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
150+
isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst) ||
151+
isa<FreezeInst>(Inst);
151152
}
152153
};
153154

@@ -307,10 +308,9 @@ static unsigned getHashValueImpl(SimpleValue Val) {
307308
IVI->getOperand(1),
308309
hash_combine_range(IVI->idx_begin(), IVI->idx_end()));
309310

310-
assert((isa<CallInst>(Inst) || isa<GetElementPtrInst>(Inst) ||
311-
isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
312-
isa<ShuffleVectorInst>(Inst) || isa<UnaryOperator>(Inst) ||
313-
isa<FreezeInst>(Inst)) &&
311+
assert((isa<CallInst>(Inst) || isa<ExtractElementInst>(Inst) ||
312+
isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
313+
isa<UnaryOperator>(Inst) || isa<FreezeInst>(Inst)) &&
314314
"Invalid/unknown instruction");
315315

316316
// Handle intrinsics with commutative operands.
@@ -548,11 +548,81 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
548548
// currently executing, so conservatively return false if they are in
549549
// different basic blocks.
550550
if (LHSI->isConvergent() && LHSI->getParent() != RHSI->getParent())
551-
return false;
551+
return false;
552552

553553
return LHSI->isIdenticalTo(RHSI);
554554
}
555555

556+
//===----------------------------------------------------------------------===//
557+
// GEPValue
558+
//===----------------------------------------------------------------------===//
559+
560+
namespace {
561+
562+
/// Hash-table key describing a getelementptr instruction: the instruction
/// itself plus, when the caller could compute one, its constant byte offset.
/// DenseMapInfo<GEPValue> uses both fields for hashing and equality.
struct GEPValue {
563+
// Either a GetElementPtrInst or one of the DenseMapInfo<Instruction *>
// empty/tombstone sentinel pointers (enforced by the constructor asserts).
Instruction *Inst;
564+
// Constant offset supplied by the creator of the key; left empty when no
// constant offset is known (and for sentinel keys).
std::optional<int64_t> ConstantOffset;
565+
566+
// Intentionally implicit: DenseMapInfo<GEPValue>::getEmptyKey() and
// getTombstoneKey() return an Instruction * that converts through this
// constructor. ConstantOffset stays empty.
GEPValue(Instruction *I) : Inst(I) {
567+
assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
568+
}
569+
570+
// Builds a key for \p I with an explicitly provided (possibly empty)
// constant offset.
GEPValue(Instruction *I, std::optional<int64_t> ConstantOffset)
571+
: Inst(I), ConstantOffset(ConstantOffset) {
572+
assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
573+
}
574+
575+
// True when Inst is the empty or tombstone marker rather than a real
// instruction; such keys must never be dereferenced.
bool isSentinel() const {
576+
return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
577+
Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
578+
}
579+
580+
// Only getelementptr instructions can be wrapped in a GEPValue.
static bool canHandle(Instruction *Inst) {
581+
return isa<GetElementPtrInst>(Inst);
582+
}
583+
};
584+
585+
} // namespace
586+
587+
namespace llvm {
588+
589+
/// DenseMapInfo traits that let GEPValue be used as a hash-table key.
template <> struct DenseMapInfo<GEPValue> {
590+
// The sentinel keys reuse the Instruction * sentinels; the returned pointer
// is converted to a GEPValue by its single-argument constructor.
static inline GEPValue getEmptyKey() {
591+
return DenseMapInfo<Instruction *>::getEmptyKey();
592+
}
593+
594+
static inline GEPValue getTombstoneKey() {
595+
return DenseMapInfo<Instruction *>::getTombstoneKey();
596+
}
597+
598+
// Both are defined out of line, after the llvm namespace is closed.
static unsigned getHashValue(const GEPValue &Val);
599+
static bool isEqual(const GEPValue &LHS, const GEPValue &RHS);
600+
};
601+
602+
} // end namespace llvm
603+
604+
/// Hashes a GEPValue. \p Val must wrap a real GetElementPtrInst: the cast
/// below is unconditional, so sentinel keys must not be passed in.
unsigned DenseMapInfo<GEPValue>::getHashValue(const GEPValue &Val) {
605+
auto *GEP = cast<GetElementPtrInst>(Val.Inst);
606+
// With a known constant offset, hash only the opcode, base pointer and
// offset. isEqual() treats two such keys as equal when pointer operand and
// offset match, so the hash must not depend on how the indices are written.
if (Val.ConstantOffset.has_value())
607+
return hash_combine(GEP->getOpcode(), GEP->getPointerOperand(),
608+
Val.ConstantOffset.value());
609+
// No constant offset: hash the opcode together with every operand.
return hash_combine(
610+
GEP->getOpcode(),
611+
hash_combine_range(GEP->value_op_begin(), GEP->value_op_end()));
612+
}
613+
614+
/// Returns true when \p LHS and \p RHS denote the same key: either the same
/// sentinel, or two GEPs off the same pointer operand that have equal known
/// constant offsets or are otherwise identical instructions.
bool DenseMapInfo<GEPValue>::isEqual(const GEPValue &LHS, const GEPValue &RHS) {
615+
// Sentinels are compared by pointer identity only; this also keeps the
// casts below from ever seeing an empty/tombstone pointer.
if (LHS.isSentinel() || RHS.isSentinel())
616+
return LHS.Inst == RHS.Inst;
617+
auto *LGEP = cast<GetElementPtrInst>(LHS.Inst);
618+
auto *RGEP = cast<GetElementPtrInst>(RHS.Inst);
619+
// Different base pointers can never match, whatever the offsets are.
if (LGEP->getPointerOperand() != RGEP->getPointerOperand())
620+
return false;
621+
// Same base and both offsets known: equality is decided by the offsets
// alone, regardless of source element type or index spelling.
if (LHS.ConstantOffset.has_value() && RHS.ConstantOffset.has_value())
622+
return LHS.ConstantOffset.value() == RHS.ConstantOffset.value();
623+
// Fall back to a structural comparison of the two instructions.
// NOTE(review): presumably this ignores poison-generating flags such as
// inbounds - confirm against Instruction::isIdenticalToWhenDefined.
return LGEP->isIdenticalToWhenDefined(RGEP);
624+
}
625+
556626
//===----------------------------------------------------------------------===//
557627
// EarlyCSE implementation
558628
//===----------------------------------------------------------------------===//
@@ -647,6 +717,13 @@ class EarlyCSE {
647717
ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>;
648718
CallHTType AvailableCalls;
649719

720+
using GEPMapAllocatorTy =
721+
RecyclingAllocator<BumpPtrAllocator,
722+
ScopedHashTableVal<GEPValue, Value *>>;
723+
using GEPHTType = ScopedHashTable<GEPValue, Value *, DenseMapInfo<GEPValue>,
724+
GEPMapAllocatorTy>;
725+
GEPHTType AvailableGEPs;
726+
650727
/// This is the current generation of the memory value.
651728
unsigned CurrentGeneration = 0;
652729

@@ -667,9 +744,11 @@ class EarlyCSE {
667744
class NodeScope {
668745
public:
669746
NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
670-
InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
671-
: Scope(AvailableValues), LoadScope(AvailableLoads),
672-
InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
747+
InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
748+
GEPHTType &AvailableGEPs)
749+
: Scope(AvailableValues), LoadScope(AvailableLoads),
750+
InvariantScope(AvailableInvariants), CallScope(AvailableCalls),
751+
GEPScope(AvailableGEPs) {}
673752
NodeScope(const NodeScope &) = delete;
674753
NodeScope &operator=(const NodeScope &) = delete;
675754

@@ -678,6 +757,7 @@ class EarlyCSE {
678757
LoadHTType::ScopeTy LoadScope;
679758
InvariantHTType::ScopeTy InvariantScope;
680759
CallHTType::ScopeTy CallScope;
760+
GEPHTType::ScopeTy GEPScope;
681761
};
682762

683763
// Contains all the needed information to create a stack for doing a depth
@@ -688,13 +768,13 @@ class EarlyCSE {
688768
public:
689769
StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
690770
InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
691-
unsigned cg, DomTreeNode *n, DomTreeNode::const_iterator child,
771+
GEPHTType &AvailableGEPs, unsigned cg, DomTreeNode *n,
772+
DomTreeNode::const_iterator child,
692773
DomTreeNode::const_iterator end)
693774
: CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
694775
EndIter(end),
695776
Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
696-
AvailableCalls)
697-
{}
777+
AvailableCalls, AvailableGEPs) {}
698778
StackNode(const StackNode &) = delete;
699779
StackNode &operator=(const StackNode &) = delete;
700780

@@ -1214,6 +1294,20 @@ Value *EarlyCSE::getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst,
12141294
return Result;
12151295
}
12161296

1297+
/// Prepares \p To to stand in for \p From: when \p To is an instruction, its
/// IR flags may be narrowed to those also present on \p From. Does nothing
/// when \p To is not an Instruction.
static void combineIRFlags(Instruction &From, Value *To) {
1298+
if (auto *I = dyn_cast<Instruction>(To)) {
1299+
// If I being poison triggers UB, there is no need to drop those
1300+
// flags. Otherwise, only retain flags present on both I and From.
1301+
// TODO: Currently some fast-math flags are not treated as
1302+
// poison-generating even though they should. Until this is fixed,
1303+
// always retain flags present on both I and From for floating point
1304+
// instructions.
1305+
if (isa<FPMathOperator>(I) ||
1306+
(I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I)))
1307+
I->andIRFlags(&From);
1308+
}
1309+
}
1310+
12171311
bool EarlyCSE::overridingStores(const ParseMemoryInst &Earlier,
12181312
const ParseMemoryInst &Later) {
12191313
// Can we remove Earlier store because of Later store?
@@ -1439,16 +1533,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
14391533
LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
14401534
continue;
14411535
}
1442-
if (auto *I = dyn_cast<Instruction>(V)) {
1443-
// If I being poison triggers UB, there is no need to drop those
1444-
// flags. Otherwise, only retain flags present on both I and Inst.
1445-
// TODO: Currently some fast-math flags are not treated as
1446-
// poison-generating even though they should. Until this is fixed,
1447-
// always retain flags present on both I and Inst for floating point
1448-
// instructions.
1449-
if (isa<FPMathOperator>(I) || (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I)))
1450-
I->andIRFlags(&Inst);
1451-
}
1536+
combineIRFlags(Inst, V);
14521537
Inst.replaceAllUsesWith(V);
14531538
salvageKnowledge(&Inst, &AC);
14541539
removeMSSA(Inst);
@@ -1561,6 +1646,31 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
15611646
continue;
15621647
}
15631648

1649+
// Compare GEP instructions based on offset.
1650+
if (GEPValue::canHandle(&Inst)) {
1651+
auto *GEP = cast<GetElementPtrInst>(&Inst);
1652+
APInt Offset = APInt(SQ.DL.getIndexTypeSizeInBits(GEP->getType()), 0);
1653+
GEPValue GEPVal(GEP, GEP->accumulateConstantOffset(SQ.DL, Offset)
1654+
? Offset.trySExtValue()
1655+
: std::nullopt);
1656+
if (Value *V = AvailableGEPs.lookup(GEPVal)) {
1657+
LLVM_DEBUG(dbgs() << "EarlyCSE CSE GEP: " << Inst << " to: " << *V
1658+
<< '\n');
1659+
combineIRFlags(Inst, V);
1660+
Inst.replaceAllUsesWith(V);
1661+
salvageKnowledge(&Inst, &AC);
1662+
removeMSSA(Inst);
1663+
Inst.eraseFromParent();
1664+
Changed = true;
1665+
++NumCSEGEP;
1666+
continue;
1667+
}
1668+
1669+
// Otherwise, just remember that we have this GEP.
1670+
AvailableGEPs.insert(GEPVal, &Inst);
1671+
continue;
1672+
}
1673+
15641674
// A release fence requires that all stores complete before it, but does
15651675
// not prevent the reordering of following loads 'before' the fence. As a
15661676
// result, we don't need to consider it as writing to memory and don't need
@@ -1675,7 +1785,7 @@ bool EarlyCSE::run() {
16751785
// Process the root node.
16761786
nodesToProcess.push_back(new StackNode(
16771787
AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
1678-
CurrentGeneration, DT.getRootNode(),
1788+
AvailableGEPs, CurrentGeneration, DT.getRootNode(),
16791789
DT.getRootNode()->begin(), DT.getRootNode()->end()));
16801790

16811791
assert(!CurrentGeneration && "Create a new EarlyCSE instance to rerun it.");
@@ -1698,10 +1808,10 @@ bool EarlyCSE::run() {
16981808
} else if (NodeToProcess->childIter() != NodeToProcess->end()) {
16991809
// Push the next child onto the stack.
17001810
DomTreeNode *child = NodeToProcess->nextChild();
1701-
nodesToProcess.push_back(
1702-
new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
1703-
AvailableCalls, NodeToProcess->childGeneration(),
1704-
child, child->begin(), child->end()));
1811+
nodesToProcess.push_back(new StackNode(
1812+
AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
1813+
AvailableGEPs, NodeToProcess->childGeneration(), child,
1814+
child->begin(), child->end()));
17051815
} else {
17061816
// It has been processed, and there are no more children to process,
17071817
// so delete it and pop it off the stack.

llvm/test/Transforms/EarlyCSE/gep.ll

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
2+
; RUN: opt < %s -S -passes=early-cse -earlycse-debug-hash | FileCheck %s
3+
; RUN: opt < %s -S -passes='early-cse<memssa>' | FileCheck %s
4+
5+
%T1 = type { i64, i64, i64 }
6+
7+
declare void @use_vec(<4 x ptr>);
8+
9+
define void @foo(ptr %a, <4 x i64> %b, i64 %i) {
10+
; CHECK-LABEL: define void @foo(
11+
; CHECK-SAME: ptr [[A:%.*]], <4 x i64> [[B:%.*]], i64 [[I:%.*]]) {
12+
; CHECK-NEXT: [[S1A:%.*]] = getelementptr i8, ptr [[A]], i64 8
13+
; CHECK-NEXT: [[N1D:%.*]] = getelementptr i8, ptr [[A]], i64 7
14+
; CHECK-NEXT: [[N1G:%.*]] = getelementptr i32, ptr [[A]], i64 1
15+
; CHECK-NEXT: [[N1H:%.*]] = getelementptr i8, ptr [[A]], i64 [[I]]
16+
; CHECK-NEXT: [[V:%.*]] = getelementptr i64, ptr [[A]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>
17+
; CHECK-NEXT: call void @use_vec(<4 x ptr> [[V]])
18+
; CHECK-NEXT: [[V2:%.*]] = getelementptr i64, ptr [[A]], <4 x i64> <i64 0, i64 2, i64 1, i64 1>
19+
; CHECK-NEXT: call void @use_vec(<4 x ptr> [[V2]])
20+
; CHECK-NEXT: ret void
21+
;
22+
%s1a = getelementptr i8, ptr %a, i64 8
23+
%s1av = load i64, ptr %s1a
24+
%s1b = getelementptr inbounds i8, ptr %a, i64 8
25+
%s1bv = load i64, ptr %s1b
26+
%s1c = getelementptr %T1, ptr %a, i64 0, i32 1
27+
%s1cv = load i64, ptr %s1c
28+
%n1d = getelementptr i8, ptr %a, i64 7
29+
%n1dv = load i64, ptr %n1d
30+
%s1e = getelementptr i64, ptr %a, i64 1
31+
%s1ev = load i64, ptr %s1e
32+
%s1f = getelementptr i32, ptr %a, i64 2
33+
%s1fv = load i64, ptr %s1f
34+
%n1g = getelementptr i32, ptr %a, i64 1
35+
%n1gv = load i64, ptr %n1g
36+
%n1h = getelementptr i8, ptr %a, i64 %i
37+
%n1hv = load i64, ptr %n1h
38+
39+
%v = getelementptr i64, ptr %a, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
40+
call void @use_vec(<4 x ptr> %v)
41+
%v2 = getelementptr i64, ptr %a, <4 x i64> <i64 0, i64 2, i64 1, i64 1>
42+
call void @use_vec(<4 x ptr> %v2)
43+
ret void
44+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
2+
; RUN: opt < %s -O3 -S | FileCheck %s
3+
4+
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
5+
target triple = "x86_64-unknown-linux-gnu"
6+
7+
%Zip = type { { ptr, ptr }, { [32 x i8], { i64, i64 } } }
8+
9+
define void @foo(ptr %a, <32 x i8> %_0) #0 {
10+
; CHECK-LABEL: define void @foo(
11+
; CHECK-SAME: ptr nocapture writeonly [[A:%.*]], <32 x i8> [[_0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
12+
; CHECK-NEXT: start:
13+
; CHECK-NEXT: store <32 x i8> [[_0]], ptr [[A]], align 1
14+
; CHECK-NEXT: ret void
15+
;
16+
start:
17+
%z = alloca %Zip, align 8
18+
%sroa_1 = getelementptr i8, ptr %z, i64 16
19+
store <32 x i8> %_0, ptr %sroa_1, align 8
20+
%len_ = getelementptr i8, ptr %z, i64 56
21+
store i64 32, ptr %len_, align 8
22+
%_1 = getelementptr %Zip, ptr %z, i64 0, i32 1, i32 1
23+
%_2 = getelementptr %Zip, ptr %z, i64 0, i32 1, i32 1, i32 1
24+
%len = load i64, ptr %_2, align 8
25+
%_10 = getelementptr %Zip, ptr %z, i64 0, i32 1
26+
br label %body
27+
28+
body: ; preds = %body, %start
29+
%_34 = phi ptr [ %_34i, %body ], [ %a, %start ]
30+
%idx = phi i64 [ %idx_, %body ], [ 0, %start ]
31+
%_34i = getelementptr i8, ptr %_34, i64 1
32+
%idx_ = add i64 %idx, 1
33+
store i64 0, ptr %_1, align 8
34+
%_24 = getelementptr i8, ptr %_10, i64 %idx
35+
%_18 = load i8, ptr %_24, align 1
36+
store i8 %_18, ptr %_34, align 1
37+
%_6 = icmp eq i64 %len, %idx_
38+
br i1 %_6, label %exit, label %body
39+
40+
exit: ; preds = %body
41+
ret void
42+
}
43+
44+
attributes #0 = { "target-cpu"="znver3" }

0 commit comments

Comments
 (0)