Skip to content

Commit 9552dd1

Browse files
committed
[PowerPC] avoid masking already-zero bits in BitPermutationSelector
The current BitPermutationSelector generates code to build a value by tracking two types of bits: ConstZero and Variable. ConstZero means a bit we need to mask off, and Variable is a bit we copy from an input value. This patch adds a third type of bit, VariableKnownToBeZero, caused by an AssertZext node or a zero-extending load node. VariableKnownToBeZero means a bit that comes from an input value but is known to be already zero, so we do not need to mask it. VariableKnownToBeZero enhances flexibility in grouping bits, since we can avoid redundant masking for these bits. This patch also renames "HasZeros" to "NeedMask", since we may now skip masking even when we have zeros (of type VariableKnownToBeZero). Differential Revision: https://reviews.llvm.org/D48025 llvm-svn: 344347
1 parent 6cbb3ca commit 9552dd1

File tree

5 files changed

+143
-28
lines changed

5 files changed

+143
-28
lines changed

llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp

Lines changed: 104 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1083,9 +1083,14 @@ class BitPermutationSelector {
10831083
// lowest-order bit.
10841084
unsigned Idx;
10851085

1086+
// ConstZero means a bit we need to mask off.
1087+
// Variable is a bit that comes from an input variable.
1088+
// VariableKnownToBeZero is also a bit that comes from an input variable,
1089+
// but it is known to be already zero. So we do not need to mask it.
10861090
enum Kind {
10871091
ConstZero,
1088-
Variable
1092+
Variable,
1093+
VariableKnownToBeZero
10891094
} K;
10901095

10911096
ValueBit(SDValue V, unsigned I, Kind K = Variable)
@@ -1094,11 +1099,11 @@ class BitPermutationSelector {
10941099
: V(SDValue(nullptr, 0)), Idx(UINT32_MAX), K(K) {}
10951100

10961101
bool isZero() const {
1097-
return K == ConstZero;
1102+
return K == ConstZero || K == VariableKnownToBeZero;
10981103
}
10991104

11001105
bool hasValue() const {
1101-
return K == Variable;
1106+
return K == Variable || K == VariableKnownToBeZero;
11021107
}
11031108

11041109
SDValue getValue() const {
@@ -1248,8 +1253,14 @@ class BitPermutationSelector {
12481253
for (unsigned i = 0; i < NumBits; ++i)
12491254
if (((Mask >> i) & 1) == 1)
12501255
Bits[i] = (*LHSBits)[i];
1251-
else
1252-
Bits[i] = ValueBit(ValueBit::ConstZero);
1256+
else {
1257+
// AND instruction masks this bit. If the input is already zero,
1258+
// we have nothing to do here. Otherwise, make the bit ConstZero.
1259+
if ((*LHSBits)[i].isZero())
1260+
Bits[i] = (*LHSBits)[i];
1261+
else
1262+
Bits[i] = ValueBit(ValueBit::ConstZero);
1263+
}
12531264

12541265
return std::make_pair(Interesting, &Bits);
12551266
}
@@ -1259,15 +1270,43 @@ class BitPermutationSelector {
12591270
const auto &RHSBits = *getValueBits(V.getOperand(1), NumBits).second;
12601271

12611272
bool AllDisjoint = true;
1262-
for (unsigned i = 0; i < NumBits; ++i)
1263-
if (LHSBits[i].isZero())
1273+
SDValue LastVal = SDValue();
1274+
unsigned LastIdx = 0;
1275+
for (unsigned i = 0; i < NumBits; ++i) {
1276+
if (LHSBits[i].isZero() && RHSBits[i].isZero()) {
1277+
// If both inputs are known to be zero and one is ConstZero and
1278+
// another is VariableKnownToBeZero, we can select whichever
1279+
// we like. To minimize the number of bit groups, we select
1280+
// VariableKnownToBeZero if this bit is the next bit of the same
1281+
// input variable from the previous bit. Otherwise, we select
1282+
// ConstZero.
1283+
if (LHSBits[i].hasValue() && LHSBits[i].getValue() == LastVal &&
1284+
LHSBits[i].getValueBitIndex() == LastIdx + 1)
1285+
Bits[i] = LHSBits[i];
1286+
else if (RHSBits[i].hasValue() && RHSBits[i].getValue() == LastVal &&
1287+
RHSBits[i].getValueBitIndex() == LastIdx + 1)
1288+
Bits[i] = RHSBits[i];
1289+
else
1290+
Bits[i] = ValueBit(ValueBit::ConstZero);
1291+
}
1292+
else if (LHSBits[i].isZero())
12641293
Bits[i] = RHSBits[i];
12651294
else if (RHSBits[i].isZero())
12661295
Bits[i] = LHSBits[i];
12671296
else {
12681297
AllDisjoint = false;
12691298
break;
12701299
}
1300+
// We remember the value and bit index of this bit.
1301+
if (Bits[i].hasValue()) {
1302+
LastVal = Bits[i].getValue();
1303+
LastIdx = Bits[i].getValueBitIndex();
1304+
}
1305+
else {
1306+
if (LastVal) LastVal = SDValue();
1307+
LastIdx = 0;
1308+
}
1309+
}
12711310

12721311
if (!AllDisjoint)
12731312
break;
@@ -1293,6 +1332,44 @@ class BitPermutationSelector {
12931332

12941333
return std::make_pair(Interesting, &Bits);
12951334
}
1335+
case ISD::AssertZext: {
1336+
// For AssertZext, we look through the operand and
1337+
// mark the bits known to be zero.
1338+
const SmallVector<ValueBit, 64> *LHSBits;
1339+
std::tie(Interesting, LHSBits) = getValueBits(V.getOperand(0),
1340+
NumBits);
1341+
1342+
EVT FromType = cast<VTSDNode>(V.getOperand(1))->getVT();
1343+
const unsigned NumValidBits = FromType.getSizeInBits();
1344+
for (unsigned i = 0; i < NumValidBits; ++i)
1345+
Bits[i] = (*LHSBits)[i];
1346+
1347+
// These bits are known to be zero.
1348+
for (unsigned i = NumValidBits; i < NumBits; ++i)
1349+
Bits[i] = ValueBit((*LHSBits)[i].getValue(),
1350+
(*LHSBits)[i].getValueBitIndex(),
1351+
ValueBit::VariableKnownToBeZero);
1352+
1353+
return std::make_pair(Interesting, &Bits);
1354+
}
1355+
case ISD::LOAD:
1356+
LoadSDNode *LD = cast<LoadSDNode>(V);
1357+
if (ISD::isZEXTLoad(V.getNode()) && V.getResNo() == 0) {
1358+
EVT VT = LD->getMemoryVT();
1359+
const unsigned NumValidBits = VT.getSizeInBits();
1360+
1361+
for (unsigned i = 0; i < NumValidBits; ++i)
1362+
Bits[i] = ValueBit(V, i);
1363+
1364+
// These bits are known to be zero.
1365+
for (unsigned i = NumValidBits; i < NumBits; ++i)
1366+
Bits[i] = ValueBit(V, i, ValueBit::VariableKnownToBeZero);
1367+
1368+
// Zero-extending load itself cannot be optimized. So, it is not
1369+
// interesting by itself though it gives useful information.
1370+
return std::make_pair(Interesting = false, &Bits);
1371+
}
1372+
break;
12961373
}
12971374

12981375
for (unsigned i = 0; i < NumBits; ++i)
@@ -1304,7 +1381,7 @@ class BitPermutationSelector {
13041381
// For each value (except the constant ones), compute the left-rotate amount
13051382
// to get it from its original to final position.
13061383
void computeRotationAmounts() {
1307-
HasZeros = false;
1384+
NeedMask = false;
13081385
RLAmt.resize(Bits.size());
13091386
for (unsigned i = 0; i < Bits.size(); ++i)
13101387
if (Bits[i].hasValue()) {
@@ -1314,7 +1391,7 @@ class BitPermutationSelector {
13141391
else
13151392
RLAmt[i] = Bits.size() - (VBI - i);
13161393
} else if (Bits[i].isZero()) {
1317-
HasZeros = true;
1394+
NeedMask = true;
13181395
RLAmt[i] = UINT32_MAX;
13191396
} else {
13201397
llvm_unreachable("Unknown value bit type");
@@ -1330,6 +1407,7 @@ class BitPermutationSelector {
13301407
unsigned LastRLAmt = RLAmt[0];
13311408
SDValue LastValue = Bits[0].hasValue() ? Bits[0].getValue() : SDValue();
13321409
unsigned LastGroupStartIdx = 0;
1410+
bool IsGroupOfZeros = !Bits[LastGroupStartIdx].hasValue();
13331411
for (unsigned i = 1; i < Bits.size(); ++i) {
13341412
unsigned ThisRLAmt = RLAmt[i];
13351413
SDValue ThisValue = Bits[i].hasValue() ? Bits[i].getValue() : SDValue();
@@ -1342,17 +1420,28 @@ class BitPermutationSelector {
13421420
LastGroupStartIdx = 0;
13431421
}
13441422

1423+
// If this bit is known to be zero and the current group is a bit group
1424+
// of zeros, we do not need to terminate the current bit group even if the
1425+
// Value or RLAmt does not match here. Instead, we terminate this group
1426+
// when the first non-zero bit appears later.
1427+
if (IsGroupOfZeros && Bits[i].isZero())
1428+
continue;
1429+
13451430
// If this bit has the same underlying value and the same rotate factor as
13461431
// the last one, then they're part of the same group.
13471432
if (ThisRLAmt == LastRLAmt && ThisValue == LastValue)
1348-
continue;
1433+
// We cannot continue the current group if this bit is not known to
1434+
// be zero in a bit group of zeros.
1435+
if (!(IsGroupOfZeros && ThisValue && !Bits[i].isZero()))
1436+
continue;
13491437

13501438
if (LastValue.getNode())
13511439
BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
13521440
i-1));
13531441
LastRLAmt = ThisRLAmt;
13541442
LastValue = ThisValue;
13551443
LastGroupStartIdx = i;
1444+
IsGroupOfZeros = !Bits[LastGroupStartIdx].hasValue();
13561445
}
13571446
if (LastValue.getNode())
13581447
BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
@@ -1698,7 +1787,7 @@ class BitPermutationSelector {
16981787
// If we've not yet selected a 'starting' instruction, and we have no zeros
16991788
// to fill in, select the (Value, RLAmt) with the highest priority (largest
17001789
// number of groups), and start with this rotated value.
1701-
if ((!HasZeros || LateMask) && !Res) {
1790+
if ((!NeedMask || LateMask) && !Res) {
17021791
ValueRotInfo &VRI = ValueRotsVec[0];
17031792
if (VRI.RLAmt) {
17041793
if (InstCnt) *InstCnt += 1;
@@ -2077,7 +2166,7 @@ class BitPermutationSelector {
20772166
// If we've not yet selected a 'starting' instruction, and we have no zeros
20782167
// to fill in, select the (Value, RLAmt) with the highest priority (largest
20792168
// number of groups), and start with this rotated value.
2080-
if ((!HasZeros || LateMask) && !Res) {
2169+
if ((!NeedMask || LateMask) && !Res) {
20812170
// If we have both Repl32 groups and non-Repl32 groups, the non-Repl32
20822171
// groups will come first, and so the VRI representing the largest number
20832172
// of groups might not be first (it might be the first Repl32 groups).
@@ -2230,7 +2319,7 @@ class BitPermutationSelector {
22302319

22312320
SmallVector<ValueBit, 64> Bits;
22322321

2233-
bool HasZeros;
2322+
bool NeedMask;
22342323
SmallVector<unsigned, 64> RLAmt;
22352324

22362325
SmallVector<BitGroup, 16> BitGroups;
@@ -2259,10 +2348,10 @@ class BitPermutationSelector {
22592348
" selection for: ");
22602349
LLVM_DEBUG(N->dump(CurDAG));
22612350

2262-
// Fill it RLAmt and set HasZeros.
2351+
// Fill it RLAmt and set NeedMask.
22632352
computeRotationAmounts();
22642353

2265-
if (!HasZeros)
2354+
if (!NeedMask)
22662355
return Select(N, false);
22672356

22682357
// We currently have two techniques for handling results with zeros: early

llvm/test/CodeGen/PowerPC/addi-offset-fold.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,9 @@ entry:
2727
; FIXME: We don't need to do these stores at all.
2828
; CHECK-DAG: std 3, -24(1)
2929
; CHECK-DAG: stb 4, -16(1)
30-
; CHECK-DAG: sldi [[REG3:[0-9]+]], 4, 32
3130
; CHECK-DAG: lwz [[REG2:[0-9]+]], -20(1)
32-
; CHECK-DAG: or [[REG4:[0-9]+]], [[REG2]], [[REG3]]
33-
; CHECK: rldicl 3, [[REG4]], 33, 57
31+
; CHECK-DAG: rlwinm 3, [[REG2]], 1, 31, 31
32+
; CHECK: rlwimi 3, 4, 1, 25, 30
3433
; CHECK: blr
3534
}
3635

llvm/test/CodeGen/PowerPC/bitfieldinsert.ll

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,35 @@
11
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
22
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
33

4+
; equivalent C code
5+
; struct s64 {
6+
; int a:5;
7+
; int b:16;
8+
; long c:42;
9+
; };
10+
; void bitfieldinsert64(struct s64 *p, unsigned short v) {
11+
; p->b = v;
12+
; }
13+
14+
%struct.s64 = type { i64 }
15+
16+
define void @bitfieldinsert64(%struct.s64* nocapture %p, i16 zeroext %v) {
17+
; CHECK-LABEL: @bitfieldinsert64
18+
; CHECK: ld [[REG1:[0-9]+]], 0(3)
19+
; CHECK-NEXT: rlwimi [[REG1]], 4, 5, 11, 26
20+
; CHECK-NEXT: std [[REG1]], 0(3)
21+
; CHECK-NEXT: blr
22+
entry:
23+
%0 = getelementptr inbounds %struct.s64, %struct.s64* %p, i64 0, i32 0
24+
%1 = zext i16 %v to i64
25+
%bf.load = load i64, i64* %0, align 8
26+
%bf.shl = shl nuw nsw i64 %1, 5
27+
%bf.clear = and i64 %bf.load, -2097121
28+
%bf.set = or i64 %bf.clear, %bf.shl
29+
store i64 %bf.set, i64* %0, align 8
30+
ret void
31+
}
32+
433
; bitfieldinsert32: Test for rlwimi
534
; equivalent C code
635
; struct s32 {
@@ -17,9 +46,9 @@
1746
define void @bitfieldinsert32(%struct.s32* nocapture %p, i32 zeroext %v) {
1847
; CHECK-LABEL: @bitfieldinsert32
1948
; CHECK: lwz [[REG1:[0-9]+]], 0(3)
20-
; CHECK: rlwimi [[REG1]], 4, 8, 8, 23
21-
; CHECK: stw [[REG1]], 0(3)
22-
; CHECK: blr
49+
; CHECK-NEXT: rlwimi [[REG1]], 4, 8, 8, 23
50+
; CHECK-NEXT: stw [[REG1]], 0(3)
51+
; CHECK-NEXT: blr
2352
entry:
2453
%0 = getelementptr inbounds %struct.s32, %struct.s32* %p, i64 0, i32 0
2554
%bf.load = load i32, i32* %0, align 4

llvm/test/CodeGen/PowerPC/ppc64le-aggregates.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -236,14 +236,12 @@ entry:
236236
; CHECK-DAG: stfs 6, [[OFF1:[0-9]+]](1)
237237
; CHECK-DAG: stfs 7, [[OFF2:[0-9]+]](1)
238238
; CHECK-DAG: stfs 8, [[OFF3:[0-9]+]](1)
239-
; CHECK-DAG: lwz [[REG0:[0-9]+]], [[OFF0]](1)
239+
; CHECK-DAG: lwz 9, [[OFF0]](1)
240240
; CHECK-DAG: lwz [[REG1:[0-9]+]], [[OFF1]](1)
241-
; CHECK-DAG: lwz [[REG2:[0-9]+]], [[OFF2]](1)
241+
; CHECK-DAG: lwz 10, [[OFF2]](1)
242242
; CHECK-DAG: lwz [[REG3:[0-9]+]], [[OFF3]](1)
243-
; CHECK-DAG: sldi [[REG1]], [[REG1]], 32
244-
; CHECK-DAG: sldi [[REG3]], [[REG3]], 32
245-
; CHECK-DAG: or 9, [[REG0]], [[REG1]]
246-
; CHECK-DAG: or 10, [[REG2]], [[REG3]]
243+
; CHECK-DAG: rldimi 9, [[REG1]], 32, 0
244+
; CHECK-DAG: rldimi 10, [[REG3]], 32, 0
247245
; CHECK: bl test1
248246

249247
declare void @test1([8 x float], [8 x float])

llvm/test/CodeGen/PowerPC/rlwimi-dyn-and.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ next:
3939
ret i32 %conv174
4040

4141
; CHECK-LABEL: @test2
42-
; CHECK: slwi 3, {{[0-9]+}}, 7
42+
; CHECK: rlwinm 3, {{[0-9]+}}, 7, 17, 24
4343
; CHECK: rlwimi 3, {{[0-9]+}}, 15, 16, 16
4444
; CHECK: blr
4545
}

0 commit comments

Comments
 (0)