Skip to content

Commit 90e9895

Browse files
authored
[X86] Handle BSF/BSR "zero-input pass through" behaviour (#123623)
Intel docs have been updated to be similar to AMD and now describe BSF/BSR as not changing the destination register if the input value was zero, which allows us to support CTTZ/CTLZ zero-input cases by setting the destination to support a NumBits result (BSR is a bit messy as it has to be XOR'd to create a CTLZ result). VIA/Zhaoxin x86_64 CPUs have also been confirmed to match this behaviour. This patch adjusts the X86ISD::BSF/BSR nodes to take a "pass through" argument for zero-input cases; by default this is set to UNDEF to match existing behaviour, but it can be set to a suitable value if supported. There are still some limits to this - it's only supported for x86_64 capable processors (and I've only enabled it for x86_64 codegen), and Intel CPUs sometimes zero the upper 32 bits of a pass through register when used for BSR32/BSF32 with a zero source value (i.e. the whole 64 bits may not get passed through). Fixes #122004
1 parent 0e944a3 commit 90e9895

18 files changed

+298
-365
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

+51-13
Original file line numberDiff line numberDiff line change
@@ -436,7 +436,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
436436
setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
437437
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
438438
if (Subtarget.is64Bit()) {
439-
setOperationPromotedToType(ISD::CTTZ , MVT::i32, MVT::i64);
440439
setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
441440
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
442441
}
@@ -3386,15 +3385,19 @@ bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
33863385
}
33873386

33883387
bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
3389-
// Speculate cttz only if we can directly use TZCNT or can promote to i32/i64.
3388+
// Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3389+
// i32/i64 or can rely on BSF passthrough value.
33903390
return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3391+
Subtarget.hasBitScanPassThrough() ||
33913392
(!Ty->isVectorTy() &&
33923393
Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
33933394
}
33943395

33953396
bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
3396-
// Speculate ctlz only if we can directly use LZCNT.
3397-
return Subtarget.hasLZCNT() || Subtarget.canUseCMOV();
3397+
// Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3398+
// passthrough value.
3399+
return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3400+
Subtarget.hasBitScanPassThrough();
33983401
}
33993402

34003403
bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
@@ -28694,11 +28697,18 @@ static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
2869428697
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
2869528698
}
2869628699

28700+
// Check if we can safely pass a result through BSR for zero sources.
28701+
SDValue PassThru = DAG.getUNDEF(OpVT);
28702+
if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
28703+
!DAG.isKnownNeverZero(Op))
28704+
PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
28705+
2869728706
// Issue a bsr (scan bits in reverse) which also sets EFLAGS.
2869828707
SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
28699-
Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
28708+
Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
2870028709

28701-
if (Opc == ISD::CTLZ) {
28710+
// Skip CMOV if we're using a pass through value.
28711+
if (Opc == ISD::CTLZ && PassThru.isUndef()) {
2870228712
// If src is zero (i.e. bsr sets ZF), returns NumBits.
2870328713
SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
2870428714
DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
@@ -28721,16 +28731,22 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
2872128731
unsigned NumBits = VT.getScalarSizeInBits();
2872228732
SDValue N0 = Op.getOperand(0);
2872328733
SDLoc dl(Op);
28734+
bool NonZeroSrc = DAG.isKnownNeverZero(N0);
2872428735

2872528736
assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
2872628737
"Only scalar CTTZ requires custom lowering");
2872728738

28739+
// Check if we can safely pass a result through BSF for zero sources.
28740+
SDValue PassThru = DAG.getUNDEF(VT);
28741+
if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
28742+
PassThru = DAG.getConstant(NumBits, dl, VT);
28743+
2872828744
// Issue a bsf (scan bits forward) which also sets EFLAGS.
2872928745
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
28730-
Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
28746+
Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
2873128747

28732-
// If src is known never zero we can skip the CMOV.
28733-
if (DAG.isKnownNeverZero(N0))
28748+
// Skip CMOV if src is never zero or we're using a pass through value.
28749+
if (NonZeroSrc || !PassThru.isUndef())
2873428750
return Op;
2873528751

2873628752
// If src is zero (i.e. bsf sets ZF), returns NumBits.
@@ -38193,12 +38209,34 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
3819338209
Known = KnownBits::mul(Known, Known2);
3819438210
break;
3819538211
}
38196-
case X86ISD::BSR:
38197-
// BSR(0) is undef, but any use of BSR already accounts for non-zero inputs.
38198-
// Similar KnownBits behaviour to CTLZ_ZERO_UNDEF.
38212+
case X86ISD::BSF: {
38213+
Known.Zero.setBitsFrom(Log2_32(BitWidth));
38214+
38215+
KnownBits Known2;
38216+
Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38217+
if (Known2.isNonZero()) {
38218+
// If we have a known 1, its position is our upper bound.
38219+
unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38220+
unsigned LowBits = llvm::bit_width(PossibleTZ);
38221+
Known.Zero.setBitsFrom(LowBits);
38222+
} else if (!Op.getOperand(0).isUndef()) {
38223+
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38224+
Known = Known.intersectWith(Known2);
38225+
}
38226+
break;
38227+
}
38228+
case X86ISD::BSR: {
3819938229
// TODO: Bound with input known bits?
3820038230
Known.Zero.setBitsFrom(Log2_32(BitWidth));
38231+
38232+
if (!Op.getOperand(0).isUndef() &&
38233+
!DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38234+
KnownBits Known2;
38235+
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38236+
Known = Known.intersectWith(Known2);
38237+
}
3820138238
break;
38239+
}
3820238240
case X86ISD::SETCC:
3820338241
Known.Zero.setBitsFrom(1);
3820438242
break;
@@ -54243,7 +54281,7 @@ static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
5424354281
}
5424454282

5424554283
SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
54246-
Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
54284+
Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
5424754285
if (VT == MVT::i8)
5424854286
Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
5424954287

llvm/lib/Target/X86/X86InstrCompiler.td

+6-6
Original file line numberDiff line numberDiff line change
@@ -2213,12 +2213,12 @@ def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
22132213
(IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
22142214

22152215
// Bit scan instruction patterns to match explicit zero-undef behavior.
2216-
def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
2217-
def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
2218-
def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
2219-
def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
2220-
def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
2221-
def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
2216+
def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr (i16 (IMPLICIT_DEF)), GR16:$src)>;
2217+
def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr (i32 (IMPLICIT_DEF)), GR32:$src)>;
2218+
def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr (i64 (IMPLICIT_DEF)), GR64:$src)>;
2219+
def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm (i16 (IMPLICIT_DEF)), addr:$src)>;
2220+
def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm (i32 (IMPLICIT_DEF)), addr:$src)>;
2221+
def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm (i64 (IMPLICIT_DEF)), addr:$src)>;
22222222

22232223
// When HasMOVBE is enabled it is possible to get a non-legalized
22242224
// register-register 16 bit bswap. This maps it to a ROL instruction.

llvm/lib/Target/X86/X86InstrFragments.td

+8-5
Original file line numberDiff line numberDiff line change
@@ -134,8 +134,8 @@ def SDTX86Cmpccxadd : SDTypeProfile<1, 4, [SDTCisSameAs<0, 2>,
134134
def X86MFence : SDNode<"X86ISD::MFENCE", SDTNone, [SDNPHasChain]>;
135135

136136

137-
def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>;
138-
def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>;
137+
def X86bsf : SDNode<"X86ISD::BSF", SDTBinaryArithWithFlags>;
138+
def X86bsr : SDNode<"X86ISD::BSR", SDTBinaryArithWithFlags>;
139139
def X86fshl : SDNode<"X86ISD::FSHL", SDTIntShiftDOp>;
140140
def X86fshr : SDNode<"X86ISD::FSHR", SDTIntShiftDOp>;
141141

@@ -685,16 +685,19 @@ def anyext_sdiv : PatFrag<(ops node:$lhs), (anyext node:$lhs),[{
685685
// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
686686
// be copying from a truncate. AssertSext/AssertZext/AssertAlign aren't saying
687687
// anything about the upper 32 bits, they're probably just qualifying a
688-
// CopyFromReg. FREEZE may be coming from a a truncate. Any other 32-bit
689-
// operation will zero-extend up to 64 bits.
688+
// CopyFromReg. FREEZE may be coming from a truncate. BitScan fall through
689+
// values may not zero the upper bits correctly.
690+
// Any other 32-bit operation will zero-extend up to 64 bits.
690691
def def32 : PatLeaf<(i32 GR32:$src), [{
691692
return N->getOpcode() != ISD::TRUNCATE &&
692693
N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
693694
N->getOpcode() != ISD::CopyFromReg &&
694695
N->getOpcode() != ISD::AssertSext &&
695696
N->getOpcode() != ISD::AssertZext &&
696697
N->getOpcode() != ISD::AssertAlign &&
697-
N->getOpcode() != ISD::FREEZE;
698+
N->getOpcode() != ISD::FREEZE &&
699+
!((N->getOpcode() == X86ISD::BSF || N->getOpcode() == X86ISD::BSR) &&
700+
(!N->getOperand(0).isUndef() && !isa<ConstantSDNode>(N->getOperand(0))));
698701
}]>;
699702

700703
// Treat an 'or' node as an 'add' if the or'ed bits are known to be zero.

llvm/lib/Target/X86/X86InstrInfo.cpp

+14-12
Original file line numberDiff line numberDiff line change
@@ -5220,42 +5220,43 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
52205220
}
52215221

52225222
/// Check whether the use can be converted to remove a comparison against zero.
5223-
static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
5223+
/// Returns the EFLAGS condition and the index of the operand that we are comparing against zero.
5224+
static std::pair<X86::CondCode, unsigned> isUseDefConvertible(const MachineInstr &MI) {
52245225
switch (MI.getOpcode()) {
52255226
default:
5226-
return X86::COND_INVALID;
5227+
return std::make_pair(X86::COND_INVALID, ~0U);
52275228
CASE_ND(NEG8r)
52285229
CASE_ND(NEG16r)
52295230
CASE_ND(NEG32r)
52305231
CASE_ND(NEG64r)
5231-
return X86::COND_AE;
5232+
return std::make_pair(X86::COND_AE, 1U);
52325233
case X86::LZCNT16rr:
52335234
case X86::LZCNT32rr:
52345235
case X86::LZCNT64rr:
5235-
return X86::COND_B;
5236+
return std::make_pair(X86::COND_B, 1U);
52365237
case X86::POPCNT16rr:
52375238
case X86::POPCNT32rr:
52385239
case X86::POPCNT64rr:
5239-
return X86::COND_E;
5240+
return std::make_pair(X86::COND_E, 1U);
52405241
case X86::TZCNT16rr:
52415242
case X86::TZCNT32rr:
52425243
case X86::TZCNT64rr:
5243-
return X86::COND_B;
5244+
return std::make_pair(X86::COND_B, 1U);
52445245
case X86::BSF16rr:
52455246
case X86::BSF32rr:
52465247
case X86::BSF64rr:
52475248
case X86::BSR16rr:
52485249
case X86::BSR32rr:
52495250
case X86::BSR64rr:
5250-
return X86::COND_E;
5251+
return std::make_pair(X86::COND_E, 2U);
52515252
case X86::BLSI32rr:
52525253
case X86::BLSI64rr:
5253-
return X86::COND_AE;
5254+
return std::make_pair(X86::COND_AE, 1U);
52545255
case X86::BLSR32rr:
52555256
case X86::BLSR64rr:
52565257
case X86::BLSMSK32rr:
52575258
case X86::BLSMSK64rr:
5258-
return X86::COND_B;
5259+
return std::make_pair(X86::COND_B, 1U);
52595260
// TODO: TBM instructions.
52605261
}
52615262
}
@@ -5336,6 +5337,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
53365337
bool ClearsOverflowFlag = false;
53375338
bool ShouldUpdateCC = false;
53385339
bool IsSwapped = false;
5340+
unsigned OpNo = 0;
53395341
X86::CondCode NewCC = X86::COND_INVALID;
53405342
int64_t ImmDelta = 0;
53415343

@@ -5391,9 +5393,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
53915393
// ... // EFLAGS not changed
53925394
// testl %eax, %eax // <-- can be removed
53935395
if (IsCmpZero) {
5394-
NewCC = isUseDefConvertible(Inst);
5395-
if (NewCC != X86::COND_INVALID && Inst.getOperand(1).isReg() &&
5396-
Inst.getOperand(1).getReg() == SrcReg) {
5396+
std::tie(NewCC, OpNo) = isUseDefConvertible(Inst);
5397+
if (NewCC != X86::COND_INVALID && Inst.getOperand(OpNo).isReg() &&
5398+
Inst.getOperand(OpNo).getReg() == SrcReg) {
53975399
ShouldUpdateCC = true;
53985400
MI = &Inst;
53995401
break;

llvm/lib/Target/X86/X86InstrMisc.td

+25-25
Original file line numberDiff line numberDiff line change
@@ -247,55 +247,55 @@ def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
247247
} // Constraints = "$src = $dst", SchedRW
248248

249249
// Bit scan instructions.
250-
let Defs = [EFLAGS] in {
251-
def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
250+
let Defs = [EFLAGS], Constraints = "$fallback = $dst" in {
251+
def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$fallback, GR16:$src),
252252
"bsf{w}\t{$src, $dst|$dst, $src}",
253-
[(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>,
253+
[(set GR16:$dst, EFLAGS, (X86bsf GR16:$fallback, GR16:$src))]>,
254254
TB, OpSize16, Sched<[WriteBSF]>;
255-
def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
255+
def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins GR16:$fallback, i16mem:$src),
256256
"bsf{w}\t{$src, $dst|$dst, $src}",
257-
[(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>,
257+
[(set GR16:$dst, EFLAGS, (X86bsf GR16:$fallback, (loadi16 addr:$src)))]>,
258258
TB, OpSize16, Sched<[WriteBSFLd]>;
259-
def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
259+
def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$fallback, GR32:$src),
260260
"bsf{l}\t{$src, $dst|$dst, $src}",
261-
[(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>,
261+
[(set GR32:$dst, EFLAGS, (X86bsf GR32:$fallback, GR32:$src))]>,
262262
TB, OpSize32, Sched<[WriteBSF]>;
263-
def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
263+
def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins GR32:$fallback, i32mem:$src),
264264
"bsf{l}\t{$src, $dst|$dst, $src}",
265-
[(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>,
265+
[(set GR32:$dst, EFLAGS, (X86bsf GR32:$fallback, (loadi32 addr:$src)))]>,
266266
TB, OpSize32, Sched<[WriteBSFLd]>;
267-
def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
267+
def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$fallback, GR64:$src),
268268
"bsf{q}\t{$src, $dst|$dst, $src}",
269-
[(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>,
269+
[(set GR64:$dst, EFLAGS, (X86bsf GR64:$fallback, GR64:$src))]>,
270270
TB, Sched<[WriteBSF]>;
271-
def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
271+
def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins GR64:$fallback, i64mem:$src),
272272
"bsf{q}\t{$src, $dst|$dst, $src}",
273-
[(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>,
273+
[(set GR64:$dst, EFLAGS, (X86bsf GR64:$fallback, (loadi64 addr:$src)))]>,
274274
TB, Sched<[WriteBSFLd]>;
275275

276-
def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
276+
def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$fallback, GR16:$src),
277277
"bsr{w}\t{$src, $dst|$dst, $src}",
278-
[(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>,
278+
[(set GR16:$dst, EFLAGS, (X86bsr GR16:$fallback, GR16:$src))]>,
279279
TB, OpSize16, Sched<[WriteBSR]>;
280-
def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
280+
def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins GR16:$fallback, i16mem:$src),
281281
"bsr{w}\t{$src, $dst|$dst, $src}",
282-
[(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>,
282+
[(set GR16:$dst, EFLAGS, (X86bsr GR16:$fallback, (loadi16 addr:$src)))]>,
283283
TB, OpSize16, Sched<[WriteBSRLd]>;
284-
def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
284+
def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$fallback, GR32:$src),
285285
"bsr{l}\t{$src, $dst|$dst, $src}",
286-
[(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>,
286+
[(set GR32:$dst, EFLAGS, (X86bsr GR32:$fallback, GR32:$src))]>,
287287
TB, OpSize32, Sched<[WriteBSR]>;
288-
def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
288+
def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins GR32:$fallback, i32mem:$src),
289289
"bsr{l}\t{$src, $dst|$dst, $src}",
290-
[(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>,
290+
[(set GR32:$dst, EFLAGS, (X86bsr GR32:$fallback, (loadi32 addr:$src)))]>,
291291
TB, OpSize32, Sched<[WriteBSRLd]>;
292-
def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
292+
def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$fallback, GR64:$src),
293293
"bsr{q}\t{$src, $dst|$dst, $src}",
294-
[(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>,
294+
[(set GR64:$dst, EFLAGS, (X86bsr GR64:$fallback, GR64:$src))]>,
295295
TB, Sched<[WriteBSR]>;
296-
def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
296+
def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins GR64:$fallback, i64mem:$src),
297297
"bsr{q}\t{$src, $dst|$dst, $src}",
298-
[(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>,
298+
[(set GR64:$dst, EFLAGS, (X86bsr GR64:$fallback, (loadi64 addr:$src)))]>,
299299
TB, Sched<[WriteBSRLd]>;
300300
} // Defs = [EFLAGS], Constraints = "$fallback = $dst"
301301

llvm/lib/Target/X86/X86Subtarget.h

+5
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,11 @@ class X86Subtarget final : public X86GenSubtargetInfo {
263263
return hasBWI() && useAVX512Regs();
264264
}
265265

266+
// Returns true if the destination register of a BSF/BSR instruction is
267+
// not touched if the source register is zero.
268+
// NOTE: i32->i64 implicit zext isn't guaranteed by BSR/BSF pass through.
269+
bool hasBitScanPassThrough() const { return is64Bit(); }
270+
266271
bool isXRaySupported() const override { return is64Bit(); }
267272

268273
/// Use clflush if we have SSE2 or we're on x86-64 (even if we asked for

llvm/test/CodeGen/X86/bit_ceil.ll

+4-8
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,8 @@ define i32 @bit_ceil_i32(i32 %x) {
1010
; NOBMI: # %bb.0:
1111
; NOBMI-NEXT: # kill: def $edi killed $edi def $rdi
1212
; NOBMI-NEXT: leal -1(%rdi), %eax
13-
; NOBMI-NEXT: bsrl %eax, %eax
1413
; NOBMI-NEXT: movl $63, %ecx
15-
; NOBMI-NEXT: cmovnel %eax, %ecx
14+
; NOBMI-NEXT: bsrl %eax, %ecx
1615
; NOBMI-NEXT: xorl $31, %ecx
1716
; NOBMI-NEXT: negb %cl
1817
; NOBMI-NEXT: movl $1, %edx
@@ -47,9 +46,8 @@ define i32 @bit_ceil_i32(i32 %x) {
4746
define i32 @bit_ceil_i32_plus1(i32 noundef %x) {
4847
; NOBMI-LABEL: bit_ceil_i32_plus1:
4948
; NOBMI: # %bb.0: # %entry
50-
; NOBMI-NEXT: bsrl %edi, %eax
5149
; NOBMI-NEXT: movl $63, %ecx
52-
; NOBMI-NEXT: cmovnel %eax, %ecx
50+
; NOBMI-NEXT: bsrl %edi, %ecx
5351
; NOBMI-NEXT: xorl $31, %ecx
5452
; NOBMI-NEXT: negb %cl
5553
; NOBMI-NEXT: movl $1, %edx
@@ -86,9 +84,8 @@ define i64 @bit_ceil_i64(i64 %x) {
8684
; NOBMI-LABEL: bit_ceil_i64:
8785
; NOBMI: # %bb.0:
8886
; NOBMI-NEXT: leaq -1(%rdi), %rax
89-
; NOBMI-NEXT: bsrq %rax, %rax
9087
; NOBMI-NEXT: movl $127, %ecx
91-
; NOBMI-NEXT: cmovneq %rax, %rcx
88+
; NOBMI-NEXT: bsrq %rax, %rcx
9289
; NOBMI-NEXT: xorl $63, %ecx
9390
; NOBMI-NEXT: negb %cl
9491
; NOBMI-NEXT: movl $1, %edx
@@ -122,9 +119,8 @@ define i64 @bit_ceil_i64(i64 %x) {
122119
define i64 @bit_ceil_i64_plus1(i64 noundef %x) {
123120
; NOBMI-LABEL: bit_ceil_i64_plus1:
124121
; NOBMI: # %bb.0: # %entry
125-
; NOBMI-NEXT: bsrq %rdi, %rax
126122
; NOBMI-NEXT: movl $127, %ecx
127-
; NOBMI-NEXT: cmovneq %rax, %rcx
123+
; NOBMI-NEXT: bsrq %rdi, %rcx
128124
; NOBMI-NEXT: xorl $63, %ecx
129125
; NOBMI-NEXT: negb %cl
130126
; NOBMI-NEXT: movl $1, %edx

0 commit comments

Comments
 (0)