
[X86] Handle BSF/BSR "zero-input pass through" behaviour #123623
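On x86-64, BSF and BSR with a zero source leave the destination register unchanged: AMD documents this pass-through explicitly, and Intel hardware matches it in practice even though Intel's manuals call the result undefined. This patch models that behaviour by giving the X86ISD::BSF/BSR nodes an extra pass-through operand tied to the destination register, so the zero-input fallback can be preloaded into the destination and the usual CMOV guard dropped.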


Merged: 6 commits, Jan 23, 2025
64 changes: 51 additions & 13 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -436,7 +436,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
if (Subtarget.is64Bit()) {
setOperationPromotedToType(ISD::CTTZ , MVT::i32, MVT::i64);
setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
}
@@ -3386,15 +3385,19 @@ bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
}

bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
// Speculate cttz only if we can directly use TZCNT or can promote to i32/i64.
// Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
// i32/i64 or can rely on BSF passthrough value.
return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
Subtarget.hasBitScanPassThrough() ||
(!Ty->isVectorTy() &&
Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
}

bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
// Speculate ctlz only if we can directly use LZCNT.
return Subtarget.hasLZCNT() || Subtarget.canUseCMOV();
// Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
// passthrough value.
return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
Subtarget.hasBitScanPassThrough();
}

bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
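Taken together, these two hooks make guarded bit counts cheap to speculate on any 64-bit x86 target, even without TZCNT/LZCNT or CMOV. A minimal sketch of the source patterns this unlocks (illustrative only, not part of the patch):

    // With bit-scan pass-through, both selects lower branchlessly: the
    // zero-input answer is preloaded into the destination register, and
    // bsf/bsr simply leaves it in place when the input is zero.
    unsigned cttz32(unsigned x) { return x ? __builtin_ctz(x) : 32u; }
    unsigned ctlz32(unsigned x) { return x ? __builtin_clz(x) : 32u; }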
@@ -28694,11 +28697,18 @@ static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
}

// Check if we can safely pass a result through BSR for zero sources.
SDValue PassThru = DAG.getUNDEF(OpVT);
if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
!DAG.isKnownNeverZero(Op))
PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);

// Issue a bsr (scan bits in reverse) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);

if (Opc == ISD::CTLZ) {
// Skip CMOV if we're using a pass through value.
if (Opc == ISD::CTLZ && PassThru.isUndef()) {
// If src is zero (i.e. bsr sets ZF), returns NumBits.
SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
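A note on the constant chosen above: the tail of LowerCTLZ (hidden in this hunk) converts the bit index into a leading-zero count by XORing with NumBits - 1, so for a zero source the pass-through value V must satisfy (NumBits - 1) ^ V == NumBits. With NumBits a power of two that gives V = NumBits + NumBits - 1: for i32, 31 ^ 63 == 32, and for i64, 63 ^ 127 == 64.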
@@ -28721,16 +28731,22 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
unsigned NumBits = VT.getScalarSizeInBits();
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
bool NonZeroSrc = DAG.isKnownNeverZero(N0);

assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
"Only scalar CTTZ requires custom lowering");

// Check if we can safely pass a result through BSF for zero sources.
SDValue PassThru = DAG.getUNDEF(VT);
if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
PassThru = DAG.getConstant(NumBits, dl, VT);

// Issue a bsf (scan bits forward) which also sets EFLAGS.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);

// If src is known never zero we can skip the CMOV.
if (DAG.isKnownNeverZero(N0))
// Skip CMOV if src is never zero or we're using a pass through value.
if (NonZeroSrc || !PassThru.isUndef())
return Op;

// If src is zero (i.e. bsf sets ZF), returns NumBits.
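For a possibly-zero i32 source the lowering now collapses to two instructions; a sketch of the expected codegen (illustrative assembly, 64-bit target assumed):

    movl $32, %eax      # preload the fallback: cttz(0) == 32
    bsfl %edi, %eax     # pass-through keeps %eax == 32 when %edi == 0

Previously the same input needed the bsf plus a mov of the fallback and a CMOV guarded by ZF.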
@@ -38193,12 +38209,34 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known = KnownBits::mul(Known, Known2);
break;
}
case X86ISD::BSR:
// BSR(0) is undef, but any use of BSR already accounts for non-zero inputs.
// Similar KnownBits behaviour to CTLZ_ZERO_UNDEF.
case X86ISD::BSF: {
Known.Zero.setBitsFrom(Log2_32(BitWidth));

KnownBits Known2;
Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
if (Known2.isNonZero()) {
// If we have a known 1, its position is our upper bound.
unsigned PossibleTZ = Known2.countMaxTrailingZeros();
unsigned LowBits = llvm::bit_width(PossibleTZ);
Known.Zero.setBitsFrom(LowBits);
} else if (!Op.getOperand(0).isUndef()) {
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known = Known.intersectWith(Known2);
}
break;
}
case X86ISD::BSR: {
// TODO: Bound with input known bits?
Known.Zero.setBitsFrom(Log2_32(BitWidth));

if (!Op.getOperand(0).isUndef() &&
!DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
KnownBits Known2;
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known = Known.intersectWith(Known2);
}
break;
}
case X86ISD::SETCC:
Known.Zero.setBitsFrom(1);
break;
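A worked instance of the new BSF bound above: for an i32 scan, bits from Log2_32(32) == 5 upward are always known zero. If the scanned operand is also known non-zero with, say, bit 3 known set, the lowest set bit sits at index 3 or below, so countMaxTrailingZeros() returns 3, bit_width(3) == 2, and every result bit from position 2 upward is known zero. The BSR case keeps only the width bound for now, per the TODO.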
@@ -54243,7 +54281,7 @@ static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
}

SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
if (VT == MVT::i8)
Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);

12 changes: 6 additions & 6 deletions llvm/lib/Target/X86/X86InstrCompiler.td
@@ -2213,12 +2213,12 @@ def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
(IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;

// Bit scan instruction patterns to match explicit zero-undef behavior.
def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr (i16 (IMPLICIT_DEF)), GR16:$src)>;
def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr (i32 (IMPLICIT_DEF)), GR32:$src)>;
def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr (i64 (IMPLICIT_DEF)), GR64:$src)>;
def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm (i16 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm (i32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm (i64 (IMPLICIT_DEF)), addr:$src)>;

// When HasMOVBE is enabled it is possible to get a non-legalized
// register-register 16 bit bswap. This maps it to a ROL instruction.
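The cttz_zero_undef patterns above feed IMPLICIT_DEF as the fallback on purpose: the node is undefined for a zero source, so nothing meaningful has to survive the scan and the tied pass-through operand can start out undefined.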
13 changes: 8 additions & 5 deletions llvm/lib/Target/X86/X86InstrFragments.td
@@ -134,8 +134,8 @@ def SDTX86Cmpccxadd : SDTypeProfile<1, 4, [SDTCisSameAs<0, 2>,
def X86MFence : SDNode<"X86ISD::MFENCE", SDTNone, [SDNPHasChain]>;


def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>;
def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>;
def X86bsf : SDNode<"X86ISD::BSF", SDTBinaryArithWithFlags>;
def X86bsr : SDNode<"X86ISD::BSR", SDTBinaryArithWithFlags>;
def X86fshl : SDNode<"X86ISD::FSHL", SDTIntShiftDOp>;
def X86fshr : SDNode<"X86ISD::FSHR", SDTIntShiftDOp>;

@@ -685,16 +685,19 @@ def anyext_sdiv : PatFrag<(ops node:$lhs), (anyext node:$lhs),[{
// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
// be copying from a truncate. AssertSext/AssertZext/AssertAlign aren't saying
// anything about the upper 32 bits, they're probably just qualifying a
// CopyFromReg. FREEZE may be coming from a a truncate. Any other 32-bit
// operation will zero-extend up to 64 bits.
// CopyFromReg. FREEZE may be coming from a truncate. BitScan fall through
// values may not zero the upper bits correctly.
// Any other 32-bit operation will zero-extend up to 64 bits.
def def32 : PatLeaf<(i32 GR32:$src), [{
return N->getOpcode() != ISD::TRUNCATE &&
N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
N->getOpcode() != ISD::CopyFromReg &&
N->getOpcode() != ISD::AssertSext &&
N->getOpcode() != ISD::AssertZext &&
N->getOpcode() != ISD::AssertAlign &&
N->getOpcode() != ISD::FREEZE;
N->getOpcode() != ISD::FREEZE &&
!((N->getOpcode() == X86ISD::BSF || N->getOpcode() == X86ISD::BSR) &&
(!N->getOperand(0).isUndef() && !isa<ConstantSDNode>(N->getOperand(0))));
}]>;

// Treat an 'or' node is as an 'add' if the or'ed bits are known to be zero.
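The new def32 exclusion is needed because a pass-through can defeat the usual implicit zero-extension: if the fallback register has upper bits set and the source is zero, BSF/BSR leave the whole 64-bit destination untouched, so the nominally 32-bit result does not arrive zero-extended. An undef or constant fallback stays eligible, presumably because a constant is materialized with a zero-extending 32-bit move and an undef offers no guarantee to rely on in the first place.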
26 changes: 14 additions & 12 deletions llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -5220,42 +5220,43 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
}

/// Check whether the use can be converted to remove a comparison against zero.
Review comment (Contributor): Could we have a comment for the returned pair?

static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
/// Returns the EFLAGS condition and the operand that we are comparing against zero.
static std::pair<X86::CondCode, unsigned> isUseDefConvertible(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
return X86::COND_INVALID;
return std::make_pair(X86::COND_INVALID, ~0U);
CASE_ND(NEG8r)
CASE_ND(NEG16r)
CASE_ND(NEG32r)
CASE_ND(NEG64r)
return X86::COND_AE;
return std::make_pair(X86::COND_AE, 1U);
case X86::LZCNT16rr:
case X86::LZCNT32rr:
case X86::LZCNT64rr:
return X86::COND_B;
return std::make_pair(X86::COND_B, 1U);
case X86::POPCNT16rr:
case X86::POPCNT32rr:
case X86::POPCNT64rr:
return X86::COND_E;
return std::make_pair(X86::COND_E, 1U);
case X86::TZCNT16rr:
case X86::TZCNT32rr:
case X86::TZCNT64rr:
return X86::COND_B;
return std::make_pair(X86::COND_B, 1U);
case X86::BSF16rr:
case X86::BSF32rr:
case X86::BSF64rr:
case X86::BSR16rr:
case X86::BSR32rr:
case X86::BSR64rr:
return X86::COND_E;
return std::make_pair(X86::COND_E, 2U);
case X86::BLSI32rr:
case X86::BLSI64rr:
return X86::COND_AE;
return std::make_pair(X86::COND_AE, 1U);
case X86::BLSR32rr:
case X86::BLSR64rr:
case X86::BLSMSK32rr:
case X86::BLSMSK64rr:
return X86::COND_B;
return std::make_pair(X86::COND_B, 1U);
// TODO: TBM instructions.
}
}
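As an example of the returned pair: after this patch a BSF32rr machine instruction has operands (dst, fallback, src) with operand 1 tied to the destination, so the register actually compared against zero is operand 2, while the single-input instructions (NEG, LZCNT, POPCNT, TZCNT, BLS*) keep returning operand index 1.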
@@ -5336,6 +5337,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
bool ClearsOverflowFlag = false;
bool ShouldUpdateCC = false;
bool IsSwapped = false;
unsigned OpNo = 0;
X86::CondCode NewCC = X86::COND_INVALID;
int64_t ImmDelta = 0;

@@ -5391,9 +5393,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// ... // EFLAGS not changed
// testl %eax, %eax // <-- can be removed
if (IsCmpZero) {
NewCC = isUseDefConvertible(Inst);
if (NewCC != X86::COND_INVALID && Inst.getOperand(1).isReg() &&
Inst.getOperand(1).getReg() == SrcReg) {
std::tie(NewCC, OpNo) = isUseDefConvertible(Inst);
if (NewCC != X86::COND_INVALID && Inst.getOperand(OpNo).isReg() &&
Inst.getOperand(OpNo).getReg() == SrcReg) {
Review comment on lines +5396 to +5398 (Contributor): Why not put PassThru in the last operand? Then we wouldn't need to change this.

Reply (Collaborator, Author): This is at the MI level, and the passthrough/fallback operand is tied to the destination reg (like any x86 binop) - it'd be weird to have a commutation and I expect it could lead to further problems.

ShouldUpdateCC = true;
MI = &Inst;
break;
50 changes: 25 additions & 25 deletions llvm/lib/Target/X86/X86InstrMisc.td
@@ -247,55 +247,55 @@ def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
} // Constraints = "$src = $dst", SchedRW

// Bit scan instructions.
let Defs = [EFLAGS] in {
def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
let Defs = [EFLAGS], Constraints = "$fallback = $dst" in {
def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$fallback, GR16:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>,
[(set GR16:$dst, EFLAGS, (X86bsf GR16:$fallback, GR16:$src))]>,
TB, OpSize16, Sched<[WriteBSF]>;
def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins GR16:$fallback, i16mem:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>,
[(set GR16:$dst, EFLAGS, (X86bsf GR16:$fallback, (loadi16 addr:$src)))]>,
TB, OpSize16, Sched<[WriteBSFLd]>;
def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$fallback, GR32:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>,
[(set GR32:$dst, EFLAGS, (X86bsf GR32:$fallback, GR32:$src))]>,
TB, OpSize32, Sched<[WriteBSF]>;
def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins GR32:$fallback, i32mem:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>,
[(set GR32:$dst, EFLAGS, (X86bsf GR32:$fallback, (loadi32 addr:$src)))]>,
TB, OpSize32, Sched<[WriteBSFLd]>;
def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$fallback, GR64:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>,
[(set GR64:$dst, EFLAGS, (X86bsf GR64:$fallback, GR64:$src))]>,
TB, Sched<[WriteBSF]>;
def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins GR64:$fallback, i64mem:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>,
[(set GR64:$dst, EFLAGS, (X86bsf GR64:$fallback, (loadi64 addr:$src)))]>,
TB, Sched<[WriteBSFLd]>;

def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$fallback, GR16:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>,
[(set GR16:$dst, EFLAGS, (X86bsr GR16:$fallback, GR16:$src))]>,
TB, OpSize16, Sched<[WriteBSR]>;
def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins GR16:$fallback, i16mem:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>,
[(set GR16:$dst, EFLAGS, (X86bsr GR16:$fallback, (loadi16 addr:$src)))]>,
TB, OpSize16, Sched<[WriteBSRLd]>;
def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$fallback, GR32:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>,
[(set GR32:$dst, EFLAGS, (X86bsr GR32:$fallback, GR32:$src))]>,
TB, OpSize32, Sched<[WriteBSR]>;
def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins GR32:$fallback, i32mem:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>,
[(set GR32:$dst, EFLAGS, (X86bsr GR32:$fallback, (loadi32 addr:$src)))]>,
TB, OpSize32, Sched<[WriteBSRLd]>;
def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$fallback, GR64:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>,
[(set GR64:$dst, EFLAGS, (X86bsr GR64:$fallback, GR64:$src))]>,
TB, Sched<[WriteBSR]>;
def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins GR64:$fallback, i64mem:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>,
[(set GR64:$dst, EFLAGS, (X86bsr GR64:$fallback, (loadi64 addr:$src)))]>,
TB, Sched<[WriteBSRLd]>;
} // Defs = [EFLAGS]

5 changes: 5 additions & 0 deletions llvm/lib/Target/X86/X86Subtarget.h
@@ -263,6 +263,11 @@ class X86Subtarget final : public X86GenSubtargetInfo {
return hasBWI() && useAVX512Regs();
}

// Returns true if the destination register of a BSF/BSR instruction is
// not touched if the source register is zero.
// NOTE: i32->i64 implicit zext isn't guaranteed by BSR/BSF pass through.
bool hasBitScanPassThrough() const { return is64Bit(); }

bool isXRaySupported() const override { return is64Bit(); }

/// Use clflush if we have SSE2 or we're on x86-64 (even if we asked for
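A quick demonstration of the property the hook exposes (illustrative assembly):

    movl $32, %eax      # fallback value
    bsfl %ecx, %eax     # %eax still holds 32 afterwards if %ecx == 0

AMD documents the destination as unchanged when the source is zero; Intel's manuals label it undefined, but shipping 64-bit hardware behaves the same way, which is presumably why the hook is keyed on is64Bit() rather than on a CPU feature bit.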
12 changes: 4 additions & 8 deletions llvm/test/CodeGen/X86/bit_ceil.ll
@@ -10,9 +10,8 @@ define i32 @bit_ceil_i32(i32 %x) {
; NOBMI: # %bb.0:
; NOBMI-NEXT: # kill: def $edi killed $edi def $rdi
; NOBMI-NEXT: leal -1(%rdi), %eax
; NOBMI-NEXT: bsrl %eax, %eax
; NOBMI-NEXT: movl $63, %ecx
; NOBMI-NEXT: cmovnel %eax, %ecx
; NOBMI-NEXT: bsrl %eax, %ecx
; NOBMI-NEXT: xorl $31, %ecx
; NOBMI-NEXT: negb %cl
; NOBMI-NEXT: movl $1, %edx
@@ -47,9 +46,8 @@ define i32 @bit_ceil_i32(i32 %x) {
define i32 @bit_ceil_i32_plus1(i32 noundef %x) {
; NOBMI-LABEL: bit_ceil_i32_plus1:
; NOBMI: # %bb.0: # %entry
; NOBMI-NEXT: bsrl %edi, %eax
; NOBMI-NEXT: movl $63, %ecx
; NOBMI-NEXT: cmovnel %eax, %ecx
; NOBMI-NEXT: bsrl %edi, %ecx
; NOBMI-NEXT: xorl $31, %ecx
; NOBMI-NEXT: negb %cl
; NOBMI-NEXT: movl $1, %edx
@@ -86,9 +84,8 @@ define i64 @bit_ceil_i64(i64 %x) {
; NOBMI-LABEL: bit_ceil_i64:
; NOBMI: # %bb.0:
; NOBMI-NEXT: leaq -1(%rdi), %rax
; NOBMI-NEXT: bsrq %rax, %rax
; NOBMI-NEXT: movl $127, %ecx
; NOBMI-NEXT: cmovneq %rax, %rcx
; NOBMI-NEXT: bsrq %rax, %rcx
; NOBMI-NEXT: xorl $63, %ecx
; NOBMI-NEXT: negb %cl
; NOBMI-NEXT: movl $1, %edx
@@ -122,9 +119,8 @@ define i64 @bit_ceil_i64(i64 %x) {
define i64 @bit_ceil_i64_plus1(i64 noundef %x) {
; NOBMI-LABEL: bit_ceil_i64_plus1:
; NOBMI: # %bb.0: # %entry
; NOBMI-NEXT: bsrq %rdi, %rax
; NOBMI-NEXT: movl $127, %ecx
; NOBMI-NEXT: cmovneq %rax, %rcx
; NOBMI-NEXT: bsrq %rdi, %rcx
; NOBMI-NEXT: xorl $63, %ecx
; NOBMI-NEXT: negb %cl
; NOBMI-NEXT: movl $1, %edx
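All four test deltas follow the same shape: the old sequence scanned into a scratch register, loaded the zero-input answer (63 for i32, 127 for i64) into the result register, and picked between the two with cmovne; the new sequence preloads that fallback into the destination and lets bsr pass it through, saving the CMOV.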