diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 64c1b5df6d582..27d6bc158b3c3 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -63,6 +63,12 @@ Changes to the LLVM IR * ``llvm.nvvm.bitcast.d2ll`` * ``llvm.nvvm.bitcast.ll2d`` +* Remove the following intrinsics which can be replaced with a funnel-shift: + + * ``llvm.nvvm.rotate.b32`` + * ``llvm.nvvm.rotate.right.b64`` + * ``llvm.nvvm.rotate.b64`` + Changes to LLVM infrastructure ------------------------------ diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 737dd6092e218..aa5294f5f9c90 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -4479,22 +4479,6 @@ def int_nvvm_sust_p_3d_v4i32_trap "llvm.nvvm.sust.p.3d.v4i32.trap">, ClangBuiltin<"__nvvm_sust_p_3d_v4i32_trap">; - -def int_nvvm_rotate_b32 - : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.b32">, - ClangBuiltin<"__nvvm_rotate_b32">; - -def int_nvvm_rotate_b64 - : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.b64">, - ClangBuiltin<"__nvvm_rotate_b64">; - -def int_nvvm_rotate_right_b64 - : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.right.b64">, - ClangBuiltin<"__nvvm_rotate_right_b64">; - def int_nvvm_swap_lo_hi_b64 : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable], "llvm.nvvm.swap.lo.hi.b64">, diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 02d1d9d9f7898..3390d651d6c69 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1272,6 +1272,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, // nvvm.bitcast.{f2i,i2f,ll2d,d2ll} Expand = Name == "f2i" || Name == "i2f" || Name == "ll2d" || Name == "d2ll"; + else if (Name.consume_front("rotate.")) + // nvvm.rotate.{b32,b64,right.b64} + Expand = Name == "b32" || Name == "b64" || Name == "right.b64"; else Expand = false; @@ -2258,6 +2261,108 @@ void llvm::UpgradeInlineAsmString(std::string *AsmStr) { } } +static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, + Function *F, IRBuilder<> &Builder) { + Value *Rep = nullptr; + + if (Name == "abs.i" || Name == "abs.ll") { + Value *Arg = CI->getArgOperand(0); + Value *Neg = Builder.CreateNeg(Arg, "neg"); + Value *Cmp = Builder.CreateICmpSGE( + Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond"); + Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs"); + } else if (Name.starts_with("atomic.load.add.f32.p") || + Name.starts_with("atomic.load.add.f64.p")) { + Value *Ptr = CI->getArgOperand(0); + Value *Val = CI->getArgOperand(1); + Rep = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(), + AtomicOrdering::SequentiallyConsistent); + } else if (Name.consume_front("max.") && + (Name == "s" || Name == "i" || Name == "ll" || Name == "us" || + Name == "ui" || Name == "ull")) { + Value *Arg0 = CI->getArgOperand(0); + Value *Arg1 = CI->getArgOperand(1); + Value *Cmp = Name.starts_with("u") + ? 
Builder.CreateICmpUGE(Arg0, Arg1, "max.cond") + : Builder.CreateICmpSGE(Arg0, Arg1, "max.cond"); + Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "max"); + } else if (Name.consume_front("min.") && + (Name == "s" || Name == "i" || Name == "ll" || Name == "us" || + Name == "ui" || Name == "ull")) { + Value *Arg0 = CI->getArgOperand(0); + Value *Arg1 = CI->getArgOperand(1); + Value *Cmp = Name.starts_with("u") + ? Builder.CreateICmpULE(Arg0, Arg1, "min.cond") + : Builder.CreateICmpSLE(Arg0, Arg1, "min.cond"); + Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min"); + } else if (Name == "clz.ll") { + // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64. + Value *Arg = CI->getArgOperand(0); + Value *Ctlz = Builder.CreateCall( + Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, + {Arg->getType()}), + {Arg, Builder.getFalse()}, "ctlz"); + Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc"); + } else if (Name == "popc.ll") { + // llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an + // i64. + Value *Arg = CI->getArgOperand(0); + Value *Popc = Builder.CreateCall( + Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop, + {Arg->getType()}), + Arg, "ctpop"); + Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc"); + } else if (Name == "h2f") { + Rep = Builder.CreateCall( + Intrinsic::getDeclaration(F->getParent(), Intrinsic::convert_from_fp16, + {Builder.getFloatTy()}), + CI->getArgOperand(0), "h2f"); + } else if (Name.consume_front("bitcast.") && + (Name == "f2i" || Name == "i2f" || Name == "ll2d" || + Name == "d2ll")) { + Rep = Builder.CreateBitCast(CI->getArgOperand(0), CI->getType()); + } else if (Name == "rotate.b32") { + Value *Arg = CI->getOperand(0); + Value *ShiftAmt = CI->getOperand(1); + Rep = Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::fshl, + {Arg, Arg, ShiftAmt}); + } else if (Name == "rotate.b64") { + Type *Int64Ty = Builder.getInt64Ty(); + Value *Arg = CI->getOperand(0); + Value *ZExtShiftAmt = Builder.CreateZExt(CI->getOperand(1), Int64Ty); + Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshl, + {Arg, Arg, ZExtShiftAmt}); + } else if (Name == "rotate.right.b64") { + Type *Int64Ty = Builder.getInt64Ty(); + Value *Arg = CI->getOperand(0); + Value *ZExtShiftAmt = Builder.CreateZExt(CI->getOperand(1), Int64Ty); + Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshr, + {Arg, Arg, ZExtShiftAmt}); + } else { + Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name); + if (IID != Intrinsic::not_intrinsic && + !F->getReturnType()->getScalarType()->isBFloatTy()) { + rename(F); + Function *NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + SmallVector Args; + for (size_t I = 0; I < NewFn->arg_size(); ++I) { + Value *Arg = CI->getArgOperand(I); + Type *OldType = Arg->getType(); + Type *NewType = NewFn->getArg(I)->getType(); + Args.push_back( + (OldType->isIntegerTy() && NewType->getScalarType()->isBFloatTy()) + ? 
Builder.CreateBitCast(Arg, NewType) + : Arg); + } + Rep = Builder.CreateCall(NewFn, Args); + if (F->getReturnType()->isIntegerTy()) + Rep = Builder.CreateBitCast(Rep, F->getReturnType()); + } + } + + return Rep; +} + static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IRBuilder<> &Builder) { LLVMContext &C = F->getContext(); @@ -4208,85 +4313,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { if (!IsX86 && Name == "stackprotectorcheck") { Rep = nullptr; - } else if (IsNVVM && (Name == "abs.i" || Name == "abs.ll")) { - Value *Arg = CI->getArgOperand(0); - Value *Neg = Builder.CreateNeg(Arg, "neg"); - Value *Cmp = Builder.CreateICmpSGE( - Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond"); - Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs"); - } else if (IsNVVM && (Name.starts_with("atomic.load.add.f32.p") || - Name.starts_with("atomic.load.add.f64.p"))) { - Value *Ptr = CI->getArgOperand(0); - Value *Val = CI->getArgOperand(1); - Rep = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(), - AtomicOrdering::SequentiallyConsistent); - } else if (IsNVVM && Name.consume_front("max.") && - (Name == "s" || Name == "i" || Name == "ll" || Name == "us" || - Name == "ui" || Name == "ull")) { - Value *Arg0 = CI->getArgOperand(0); - Value *Arg1 = CI->getArgOperand(1); - Value *Cmp = Name.starts_with("u") - ? Builder.CreateICmpUGE(Arg0, Arg1, "max.cond") - : Builder.CreateICmpSGE(Arg0, Arg1, "max.cond"); - Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "max"); - } else if (IsNVVM && Name.consume_front("min.") && - (Name == "s" || Name == "i" || Name == "ll" || Name == "us" || - Name == "ui" || Name == "ull")) { - Value *Arg0 = CI->getArgOperand(0); - Value *Arg1 = CI->getArgOperand(1); - Value *Cmp = Name.starts_with("u") - ? Builder.CreateICmpULE(Arg0, Arg1, "min.cond") - : Builder.CreateICmpSLE(Arg0, Arg1, "min.cond"); - Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min"); - } else if (IsNVVM && Name == "clz.ll") { - // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64. - Value *Arg = CI->getArgOperand(0); - Value *Ctlz = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, - {Arg->getType()}), - {Arg, Builder.getFalse()}, "ctlz"); - Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc"); - } else if (IsNVVM && Name == "popc.ll") { - // llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an - // i64. 
- Value *Arg = CI->getArgOperand(0); - Value *Popc = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop, - {Arg->getType()}), - Arg, "ctpop"); - Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc"); } else if (IsNVVM) { - if (Name == "h2f") { - Rep = - Builder.CreateCall(Intrinsic::getDeclaration( - F->getParent(), Intrinsic::convert_from_fp16, - {Builder.getFloatTy()}), - CI->getArgOperand(0), "h2f"); - } else if (Name.consume_front("bitcast.") && - (Name == "f2i" || Name == "i2f" || Name == "ll2d" || - Name == "d2ll")) { - Rep = Builder.CreateBitCast(CI->getArgOperand(0), CI->getType()); - } else { - Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name); - if (IID != Intrinsic::not_intrinsic && - !F->getReturnType()->getScalarType()->isBFloatTy()) { - rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); - SmallVector Args; - for (size_t I = 0; I < NewFn->arg_size(); ++I) { - Value *Arg = CI->getArgOperand(I); - Type *OldType = Arg->getType(); - Type *NewType = NewFn->getArg(I)->getType(); - Args.push_back((OldType->isIntegerTy() && - NewType->getScalarType()->isBFloatTy()) - ? Builder.CreateBitCast(Arg, NewType) - : Arg); - } - Rep = Builder.CreateCall(NewFn, Args); - if (F->getReturnType()->isIntegerTy()) - Rep = Builder.CreateBitCast(Rep, F->getReturnType()); - } - } + Rep = upgradeNVVMIntrinsicCall(Name, CI, F, Builder); } else if (IsX86) { Rep = upgradeX86IntrinsicCall(Name, CI, F, Builder); } else if (IsARM) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 2688834221091..8812136733fb2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -594,20 +594,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); - // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs - // that don't have h/w rotation we lower them to multi-instruction assembly. - // See ROT*_sw in NVPTXIntrInfo.td - setOperationAction(ISD::ROTL, MVT::i64, Legal); - setOperationAction(ISD::ROTR, MVT::i64, Legal); - setOperationAction(ISD::ROTL, MVT::i32, Legal); - setOperationAction(ISD::ROTR, MVT::i32, Legal); - - setOperationAction(ISD::ROTL, MVT::i16, Expand); - setOperationAction(ISD::ROTL, MVT::v2i16, Expand); - setOperationAction(ISD::ROTR, MVT::i16, Expand); - setOperationAction(ISD::ROTR, MVT::v2i16, Expand); - setOperationAction(ISD::ROTL, MVT::i8, Expand); - setOperationAction(ISD::ROTR, MVT::i8, Expand); + setOperationAction({ISD::ROTL, ISD::ROTR}, + {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64}, + Expand); + + if (STI.hasHWROT32()) + setOperationAction({ISD::FSHL, ISD::FSHR}, MVT::i32, Legal); + setOperationAction(ISD::BSWAP, MVT::i16, Expand); setOperationAction(ISD::BR_JT, MVT::Other, Custom); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 510e4b8100311..f6bbf4c2ffc02 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1665,167 +1665,6 @@ def BREV64 : "brev.b64 \t$dst, $a;", [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>; -// -// Rotate: Use ptx shf instruction if available. 
-// - -// 32 bit r2 = rotl r1, n -// => -// r2 = shf.l r1, r1, n -def ROTL32imm_hw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), - "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 imm:$amt)))]>, - Requires<[hasHWROT32]>; - -def ROTL32reg_hw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), - "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, - Requires<[hasHWROT32]>; - -// 32 bit r2 = rotr r1, n -// => -// r2 = shf.r r1, r1, n -def ROTR32imm_hw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt), - "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 imm:$amt)))]>, - Requires<[hasHWROT32]>; - -def ROTR32reg_hw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), - "shf.r.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, - Requires<[hasHWROT32]>; - -// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1. -def ROT32imm_sw : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2), - "{{\n\t" - ".reg .b32 %lhs;\n\t" - ".reg .b32 %rhs;\n\t" - "shl.b32 \t%lhs, $src, $amt1;\n\t" - "shr.b32 \t%rhs, $src, $amt2;\n\t" - "add.u32 \t$dst, %lhs, %rhs;\n\t" - "}}", - []>; - -def SUB_FRM_32 : SDNodeXFormgetTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32); -}]>; - -def : Pat<(rotl (i32 Int32Regs:$src), (i32 imm:$amt)), - (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, - Requires<[noHWROT32]>; -def : Pat<(rotr (i32 Int32Regs:$src), (i32 imm:$amt)), - (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>, - Requires<[noHWROT32]>; - -// 32-bit software rotate left by register. -def ROTL32reg_sw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), - "{{\n\t" - ".reg .b32 %lhs;\n\t" - ".reg .b32 %rhs;\n\t" - ".reg .b32 %amt2;\n\t" - "shl.b32 \t%lhs, $src, $amt;\n\t" - "sub.s32 \t%amt2, 32, $amt;\n\t" - "shr.b32 \t%rhs, $src, %amt2;\n\t" - "add.u32 \t$dst, %lhs, %rhs;\n\t" - "}}", - [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, - Requires<[noHWROT32]>; - -// 32-bit software rotate right by register. -def ROTR32reg_sw : - NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt), - "{{\n\t" - ".reg .b32 %lhs;\n\t" - ".reg .b32 %rhs;\n\t" - ".reg .b32 %amt2;\n\t" - "shr.b32 \t%lhs, $src, $amt;\n\t" - "sub.s32 \t%amt2, 32, $amt;\n\t" - "shl.b32 \t%rhs, $src, %amt2;\n\t" - "add.u32 \t$dst, %lhs, %rhs;\n\t" - "}}", - [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>, - Requires<[noHWROT32]>; - -// 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1. -def ROT64imm_sw : - NVPTXInst<(outs Int64Regs:$dst), - (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2), - "{{\n\t" - ".reg .b64 %lhs;\n\t" - ".reg .b64 %rhs;\n\t" - "shl.b64 \t%lhs, $src, $amt1;\n\t" - "shr.b64 \t%rhs, $src, $amt2;\n\t" - "add.u64 \t$dst, %lhs, %rhs;\n\t" - "}}", - []>; - -def SUB_FRM_64 : SDNodeXFormgetTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32); -}]>; - -def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)), - (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>; -def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)), - (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>; - -// 64-bit software rotate left by register. 
-def ROTL64reg_sw : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt), - "{{\n\t" - ".reg .b64 %lhs;\n\t" - ".reg .b64 %rhs;\n\t" - ".reg .u32 %amt2;\n\t" - "and.b32 \t%amt2, $amt, 63;\n\t" - "shl.b64 \t%lhs, $src, %amt2;\n\t" - "sub.u32 \t%amt2, 64, %amt2;\n\t" - "shr.b64 \t%rhs, $src, %amt2;\n\t" - "add.u64 \t$dst, %lhs, %rhs;\n\t" - "}}", - [(set Int64Regs:$dst, (rotl Int64Regs:$src, (i32 Int32Regs:$amt)))]>; - -def ROTR64reg_sw : - NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt), - "{{\n\t" - ".reg .b64 %lhs;\n\t" - ".reg .b64 %rhs;\n\t" - ".reg .u32 %amt2;\n\t" - "and.b32 \t%amt2, $amt, 63;\n\t" - "shr.b64 \t%lhs, $src, %amt2;\n\t" - "sub.u32 \t%amt2, 64, %amt2;\n\t" - "shl.b64 \t%rhs, $src, %amt2;\n\t" - "add.u64 \t$dst, %lhs, %rhs;\n\t" - "}}", - [(set Int64Regs:$dst, (rotr Int64Regs:$src, (i32 Int32Regs:$amt)))]>; - -// -// Funnnel shift in clamp mode -// - -// Create SDNodes so they can be used in the DAG code, e.g. -// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) -def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>; -def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>; - -def FUNSHFLCLAMP : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), - "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;", - [(set Int32Regs:$dst, - (FUN_SHFL_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>; - -def FUNSHFRCLAMP : - NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), - "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;", - [(set Int32Regs:$dst, - (FUN_SHFR_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>; // // BFE - bit-field extract @@ -3657,6 +3496,42 @@ def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))), def: Pat<(v2i16 (scalar_to_vector (i16 Int16Regs:$a))), (CVT_u32_u16 Int16Regs:$a, CvtNONE)>; +// +// Funnel-Shift +// + +// Create SDNodes so they can be used in the DAG code, e.g. +// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts) +def fshl_clamp : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>; +def fshr_clamp : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>; + +// Funnel shift, requires >= sm_32. Does not trap if amt is out of range, so +// no side effects. +let hasSideEffects = false in { + multiclass ShfInst { + def _i + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), + "shf." # mode # ".b32 \t$dst, $lo, $hi, $amt;", + [(set Int32Regs:$dst, + (op (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 imm:$amt)))]>, + Requires<[hasHWROT32]>; + + def _r + : NVPTXInst<(outs Int32Regs:$dst), + (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), + "shf." 
# mode # ".b32 \t$dst, $lo, $hi, $amt;", + [(set Int32Regs:$dst, + (op (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>, + Requires<[hasHWROT32]>; + } + + defm SHF_L_CLAMP : ShfInst<"l.clamp", fshl_clamp>; + defm SHF_R_CLAMP : ShfInst<"r.clamp", fshr_clamp>; + defm SHF_L_WRAP : ShfInst<"l.wrap", fshl>; + defm SHF_R_WRAP : ShfInst<"r.wrap", fshr>; +} + // Count leading zeros let hasSideEffects = false in { def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 56c551661151d..2688cfbe5e824 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2733,134 +2733,9 @@ def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>; def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>; -// rotate builtin support - -def ROTATE_B32_HW_IMM - : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$src, i32imm:$amt), - "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, - (int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)))]>, - Requires<[hasHWROT32]> ; - -def ROTATE_B32_HW_REG - : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$src, Int32Regs:$amt), - "shf.l.wrap.b32 \t$dst, $src, $src, $amt;", - [(set Int32Regs:$dst, - (int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt))]>, - Requires<[hasHWROT32]> ; - -def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)), - (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>, - Requires<[noHWROT32]> ; - -def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt), - (ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>, - Requires<[noHWROT32]> ; - -let hasSideEffects = false in { - def GET_LO_INT64 : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src), - !strconcat("{{\n\t", - ".reg .b32 %dummy;\n\t", - "mov.b64 \t{$dst,%dummy}, $src;\n\t", - "}}"), - []> ; - - def GET_HI_INT64 : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src), - !strconcat("{{\n\t", - ".reg .b32 %dummy;\n\t", - "mov.b64 \t{%dummy,$dst}, $src;\n\t", - "}}"), - []> ; -} - -let hasSideEffects = false in { - def PACK_TWO_INT32 - : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi), - "mov.b64 \t$dst, {{$lo, $hi}};", []> ; -} - def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src), - (PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src), - (GET_LO_INT64 Int64Regs:$src))> ; - -// Funnel shift, requires >= sm_32. Does not trap if amt is out of range, so -// no side effects. 
-let hasSideEffects = false in { - def SHF_L_WRAP_B32_IMM - : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), - "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, - Requires<[hasHWROT32]>; - - def SHF_L_WRAP_B32_REG - : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), - "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, - Requires<[hasHWROT32]>; - - def SHF_R_WRAP_B32_IMM - : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt), - "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, - Requires<[hasHWROT32]>; - - def SHF_R_WRAP_B32_REG - : NVPTXInst<(outs Int32Regs:$dst), - (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt), - "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>, - Requires<[hasHWROT32]>; -} - -// HW version of rotate 64 -def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)), - (PACK_TWO_INT32 - (SHF_L_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src), - (GET_LO_INT64 Int64Regs:$src), imm:$amt), - (SHF_L_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src), - (GET_HI_INT64 Int64Regs:$src), imm:$amt))>, - Requires<[hasHWROT32]>; - -def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt), - (PACK_TWO_INT32 - (SHF_L_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src), - (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt), - (SHF_L_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src), - (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt))>, - Requires<[hasHWROT32]>; - - -def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)), - (PACK_TWO_INT32 - (SHF_R_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src), - (GET_HI_INT64 Int64Regs:$src), imm:$amt), - (SHF_R_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src), - (GET_LO_INT64 Int64Regs:$src), imm:$amt))>, - Requires<[hasHWROT32]>; - -def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt), - (PACK_TWO_INT32 - (SHF_R_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src), - (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt), - (SHF_R_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src), - (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt))>, - Requires<[hasHWROT32]>; - -// SW version of rotate 64 -def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)), - (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>, - Requires<[noHWROT32]>; -def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt), - (ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>, - Requires<[noHWROT32]>; -def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)), - (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>, - Requires<[noHWROT32]>; -def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt), - (ROTR64reg_sw Int64Regs:$src, Int32Regs:$amt)>, - Requires<[noHWROT32]>; - + (V2I32toI64 (I64toI32H Int64Regs:$src), + (I64toI32L Int64Regs:$src))> ; //----------------------------------- // Texture Intrinsics diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll index 7e4a4d527fc90..43ac246055da7 100644 --- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll +++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll @@ -31,6 +31,10 @@ declare float @llvm.nvvm.bitcast.i2f(i32) declare i64 @llvm.nvvm.bitcast.d2ll(double) declare double @llvm.nvvm.bitcast.ll2d(i64) +declare i32 @llvm.nvvm.rotate.b32(i32, i32) +declare i64 @llvm.nvvm.rotate.right.b64(i64, i32) +declare i64 @llvm.nvvm.rotate.b64(i64, i32) + ; CHECK-LABEL: @simple_upgrade define void @simple_upgrade(i32 %a, i64 %b, i16 %c) { ; CHECK: call i32 
@llvm.bitreverse.i32(i32 %a) @@ -139,4 +143,16 @@ define void @bitcast(i32 %a, i64 %b, float %c, double %d) { %r4 = call double @llvm.nvvm.bitcast.ll2d(i64 %b) ret void -} \ No newline at end of file +} + +; CHECK-LABEL: @rotate +define void @rotate(i32 %a, i64 %b) { +; CHECK: call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 6) +; CHECK: call i64 @llvm.fshr.i64(i64 %b, i64 %b, i64 7) +; CHECK: call i64 @llvm.fshl.i64(i64 %b, i64 %b, i64 8) +; + %r1 = call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 6) + %r2 = call i64 @llvm.nvvm.rotate.right.b64(i64 %b, i32 7) + %r3 = call i64 @llvm.nvvm.rotate.b64(i64 %b, i32 8) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll index 20c7ae5908d29..9ec5bcd13403b 100644 --- a/llvm/test/CodeGen/NVPTX/rotate.ll +++ b/llvm/test/CodeGen/NVPTX/rotate.ll @@ -9,26 +9,29 @@ declare i32 @llvm.nvvm.rotate.b32(i32, i32) declare i64 @llvm.nvvm.rotate.b64(i64, i32) declare i64 @llvm.nvvm.rotate.right.b64(i64, i32) +declare i64 @llvm.fshl.i64(i64, i64, i64) +declare i64 @llvm.fshr.i64(i64, i64, i64) +declare i32 @llvm.fshl.i32(i32, i32, i32) +declare i32 @llvm.fshr.i32(i32, i32, i32) + + ; SM20: rotate32 ; SM35: rotate32 define i32 @rotate32(i32 %a, i32 %b) { ; SM20-LABEL: rotate32( ; SM20: { -; SM20-NEXT: .reg .b32 %r<4>; +; SM20-NEXT: .reg .b32 %r<9>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u32 %r1, [rotate32_param_0]; ; SM20-NEXT: ld.param.u32 %r2, [rotate32_param_1]; -; SM20-NEXT: { -; SM20-NEXT: .reg .b32 %lhs; -; SM20-NEXT: .reg .b32 %rhs; -; SM20-NEXT: .reg .b32 %amt2; -; SM20-NEXT: shl.b32 %lhs, %r1, %r2; -; SM20-NEXT: sub.s32 %amt2, 32, %r2; -; SM20-NEXT: shr.b32 %rhs, %r1, %amt2; -; SM20-NEXT: add.u32 %r3, %lhs, %rhs; -; SM20-NEXT: } -; SM20-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM20-NEXT: and.b32 %r3, %r2, 31; +; SM20-NEXT: shl.b32 %r4, %r1, %r3; +; SM20-NEXT: neg.s32 %r5, %r2; +; SM20-NEXT: and.b32 %r6, %r5, 31; +; SM20-NEXT: shr.u32 %r7, %r1, %r6; +; SM20-NEXT: or.b32 %r8, %r4, %r7; +; SM20-NEXT: st.param.b32 [func_retval0+0], %r8; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotate32( @@ -50,45 +53,36 @@ define i32 @rotate32(i32 %a, i32 %b) { define i64 @rotate64(i64 %a, i32 %b) { ; SM20-LABEL: rotate64( ; SM20: { -; SM20-NEXT: .reg .b32 %r<2>; -; SM20-NEXT: .reg .b64 %rd<3>; +; SM20-NEXT: .reg .b32 %r<5>; +; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotate64_param_0]; ; SM20-NEXT: ld.param.u32 %r1, [rotate64_param_1]; -; SM20-NEXT: { -; SM20-NEXT: .reg .b64 %lhs; -; SM20-NEXT: .reg .b64 %rhs; -; SM20-NEXT: .reg .u32 %amt2; -; SM20-NEXT: and.b32 %amt2, %r1, 63; -; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2; -; SM20-NEXT: sub.u32 %amt2, 64, %amt2; -; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2; -; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM20-NEXT: } -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM20-NEXT: and.b32 %r2, %r1, 63; +; SM20-NEXT: shl.b64 %rd2, %rd1, %r2; +; SM20-NEXT: neg.s32 %r3, %r1; +; SM20-NEXT: and.b32 %r4, %r3, 63; +; SM20-NEXT: shr.u64 %rd3, %rd1, %r4; +; SM20-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotate64( ; SM35: { -; SM35-NEXT: .reg .b32 %r<6>; -; SM35-NEXT: .reg .b64 %rd<3>; +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotate64_param_0]; -; SM35-NEXT: { -; SM35-NEXT: .reg .b32 %dummy; -; SM35-NEXT: mov.b64 {%dummy,%r1}, %rd1; -; SM35-NEXT: } -; SM35-NEXT: { -; SM35-NEXT: 
.reg .b32 %dummy; -; SM35-NEXT: mov.b64 {%r2,%dummy}, %rd1; -; SM35-NEXT: } -; SM35-NEXT: ld.param.u32 %r3, [rotate64_param_1]; -; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3; -; SM35-NEXT: shf.l.wrap.b32 %r5, %r1, %r2, %r3; -; SM35-NEXT: mov.b64 %rd2, {%r5, %r4}; -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM35-NEXT: ld.param.u32 %r1, [rotate64_param_1]; +; SM35-NEXT: and.b32 %r2, %r1, 63; +; SM35-NEXT: shl.b64 %rd2, %rd1, %r2; +; SM35-NEXT: neg.s32 %r3, %r1; +; SM35-NEXT: and.b32 %r4, %r3, 63; +; SM35-NEXT: shr.u64 %rd3, %rd1, %r4; +; SM35-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM35-NEXT: ret; %val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b) ret i64 %val @@ -99,45 +93,36 @@ define i64 @rotate64(i64 %a, i32 %b) { define i64 @rotateright64(i64 %a, i32 %b) { ; SM20-LABEL: rotateright64( ; SM20: { -; SM20-NEXT: .reg .b32 %r<2>; -; SM20-NEXT: .reg .b64 %rd<3>; +; SM20-NEXT: .reg .b32 %r<5>; +; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotateright64_param_0]; ; SM20-NEXT: ld.param.u32 %r1, [rotateright64_param_1]; -; SM20-NEXT: { -; SM20-NEXT: .reg .b64 %lhs; -; SM20-NEXT: .reg .b64 %rhs; -; SM20-NEXT: .reg .u32 %amt2; -; SM20-NEXT: and.b32 %amt2, %r1, 63; -; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2; -; SM20-NEXT: sub.u32 %amt2, 64, %amt2; -; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2; -; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM20-NEXT: } -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM20-NEXT: and.b32 %r2, %r1, 63; +; SM20-NEXT: shr.u64 %rd2, %rd1, %r2; +; SM20-NEXT: neg.s32 %r3, %r1; +; SM20-NEXT: and.b32 %r4, %r3, 63; +; SM20-NEXT: shl.b64 %rd3, %rd1, %r4; +; SM20-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotateright64( ; SM35: { -; SM35-NEXT: .reg .b32 %r<6>; -; SM35-NEXT: .reg .b64 %rd<3>; +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotateright64_param_0]; -; SM35-NEXT: { -; SM35-NEXT: .reg .b32 %dummy; -; SM35-NEXT: mov.b64 {%r1,%dummy}, %rd1; -; SM35-NEXT: } -; SM35-NEXT: { -; SM35-NEXT: .reg .b32 %dummy; -; SM35-NEXT: mov.b64 {%dummy,%r2}, %rd1; -; SM35-NEXT: } -; SM35-NEXT: ld.param.u32 %r3, [rotateright64_param_1]; -; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3; -; SM35-NEXT: shf.r.wrap.b32 %r5, %r1, %r2, %r3; -; SM35-NEXT: mov.b64 %rd2, {%r5, %r4}; -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM35-NEXT: ld.param.u32 %r1, [rotateright64_param_1]; +; SM35-NEXT: and.b32 %r2, %r1, 63; +; SM35-NEXT: shr.u64 %rd2, %rd1, %r2; +; SM35-NEXT: neg.s32 %r3, %r1; +; SM35-NEXT: and.b32 %r4, %r3, 63; +; SM35-NEXT: shl.b64 %rd3, %rd1, %r4; +; SM35-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM35-NEXT: ret; %val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b) ret i64 %val @@ -148,18 +133,14 @@ define i64 @rotateright64(i64 %a, i32 %b) { define i32 @rotl0(i32 %x) { ; SM20-LABEL: rotl0( ; SM20: { -; SM20-NEXT: .reg .b32 %r<3>; +; SM20-NEXT: .reg .b32 %r<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u32 %r1, [rotl0_param_0]; -; SM20-NEXT: { -; SM20-NEXT: .reg .b32 %lhs; -; SM20-NEXT: .reg .b32 %rhs; -; SM20-NEXT: shl.b32 %lhs, %r1, 8; -; SM20-NEXT: shr.b32 %rhs, %r1, 24; -; SM20-NEXT: add.u32 %r2, %lhs, %rhs; -; SM20-NEXT: } -; SM20-NEXT: st.param.b32 [func_retval0+0], %r2; +; SM20-NEXT: shr.u32 %r2, %r1, 24; +; SM20-NEXT: shl.b32 %r3, %r1, 8; 
+; SM20-NEXT: or.b32 %r4, %r3, %r2; +; SM20-NEXT: st.param.b32 [func_retval0+0], %r4; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotl0( @@ -177,51 +158,40 @@ define i32 @rotl0(i32 %x) { ret i32 %t2 } -declare i64 @llvm.fshl.i64(i64, i64, i64) -declare i64 @llvm.fshr.i64(i64, i64, i64) - ; SM35: rotl64 define i64 @rotl64(i64 %a, i64 %n) { ; SM20-LABEL: rotl64( ; SM20: { -; SM20-NEXT: .reg .b32 %r<2>; -; SM20-NEXT: .reg .b64 %rd<3>; +; SM20-NEXT: .reg .b32 %r<5>; +; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotl64_param_0]; ; SM20-NEXT: ld.param.u32 %r1, [rotl64_param_1]; -; SM20-NEXT: { -; SM20-NEXT: .reg .b64 %lhs; -; SM20-NEXT: .reg .b64 %rhs; -; SM20-NEXT: .reg .u32 %amt2; -; SM20-NEXT: and.b32 %amt2, %r1, 63; -; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2; -; SM20-NEXT: sub.u32 %amt2, 64, %amt2; -; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2; -; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM20-NEXT: } -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM20-NEXT: and.b32 %r2, %r1, 63; +; SM20-NEXT: shl.b64 %rd2, %rd1, %r2; +; SM20-NEXT: neg.s32 %r3, %r1; +; SM20-NEXT: and.b32 %r4, %r3, 63; +; SM20-NEXT: shr.u64 %rd3, %rd1, %r4; +; SM20-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotl64( ; SM35: { -; SM35-NEXT: .reg .b32 %r<2>; -; SM35-NEXT: .reg .b64 %rd<3>; +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotl64_param_0]; ; SM35-NEXT: ld.param.u32 %r1, [rotl64_param_1]; -; SM35-NEXT: { -; SM35-NEXT: .reg .b64 %lhs; -; SM35-NEXT: .reg .b64 %rhs; -; SM35-NEXT: .reg .u32 %amt2; -; SM35-NEXT: and.b32 %amt2, %r1, 63; -; SM35-NEXT: shl.b64 %lhs, %rd1, %amt2; -; SM35-NEXT: sub.u32 %amt2, 64, %amt2; -; SM35-NEXT: shr.b64 %rhs, %rd1, %amt2; -; SM35-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM35-NEXT: } -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM35-NEXT: and.b32 %r2, %r1, 63; +; SM35-NEXT: shl.b64 %rd2, %rd1, %r2; +; SM35-NEXT: neg.s32 %r3, %r1; +; SM35-NEXT: and.b32 %r4, %r3, 63; +; SM35-NEXT: shr.u64 %rd3, %rd1, %r4; +; SM35-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM35-NEXT: ret; %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n) ret i64 %val @@ -231,34 +201,26 @@ define i64 @rotl64(i64 %a, i64 %n) { define i64 @rotl64_imm(i64 %a) { ; SM20-LABEL: rotl64_imm( ; SM20: { -; SM20-NEXT: .reg .b64 %rd<3>; +; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0]; -; SM20-NEXT: { -; SM20-NEXT: .reg .b64 %lhs; -; SM20-NEXT: .reg .b64 %rhs; -; SM20-NEXT: shl.b64 %lhs, %rd1, 2; -; SM20-NEXT: shr.b64 %rhs, %rd1, 62; -; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM20-NEXT: } -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM20-NEXT: shr.u64 %rd2, %rd1, 62; +; SM20-NEXT: shl.b64 %rd3, %rd1, 2; +; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotl64_imm( ; SM35: { -; SM35-NEXT: .reg .b64 %rd<3>; +; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0]; -; SM35-NEXT: { -; SM35-NEXT: .reg .b64 %lhs; -; SM35-NEXT: .reg .b64 %rhs; -; SM35-NEXT: shl.b64 %lhs, %rd1, 2; -; SM35-NEXT: shr.b64 %rhs, %rd1, 62; -; SM35-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM35-NEXT: } -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM35-NEXT: shr.u64 %rd2, %rd1, 62; +; SM35-NEXT: 
shl.b64 %rd3, %rd1, 2; +; SM35-NEXT: or.b64 %rd4, %rd3, %rd2; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM35-NEXT: ret; %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66) ret i64 %val @@ -268,44 +230,36 @@ define i64 @rotl64_imm(i64 %a) { define i64 @rotr64(i64 %a, i64 %n) { ; SM20-LABEL: rotr64( ; SM20: { -; SM20-NEXT: .reg .b32 %r<2>; -; SM20-NEXT: .reg .b64 %rd<3>; +; SM20-NEXT: .reg .b32 %r<5>; +; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotr64_param_0]; ; SM20-NEXT: ld.param.u32 %r1, [rotr64_param_1]; -; SM20-NEXT: { -; SM20-NEXT: .reg .b64 %lhs; -; SM20-NEXT: .reg .b64 %rhs; -; SM20-NEXT: .reg .u32 %amt2; -; SM20-NEXT: and.b32 %amt2, %r1, 63; -; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2; -; SM20-NEXT: sub.u32 %amt2, 64, %amt2; -; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2; -; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM20-NEXT: } -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM20-NEXT: and.b32 %r2, %r1, 63; +; SM20-NEXT: shr.u64 %rd2, %rd1, %r2; +; SM20-NEXT: neg.s32 %r3, %r1; +; SM20-NEXT: and.b32 %r4, %r3, 63; +; SM20-NEXT: shl.b64 %rd3, %rd1, %r4; +; SM20-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotr64( ; SM35: { -; SM35-NEXT: .reg .b32 %r<2>; -; SM35-NEXT: .reg .b64 %rd<3>; +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotr64_param_0]; ; SM35-NEXT: ld.param.u32 %r1, [rotr64_param_1]; -; SM35-NEXT: { -; SM35-NEXT: .reg .b64 %lhs; -; SM35-NEXT: .reg .b64 %rhs; -; SM35-NEXT: .reg .u32 %amt2; -; SM35-NEXT: and.b32 %amt2, %r1, 63; -; SM35-NEXT: shr.b64 %lhs, %rd1, %amt2; -; SM35-NEXT: sub.u32 %amt2, 64, %amt2; -; SM35-NEXT: shl.b64 %rhs, %rd1, %amt2; -; SM35-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM35-NEXT: } -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM35-NEXT: and.b32 %r2, %r1, 63; +; SM35-NEXT: shr.u64 %rd2, %rd1, %r2; +; SM35-NEXT: neg.s32 %r3, %r1; +; SM35-NEXT: and.b32 %r4, %r3, 63; +; SM35-NEXT: shl.b64 %rd3, %rd1, %r4; +; SM35-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM35-NEXT: ret; %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n) ret i64 %val @@ -315,35 +269,180 @@ define i64 @rotr64(i64 %a, i64 %n) { define i64 @rotr64_imm(i64 %a) { ; SM20-LABEL: rotr64_imm( ; SM20: { -; SM20-NEXT: .reg .b64 %rd<3>; +; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0]; -; SM20-NEXT: { -; SM20-NEXT: .reg .b64 %lhs; -; SM20-NEXT: .reg .b64 %rhs; -; SM20-NEXT: shl.b64 %lhs, %rd1, 62; -; SM20-NEXT: shr.b64 %rhs, %rd1, 2; -; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM20-NEXT: } -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM20-NEXT: shl.b64 %rd2, %rd1, 62; +; SM20-NEXT: shr.u64 %rd3, %rd1, 2; +; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotr64_imm( ; SM35: { -; SM35-NEXT: .reg .b64 %rd<3>; +; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0]; -; SM35-NEXT: { -; SM35-NEXT: .reg .b64 %lhs; -; SM35-NEXT: .reg .b64 %rhs; -; SM35-NEXT: shl.b64 %lhs, %rd1, 62; -; SM35-NEXT: shr.b64 %rhs, %rd1, 2; -; SM35-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM35-NEXT: } -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM35-NEXT: shl.b64 %rd2, %rd1, 62; +; SM35-NEXT: shr.u64 %rd3, %rd1, 2; +; SM35-NEXT: 
or.b64 %rd4, %rd3, %rd2; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM35-NEXT: ret; %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66) ret i64 %val } + +define i32 @funnel_shift_right_32(i32 %a, i32 %b, i32 %c) { +; SM20-LABEL: funnel_shift_right_32( +; SM20: { +; SM20-NEXT: .reg .b32 %r<11>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_right_32_param_0]; +; SM20-NEXT: ld.param.u32 %r2, [funnel_shift_right_32_param_2]; +; SM20-NEXT: and.b32 %r3, %r2, 31; +; SM20-NEXT: ld.param.u32 %r4, [funnel_shift_right_32_param_1]; +; SM20-NEXT: shr.u32 %r5, %r4, %r3; +; SM20-NEXT: shl.b32 %r6, %r1, 1; +; SM20-NEXT: not.b32 %r7, %r2; +; SM20-NEXT: and.b32 %r8, %r7, 31; +; SM20-NEXT: shl.b32 %r9, %r6, %r8; +; SM20-NEXT: or.b32 %r10, %r9, %r5; +; SM20-NEXT: st.param.b32 [func_retval0+0], %r10; +; SM20-NEXT: ret; +; +; SM35-LABEL: funnel_shift_right_32( +; SM35: { +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_right_32_param_0]; +; SM35-NEXT: ld.param.u32 %r2, [funnel_shift_right_32_param_1]; +; SM35-NEXT: ld.param.u32 %r3, [funnel_shift_right_32_param_2]; +; SM35-NEXT: shf.r.wrap.b32 %r4, %r1, %r2, %r3; +; SM35-NEXT: st.param.b32 [func_retval0+0], %r4; +; SM35-NEXT: ret; + %val = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c) + ret i32 %val +} + +define i32 @funnel_shift_left_32(i32 %a, i32 %b, i32 %c) { +; SM20-LABEL: funnel_shift_left_32( +; SM20: { +; SM20-NEXT: .reg .b32 %r<11>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_left_32_param_0]; +; SM20-NEXT: ld.param.u32 %r2, [funnel_shift_left_32_param_2]; +; SM20-NEXT: and.b32 %r3, %r2, 31; +; SM20-NEXT: shl.b32 %r4, %r1, %r3; +; SM20-NEXT: ld.param.u32 %r5, [funnel_shift_left_32_param_1]; +; SM20-NEXT: shr.u32 %r6, %r5, 1; +; SM20-NEXT: not.b32 %r7, %r2; +; SM20-NEXT: and.b32 %r8, %r7, 31; +; SM20-NEXT: shr.u32 %r9, %r6, %r8; +; SM20-NEXT: or.b32 %r10, %r4, %r9; +; SM20-NEXT: st.param.b32 [func_retval0+0], %r10; +; SM20-NEXT: ret; +; +; SM35-LABEL: funnel_shift_left_32( +; SM35: { +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_left_32_param_0]; +; SM35-NEXT: ld.param.u32 %r2, [funnel_shift_left_32_param_1]; +; SM35-NEXT: ld.param.u32 %r3, [funnel_shift_left_32_param_2]; +; SM35-NEXT: shf.l.wrap.b32 %r4, %r1, %r2, %r3; +; SM35-NEXT: st.param.b32 [func_retval0+0], %r4; +; SM35-NEXT: ret; + %val = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) + ret i32 %val +} + +define i64 @funnel_shift_right_64(i64 %a, i64 %b, i64 %c) { +; SM20-LABEL: funnel_shift_right_64( +; SM20: { +; SM20-NEXT: .reg .b32 %r<5>; +; SM20-NEXT: .reg .b64 %rd<7>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u64 %rd1, [funnel_shift_right_64_param_0]; +; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_right_64_param_2]; +; SM20-NEXT: and.b32 %r2, %r1, 63; +; SM20-NEXT: ld.param.u64 %rd2, [funnel_shift_right_64_param_1]; +; SM20-NEXT: shr.u64 %rd3, %rd2, %r2; +; SM20-NEXT: shl.b64 %rd4, %rd1, 1; +; SM20-NEXT: not.b32 %r3, %r1; +; SM20-NEXT: and.b32 %r4, %r3, 63; +; SM20-NEXT: shl.b64 %rd5, %rd4, %r4; +; SM20-NEXT: or.b64 %rd6, %rd5, %rd3; +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd6; +; SM20-NEXT: ret; +; +; SM35-LABEL: funnel_shift_right_64( +; SM35: { +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<7>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u64 %rd1, [funnel_shift_right_64_param_0]; +; SM35-NEXT: 
ld.param.u32 %r1, [funnel_shift_right_64_param_2]; +; SM35-NEXT: and.b32 %r2, %r1, 63; +; SM35-NEXT: ld.param.u64 %rd2, [funnel_shift_right_64_param_1]; +; SM35-NEXT: shr.u64 %rd3, %rd2, %r2; +; SM35-NEXT: shl.b64 %rd4, %rd1, 1; +; SM35-NEXT: not.b32 %r3, %r1; +; SM35-NEXT: and.b32 %r4, %r3, 63; +; SM35-NEXT: shl.b64 %rd5, %rd4, %r4; +; SM35-NEXT: or.b64 %rd6, %rd5, %rd3; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd6; +; SM35-NEXT: ret; + %val = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) + ret i64 %val +} + +define i64 @funnel_shift_left_64(i64 %a, i64 %b, i64 %c) { +; SM20-LABEL: funnel_shift_left_64( +; SM20: { +; SM20-NEXT: .reg .b32 %r<5>; +; SM20-NEXT: .reg .b64 %rd<7>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u64 %rd1, [funnel_shift_left_64_param_0]; +; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_left_64_param_2]; +; SM20-NEXT: and.b32 %r2, %r1, 63; +; SM20-NEXT: shl.b64 %rd2, %rd1, %r2; +; SM20-NEXT: ld.param.u64 %rd3, [funnel_shift_left_64_param_1]; +; SM20-NEXT: shr.u64 %rd4, %rd3, 1; +; SM20-NEXT: not.b32 %r3, %r1; +; SM20-NEXT: and.b32 %r4, %r3, 63; +; SM20-NEXT: shr.u64 %rd5, %rd4, %r4; +; SM20-NEXT: or.b64 %rd6, %rd2, %rd5; +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd6; +; SM20-NEXT: ret; +; +; SM35-LABEL: funnel_shift_left_64( +; SM35: { +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<7>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u64 %rd1, [funnel_shift_left_64_param_0]; +; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_left_64_param_2]; +; SM35-NEXT: and.b32 %r2, %r1, 63; +; SM35-NEXT: shl.b64 %rd2, %rd1, %r2; +; SM35-NEXT: ld.param.u64 %rd3, [funnel_shift_left_64_param_1]; +; SM35-NEXT: shr.u64 %rd4, %rd3, 1; +; SM35-NEXT: not.b32 %r3, %r1; +; SM35-NEXT: and.b32 %r4, %r3, 63; +; SM35-NEXT: shr.u64 %rd5, %rd4, %r4; +; SM35-NEXT: or.b64 %rd6, %rd2, %rd5; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd6; +; SM35-NEXT: ret; + %val = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c) + ret i64 %val +} + diff --git a/llvm/test/CodeGen/NVPTX/rotate_64.ll b/llvm/test/CodeGen/NVPTX/rotate_64.ll index 64659ce1b5c56..05fdb02ac7479 100644 --- a/llvm/test/CodeGen/NVPTX/rotate_64.ll +++ b/llvm/test/CodeGen/NVPTX/rotate_64.ll @@ -1,25 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %} declare i64 @llvm.nvvm.rotate.b64(i64, i32) declare i64 @llvm.nvvm.rotate.right.b64(i64, i32) -; CHECK: rotate64 define i64 @rotate64(i64 %a, i32 %b) { -; CHECK: shl.b64 [[LHS:%.*]], [[RD1:%.*]], 3; -; CHECK: shr.b64 [[RHS:%.*]], [[RD1]], 61; -; CHECK: add.u64 [[RD2:%.*]], [[LHS]], [[RHS]]; -; CHECK: ret +; CHECK-LABEL: rotate64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [rotate64_param_0]; +; CHECK-NEXT: shr.u64 %rd2, %rd1, 61; +; CHECK-NEXT: shl.b64 %rd3, %rd1, 3; +; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd4; +; CHECK-NEXT: ret; %val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 3) ret i64 %val } -; CHECK: rotateright64 define i64 @rotateright64(i64 %a, i32 %b) { -; CHECK: shl.b64 [[LHS:%.*]], [[RD1:%.*]], 61; -; CHECK: shr.b64 [[RHS:%.*]], [[RD1]], 3; -; CHECK: add.u64 [[RD2:%.*]], [[LHS]], [[RHS]]; -; CHECK: ret +; CHECK-LABEL: rotateright64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 
%rd1, [rotateright64_param_0]; +; CHECK-NEXT: shl.b64 %rd2, %rd1, 61; +; CHECK-NEXT: shr.u64 %rd3, %rd1, 3; +; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd4; +; CHECK-NEXT: ret; %val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 3) ret i64 %val }
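
An illustrative LLVM IR sketch of the auto-upgrade mapping implemented in upgradeNVVMIntrinsicCall above, for front ends that still emit the removed intrinsics. The function name @rotate_upgrade_sketch is hypothetical; the rewritten calls and constant shift amounts mirror the @rotate test added in auto_upgrade_nvvm_intrinsics.ll, with the i32 shift amount of the 64-bit forms zero-extended to i64 to match the funnel-shift signatures.

; Sketch only: what the upgrader produces for the removed rotate intrinsics.
define void @rotate_upgrade_sketch(i32 %a, i64 %b) {
  ; was: %r1 = call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 6)
  %r1 = call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 6)
  ; was: %r2 = call i64 @llvm.nvvm.rotate.right.b64(i64 %b, i32 7)
  %r2 = call i64 @llvm.fshr.i64(i64 %b, i64 %b, i64 7)
  ; was: %r3 = call i64 @llvm.nvvm.rotate.b64(i64 %b, i32 8)
  %r3 = call i64 @llvm.fshl.i64(i64 %b, i64 %b, i64 8)
  ret void
}

declare i32 @llvm.fshl.i32(i32, i32, i32)
declare i64 @llvm.fshl.i64(i64, i64, i64)
declare i64 @llvm.fshr.i64(i64, i64, i64)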