diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index ae39217dc8ff8..44f1d6d6d19bd 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -15230,6 +15230,62 @@ The behavior of '``llvm.memset.inline.*``' is equivalent to the behavior of '``llvm.memset.*``', but the generated code is guaranteed not to call any external functions. +.. _int_memset_pattern: + +'``llvm.memset_pattern``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic. You can use ``llvm.memset_pattern`` on +any integer bit width for the pattern and length arguments and for different +address spaces. However, not all targets support all bit widths. + +:: + + declare void @llvm.memset_pattern.p0.i64.i128(ptr <dest>, i128 <val>, + i64 <len>, i1 <isvolatile>) + +Overview: +""""""""" + +The '``llvm.memset_pattern.*``' intrinsics fill a block of memory with +a particular pattern value. This may be expanded to an inline loop, a sequence of +stores, or a libcall depending on what is available for the target and the +expected performance and code size impact. + +Arguments: +"""""""""" + +The first argument is a pointer to the destination to fill, the second +is the pattern value with which to fill it, the third argument is an integer +argument specifying the number of bytes to fill, and the fourth is a boolean +indicating a volatile access. + +The :ref:`align <attr_align>` parameter attribute can be provided +for the first argument. + +If the ``isvolatile`` parameter is ``true``, the +``llvm.memset_pattern`` call is a :ref:`volatile operation <volatile>`. The +detailed access behavior is not very cleanly specified and it is unwise to +depend on it. + +Semantics: +"""""""""" + +The '``llvm.memset_pattern.*``' intrinsics fill "len" bytes of memory +starting at the destination location. If the argument is known to be aligned +to some boundary, this can be specified as an attribute on the argument. + +If ``<len>`` is not an integer multiple of the pattern width in bytes, then any +remainder bytes will be copied from ``<val>``. +If ``<len>`` is 0, it is no-op modulo the behavior of attributes attached to +the arguments. +If ``<len>`` is not a well-defined value, the behavior is undefined. +If ``<len>`` is not zero, ``<dest>`` should be well-defined, otherwise the +behavior is undefined. +
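A purely illustrative usage sketch (``%dst`` and ``%pattern`` are hypothetical names; the mangled suffix simply follows the declaration shown above): a non-volatile fill of 100 bytes with a 16-byte pattern. Because 100 is not a multiple of 16, the final 4 bytes are copied from ``<val>`` as described in the semantics above. ::

    call void @llvm.memset_pattern.p0.i64.i128(ptr align 8 %dst, i128 %pattern, i64 100, i1 false)

.. 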
_int_sqrt: '``llvm.sqrt.*``' Intrinsic diff --git a/llvm/include/llvm/IR/InstVisitor.h b/llvm/include/llvm/IR/InstVisitor.h index 311e0ac47ddfa..aa4f0f36e4ed7 100644 --- a/llvm/include/llvm/IR/InstVisitor.h +++ b/llvm/include/llvm/IR/InstVisitor.h @@ -208,6 +208,7 @@ class InstVisitor { RetTy visitDbgInfoIntrinsic(DbgInfoIntrinsic &I){ DELEGATE(IntrinsicInst); } RetTy visitMemSetInst(MemSetInst &I) { DELEGATE(MemIntrinsic); } RetTy visitMemSetInlineInst(MemSetInlineInst &I){ DELEGATE(MemSetInst); } + RetTy visitMemSetPatternInst(MemSetPatternInst &I) { DELEGATE(MemSetInst); } RetTy visitMemCpyInst(MemCpyInst &I) { DELEGATE(MemTransferInst); } RetTy visitMemCpyInlineInst(MemCpyInlineInst &I){ DELEGATE(MemCpyInst); } RetTy visitMemMoveInst(MemMoveInst &I) { DELEGATE(MemTransferInst); } @@ -295,6 +296,8 @@ class InstVisitor { case Intrinsic::memset: DELEGATE(MemSetInst); case Intrinsic::memset_inline: DELEGATE(MemSetInlineInst); + case Intrinsic::memset_pattern: + DELEGATE(MemSetPatternInst); case Intrinsic::vastart: DELEGATE(VAStartInst); case Intrinsic::vaend: DELEGATE(VAEndInst); case Intrinsic::vacopy: DELEGATE(VACopyInst); diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index a2ecf625ff61a..af8789d4958d8 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -1208,6 +1208,7 @@ class MemIntrinsic : public MemIntrinsicBase<MemIntrinsic> { case Intrinsic::memmove: case Intrinsic::memset: case Intrinsic::memset_inline: + case Intrinsic::memset_pattern: case Intrinsic::memcpy_inline: return true; default: @@ -1219,7 +1220,8 @@ } }; -/// This class wraps the llvm.memset and llvm.memset.inline intrinsics. +/// This class wraps the llvm.memset, llvm.memset.inline, and +/// llvm.memset_pattern intrinsics. class MemSetInst : public MemSetBase<MemIntrinsic> { public: // Methods for support type inquiry through isa, cast, and dyn_cast: @@ -1227,6 +1229,7 @@ class MemSetInst : public MemSetBase<MemIntrinsic> { switch (I->getIntrinsicID()) { case Intrinsic::memset: case Intrinsic::memset_inline: + case Intrinsic::memset_pattern: return true; default: return false; @@ -1249,6 +1252,21 @@ class MemSetInlineInst : public MemSetInst { } }; +/// This class wraps the llvm.memset_pattern intrinsic. +class MemSetPatternInst : public MemSetInst { +public: + ConstantInt *getLength() const { + return cast<ConstantInt>(MemSetInst::getLength()); + } + // Methods for support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::memset_pattern; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + /// This class wraps the llvm.memcpy/memmove intrinsics. 
class MemTransferInst : public MemTransferBase<MemIntrinsic> { public: @@ -1328,6 +1346,7 @@ class AnyMemIntrinsic : public MemIntrinsicBase<AnyMemIntrinsic> { case Intrinsic::memmove: case Intrinsic::memset: case Intrinsic::memset_inline: + case Intrinsic::memset_pattern: case Intrinsic::memcpy_element_unordered_atomic: case Intrinsic::memmove_element_unordered_atomic: case Intrinsic::memset_element_unordered_atomic: @@ -1350,6 +1369,7 @@ class AnyMemSetInst : public MemSetBase<AnyMemIntrinsic> { switch (I->getIntrinsicID()) { case Intrinsic::memset: case Intrinsic::memset_inline: + case Intrinsic::memset_pattern: case Intrinsic::memset_element_unordered_atomic: return true; default: diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 65a9b68b5229d..f79a19e0a8d29 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1003,6 +1003,14 @@ def int_memset_inline NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>, ImmArg<ArgIndex<3>>]>; +// Memset variant that writes a given pattern. +def int_memset_pattern + : Intrinsic<[], + [llvm_anyptr_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_i1_ty], + [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, IntrNoFree, IntrNoCallback, + NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>, ImmArg<ArgIndex<3>>], "llvm.memset_pattern">; + // FIXME: Add version of these floating point intrinsics which allow non-default // rounding modes and FP exception handling. diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 8572cdc160456..97267afeacef3 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -276,6 +276,13 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const { Memset->eraseFromParent(); break; } + case Intrinsic::memset_pattern: { + auto *Memset = cast<MemSetPatternInst>(Inst); + expandMemSetAsLoop(Memset); + Changed = true; + Memset->eraseFromParent(); + break; + } default: llvm_unreachable("unhandled intrinsic"); } @@ -294,6 +301,7 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const { case Intrinsic::memmove: case Intrinsic::memset: case Intrinsic::memset_inline: + case Intrinsic::memset_pattern: Changed |= expandMemIntrinsicUses(F); break; case Intrinsic::load_relative: diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index d156eaec4c172..75f4bf2973919 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -5435,7 +5435,8 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { case Intrinsic::memcpy_inline: case Intrinsic::memmove: case Intrinsic::memset: - case Intrinsic::memset_inline: { + case Intrinsic::memset_inline: + case Intrinsic::memset_pattern: { break; } case Intrinsic::memcpy_element_unordered_atomic: diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 635bd1236196e..27fe98ec363ca 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -133,6 +133,11 @@ static cl::opt<bool> UseLIRCodeSizeHeurs( "with -Os/-Oz"), cl::init(true), cl::Hidden); +static cl::opt<bool> EnableMemsetPatternIntrinsic( + "loop-idiom-enable-memset-pattern-intrinsic", + cl::desc("Enable use of the memset_pattern intrinsic."), cl::init(false), + cl::Hidden); + namespace { class LoopIdiomRecognize { @@ -300,7 +305,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); HasMemcpy = TLI->has(LibFunc_memcpy); - if (HasMemset || 
HasMemsetPattern || HasMemcpy) + if (HasMemset || HasMemsetPattern || EnableMemsetPatternIntrinsic || + HasMemcpy) if (SE->hasLoopInvariantBackedgeTakenCount(L)) return runOnCountableLoop(); @@ -457,7 +463,8 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) { // It looks like we can use SplatValue. return LegalStoreKind::Memset; } - if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset && + if (!UnorderedAtomic && (HasMemsetPattern || EnableMemsetPatternIntrinsic) && + !DisableLIRP::Memset && // Don't create memset_pattern16s with address spaces. StorePtr->getType()->getPointerAddressSpace() == 0 && getMemSetPatternValue(StoredVal, DL)) { @@ -993,6 +1000,46 @@ static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr, SCEV::FlagNUW); } +// Pack the pattern value (either a ConstantInt or a ConstantDataArray of +// integer or floating-point elements) into a single i128 constant. +ConstantInt *memSetPatternValueToI128ConstantInt(LLVMContext &Context, + Value *MemSetPatternValue) { + if (auto CIMemSetPatternValue = dyn_cast<ConstantInt>(MemSetPatternValue)) { + return CIMemSetPatternValue; + } + + if (auto Array = dyn_cast<ConstantDataArray>(MemSetPatternValue)) { + Type *ElementType = Array->getElementType(); + unsigned ElementSize = Array->getElementByteSize() * 8; + + APInt Result(128, 0); + unsigned totalBits = 0; + + for (unsigned i = 0; i < Array->getNumElements(); ++i) { + if (totalBits + ElementSize > 128) { + report_fatal_error("Pattern value unexpectedly greater than 128 bits"); + } + + APInt ElementBits; + if (ElementType->isIntegerTy()) { + ElementBits = Array->getElementAsAPInt(i); + } else if (ElementType->isFloatingPointTy()) { + APFloat APF = Array->getElementAsAPFloat(i); + ElementBits = APF.bitcastToAPInt(); + } else { + llvm_unreachable("Unexpected element type"); + } + + // Shift the existing result left by the element's size and OR in the new + // value + Result = (Result << ElementSize) | ElementBits.zextOrTrunc(128); + totalBits += ElementSize; + } + + // Create and return a ConstantInt with the resulting value + return ConstantInt::get(Context, Result); + } + report_fatal_error("Encountered unrecognised type"); +} + /// processLoopStridedStore - We see a strided store of some value. If we can /// transform this into a memset or memset_pattern in the loop preheader, do so. bool LoopIdiomRecognize::processLoopStridedStore( @@ -1070,7 +1117,8 @@ bool LoopIdiomRecognize::processLoopStridedStore( Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); - if (!SplatValue && !isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) + if (!SplatValue && !(isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16) || + EnableMemsetPatternIntrinsic)) return Changed; AAMDNodes AATags = TheStore->getAAMetadata(); @@ -1087,24 +1135,44 @@ bool LoopIdiomRecognize::processLoopStridedStore( BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment), /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias); } else { - assert (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)); - // Everything is emitted in default address space - Type *Int8PtrTy = DestInt8PtrTy; - - StringRef FuncName = "memset_pattern16"; - FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16, - Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy); - inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI); - - // Otherwise we should form a memset_pattern16. PatternValue is known to be - // an constant array of 16-bytes. Plop the value into a mergable global. 
- GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, - GlobalValue::PrivateLinkage, - PatternValue, ".memset_pattern"); - GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these. - GV->setAlignment(Align(16)); - Value *PatternPtr = GV; - NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); + assert(isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16) || + EnableMemsetPatternIntrinsic); + if (EnableMemsetPatternIntrinsic) { + // Everything is emitted in default address space + + // Get or insert the intrinsic declaration + Function *MemsetPatternIntrinsic = Intrinsic::getDeclaration( + M, Intrinsic::memset_pattern, + {DestInt8PtrTy, Builder.getInt128Ty(), Builder.getInt64Ty()}); + + // Create the call to the intrinsic + NewCall = Builder.CreateCall( + MemsetPatternIntrinsic, + {BasePtr, + memSetPatternValueToI128ConstantInt(M->getContext(), PatternValue), + NumBytes, ConstantInt::getFalse(M->getContext())}); + } else { + // Everything is emitted in default address space + Type *Int8PtrTy = DestInt8PtrTy; + + StringRef FuncName = "memset_pattern16"; + FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16, + Builder.getVoidTy(), Int8PtrTy, + Int8PtrTy, IntIdxTy); + inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI); + + // Otherwise we should form a memset_pattern16. PatternValue is known to + // be a constant array of 16 bytes. Plop the value into a mergeable + // global. + GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, + GlobalValue::PrivateLinkage, + PatternValue, ".memset_pattern"); + GV->setUnnamedAddr( + GlobalValue::UnnamedAddr::Global); // Ok to merge these. + GV->setAlignment(Align(16)); + Value *PatternPtr = GV; + NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); + } // Set the TBAA info if present. if (AATags.TBAA) diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp index d2814f07530d8..8d7c234d16878 100644 --- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -456,6 +456,109 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr, ElseTerm->eraseFromParent(); } +// Expand a memset_pattern intrinsic into an explicit loop of pattern-wide +// stores, followed by a byte-by-byte loop for any remainder bytes. +static void createMemSetPatternLoop(Instruction *InsertBefore, Value *DstAddr, + Value *CopyLen, Value *SetValue, + Align DstAlign, bool IsVolatile) { + BasicBlock *OrigBB = InsertBefore->getParent(); + Function *F = OrigBB->getParent(); + const DataLayout &DL = F->getDataLayout(); + + if (DL.isBigEndian()) + report_fatal_error("memset_pattern.inline expansion not currently " + "implemented for big-endian targets", + false); + + // For now, bail out unless the pattern width in bits is a power of two. 
+ if (!isPowerOf2_32(SetValue->getType()->getScalarSizeInBits())) + report_fatal_error("Pattern width for memset_pattern must be a power of 2", + false); + unsigned PatternSize = SetValue->getType()->getScalarSizeInBits() / 8; + + Type *TypeOfCopyLen = CopyLen->getType(); + + BasicBlock *NewBB = OrigBB->splitBasicBlock(InsertBefore, "split"); + BasicBlock *LoopBB = + BasicBlock::Create(F->getContext(), "storeloop", F, NewBB); + BasicBlock *RemCheckBB = + BasicBlock::Create(F->getContext(), "remcheck", F, NewBB); + BasicBlock *RemainderLoopBB = + BasicBlock::Create(F->getContext(), "remainderloop", F, NewBB); + IRBuilder<> Builder(OrigBB->getTerminator()); + + ConstantInt *CILoopOpSize = + ConstantInt::get(dyn_cast<IntegerType>(TypeOfCopyLen), PatternSize); + Value *RuntimeLoopCount = + getRuntimeLoopCount(DL, Builder, CopyLen, CILoopOpSize, PatternSize); + Value *RuntimeRemainder = + getRuntimeLoopRemainder(DL, Builder, CopyLen, CILoopOpSize, PatternSize); + + Builder.CreateCondBr(Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), + RuntimeLoopCount), + RemCheckBB, LoopBB); + OrigBB->getTerminator()->eraseFromParent(); + + IRBuilder<> LoopBuilder(LoopBB); + PHINode *CurrentDst = LoopBuilder.CreatePHI(DstAddr->getType(), 0); + CurrentDst->addIncoming(DstAddr, OrigBB); + PHINode *LoopCount = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); + LoopCount->addIncoming(RuntimeLoopCount, OrigBB); + + // Create the store instruction for the pattern + LoopBuilder.CreateAlignedStore(SetValue, CurrentDst, DstAlign, IsVolatile); + + Value *NextDst = LoopBuilder.CreateInBoundsGEP( + SetValue->getType(), CurrentDst, + ConstantInt::get(TypeOfCopyLen, PatternSize)); + CurrentDst->addIncoming(NextDst, LoopBB); + + Value *NewLoopCount = + LoopBuilder.CreateSub(LoopCount, ConstantInt::get(TypeOfCopyLen, 1)); + LoopCount->addIncoming(NewLoopCount, LoopBB); + + LoopBuilder.CreateCondBr( + LoopBuilder.CreateICmpNE(NewLoopCount, + ConstantInt::get(TypeOfCopyLen, 0)), + LoopBB, RemCheckBB); + + IRBuilder<> RemCheckBuilder(RemCheckBB, RemCheckBB->begin()); + // Branch to the end if there are no remainder bytes. 
+ PHINode *RemainderDstPHI = RemCheckBuilder.CreatePHI(NextDst->getType(), 0); + RemainderDstPHI->addIncoming(DstAddr, OrigBB); + RemainderDstPHI->addIncoming(NextDst, LoopBB); + RemCheckBuilder.CreateCondBr( + RemCheckBuilder.CreateICmpEQ(RuntimeRemainder, + ConstantInt::get(TypeOfCopyLen, 0)), + NewBB, RemainderLoopBB); + + // Remainder loop + IRBuilder<> RemainderLoopBuilder(RemainderLoopBB); + PHINode *ByteIndex = RemainderLoopBuilder.CreatePHI(TypeOfCopyLen, 0); + ByteIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), RemCheckBB); + Type *TypeOfSetValue = SetValue->getType(); + PHINode *ShiftedValue = RemainderLoopBuilder.CreatePHI(TypeOfSetValue, 0); + ShiftedValue->addIncoming(SetValue, RemCheckBB); + + Value *ByteToStore = RemainderLoopBuilder.CreateTrunc( + ShiftedValue, RemainderLoopBuilder.getInt8Ty()); + + RemainderLoopBuilder.CreateStore( + ByteToStore, + RemainderLoopBuilder.CreateInBoundsGEP(RemainderLoopBuilder.getInt8Ty(), + RemainderDstPHI, ByteIndex), + IsVolatile); + + Value *NewByteIndex = RemainderLoopBuilder.CreateAdd( + ByteIndex, ConstantInt::get(TypeOfCopyLen, 1)); + ByteIndex->addIncoming(NewByteIndex, RemainderLoopBB); + Value *NewShiftedValue = RemainderLoopBuilder.CreateLShr( + ShiftedValue, ConstantInt::get(TypeOfSetValue, 8)); + ShiftedValue->addIncoming(NewShiftedValue, RemainderLoopBB); + + RemainderLoopBuilder.CreateCondBr( + RemainderLoopBuilder.CreateICmpULT(NewByteIndex, RuntimeRemainder), + RemainderLoopBB, NewBB); +} + static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr, Value *CopyLen, Value *SetValue, Align DstAlign, bool IsVolatile) { @@ -591,6 +694,16 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove, } void llvm::expandMemSetAsLoop(MemSetInst *Memset) { + if (isa<MemSetPatternInst>(Memset)) { + return createMemSetPatternLoop( + /* InsertBefore */ Memset, + /* DstAddr */ Memset->getRawDest(), + /* CopyLen */ Memset->getLength(), + /* SetValue */ Memset->getValue(), + /* Alignment */ Memset->getDestAlign().valueOrOne(), + Memset->isVolatile()); + } + createMemSetLoop(/* InsertBefore */ Memset, /* DstAddr */ Memset->getRawDest(), /* CopyLen */ Memset->getLength(), diff --git a/llvm/test/CodeGen/RISCV/memset-pattern.ll b/llvm/test/CodeGen/RISCV/memset-pattern.ll new file mode 100644 index 0000000000000..ea50ae0b56e40 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/memset-pattern.ll @@ -0,0 +1,591 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv32 -mattr=+m \ +; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+m \ +; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64 +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+unaligned-scalar-mem \ +; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+unaligned-scalar-mem \ +; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST + +define void @memset_1(ptr %a, i128 %value) nounwind { +; RV32-BOTH-LABEL: memset_1: +; RV32-BOTH: # %bb.0: +; RV32-BOTH-NEXT: lw a1, 0(a1) +; RV32-BOTH-NEXT: sb a1, 0(a0) +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: memset_1: +; RV64-BOTH: # %bb.0: +; RV64-BOTH-NEXT: sb a1, 0(a0) +; RV64-BOTH-NEXT: ret + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 1, i1 0) + ret void +} + +define void @memset_2(ptr %a, i128 %value) nounwind { +; RV32-LABEL: memset_2: +; RV32: # %bb.0: +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: srli a1, a1, 8 +; RV32-NEXT: sb 
a1, 1(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: memset_2: +; RV64: # %bb.0: +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: srli a1, a1, 8 +; RV64-NEXT: sb a1, 1(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memset_2: +; RV32-FAST: # %bb.0: +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sh a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memset_2: +; RV64-FAST: # %bb.0: +; RV64-FAST-NEXT: sh a1, 0(a0) +; RV64-FAST-NEXT: ret + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 2, i1 0) + ret void +} + +define void @memset_3(ptr %a, i128 %value) nounwind { +; RV32-LABEL: memset_3: +; RV32: # %bb.0: +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: srli a2, a1, 8 +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: srli a1, a1, 16 +; RV32-NEXT: sb a1, 2(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: memset_3: +; RV64: # %bb.0: +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: srli a2, a1, 8 +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: srli a1, a1, 16 +; RV64-NEXT: sb a1, 2(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memset_3: +; RV32-FAST: # %bb.0: +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sh a1, 0(a0) +; RV32-FAST-NEXT: srli a1, a1, 16 +; RV32-FAST-NEXT: sb a1, 2(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memset_3: +; RV64-FAST: # %bb.0: +; RV64-FAST-NEXT: sh a1, 0(a0) +; RV64-FAST-NEXT: srli a1, a1, 16 +; RV64-FAST-NEXT: sb a1, 2(a0) +; RV64-FAST-NEXT: ret + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 3, i1 0) + ret void +} + +define void @memset_4(ptr %a, i128 %value) nounwind { +; RV32-LABEL: memset_4: +; RV32: # %bb.0: +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: srli a2, a1, 24 +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: srli a2, a1, 16 +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: srli a1, a1, 8 +; RV32-NEXT: sb a1, 1(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: memset_4: +; RV64: # %bb.0: +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: srli a2, a1, 24 +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: srli a2, a1, 16 +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: srli a1, a1, 8 +; RV64-NEXT: sb a1, 1(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memset_4: +; RV32-FAST: # %bb.0: +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memset_4: +; RV64-FAST: # %bb.0: +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: ret + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 4, i1 0) + ret void +} + +define void @memset_5(ptr %a, i128 %value) nounwind { +; RV32-LABEL: memset_5: +; RV32: # %bb.0: +; RV32-NEXT: lw a2, 0(a1) +; RV32-NEXT: lw a1, 4(a1) +; RV32-NEXT: sb a2, 0(a0) +; RV32-NEXT: sb a1, 4(a0) +; RV32-NEXT: srli a1, a2, 24 +; RV32-NEXT: sb a1, 3(a0) +; RV32-NEXT: srli a1, a2, 16 +; RV32-NEXT: sb a1, 2(a0) +; RV32-NEXT: srli a2, a2, 8 +; RV32-NEXT: sb a2, 1(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: memset_5: +; RV64: # %bb.0: +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: srli a2, a1, 24 +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: srli a2, a1, 16 +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: srli a2, a1, 8 +; RV64-NEXT: sb a2, 1(a0) +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: sb a1, 4(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memset_5: +; RV32-FAST: # %bb.0: +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sb a2, 4(a0) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memset_5: +; RV64-FAST: # %bb.0: +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: srli a1, a1, 32 +; RV64-FAST-NEXT: sb a1, 4(a0) +; RV64-FAST-NEXT: 
ret + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 5, i1 0) + ret void +} + +define void @memset_6(ptr %a, i128 %value) nounwind { +; RV32-LABEL: memset_6: +; RV32: # %bb.0: +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: srli a2, a2, 8 +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: srli a2, a1, 24 +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: srli a2, a1, 16 +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: srli a1, a1, 8 +; RV32-NEXT: sb a1, 1(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: memset_6: +; RV64: # %bb.0: +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: srli a2, a1, 40 +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: srli a2, a1, 32 +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: srli a2, a1, 24 +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: srli a2, a1, 16 +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: srli a1, a1, 8 +; RV64-NEXT: sb a1, 1(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memset_6: +; RV32-FAST: # %bb.0: +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sh a2, 4(a0) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memset_6: +; RV64-FAST: # %bb.0: +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: srli a1, a1, 32 +; RV64-FAST-NEXT: sh a1, 4(a0) +; RV64-FAST-NEXT: ret + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 6, i1 0) + ret void +} + +define void @memset_7(ptr %a, i128 %value) nounwind { +; RV32-LABEL: memset_7: +; RV32: # %bb.0: +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: srli a3, a2, 8 +; RV32-NEXT: sb a3, 5(a0) +; RV32-NEXT: srli a2, a2, 16 +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: srli a2, a1, 24 +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: srli a2, a1, 16 +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: srli a1, a1, 8 +; RV32-NEXT: sb a1, 1(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: memset_7: +; RV64: # %bb.0: +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: srli a2, a1, 40 +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: srli a2, a1, 32 +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: srli a2, a1, 48 +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: srli a2, a1, 24 +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: srli a2, a1, 16 +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: srli a1, a1, 8 +; RV64-NEXT: sb a1, 1(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memset_7: +; RV32-FAST: # %bb.0: +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sh a2, 4(a0) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: srli a2, a2, 16 +; RV32-FAST-NEXT: sb a2, 6(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memset_7: +; RV64-FAST: # %bb.0: +; RV64-FAST-NEXT: sw a1, 0(a0) +; RV64-FAST-NEXT: srli a2, a1, 48 +; RV64-FAST-NEXT: sb a2, 6(a0) +; RV64-FAST-NEXT: srli a1, a1, 32 +; RV64-FAST-NEXT: sh a1, 4(a0) +; RV64-FAST-NEXT: ret + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 7, i1 0) + ret void +} + +define void @memset_8(ptr %a, i128 %value) nounwind { +; RV32-LABEL: memset_8: +; RV32: # %bb.0: +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: srli a3, a2, 24 +; RV32-NEXT: sb a3, 7(a0) +; RV32-NEXT: srli a3, a2, 16 +; RV32-NEXT: sb a3, 6(a0) +; RV32-NEXT: srli a2, a2, 8 +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: srli a2, a1, 24 +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: srli a2, a1, 16 +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: srli a1, a1, 8 +; RV32-NEXT: sb a1, 1(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: 
memset_8: +; RV64: # %bb.0: +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: srli a2, a1, 56 +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: srli a2, a1, 48 +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: srli a2, a1, 40 +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: srli a2, a1, 32 +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: srli a2, a1, 24 +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: srli a2, a1, 16 +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: srli a1, a1, 8 +; RV64-NEXT: sb a1, 1(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memset_8: +; RV32-FAST: # %bb.0: +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memset_8: +; RV64-FAST: # %bb.0: +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 8, i1 0) + ret void +} + +define void @memset_9(ptr %a, i128 %value) nounwind { +; RV32-LABEL: memset_9: +; RV32: # %bb.0: +; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: lw a3, 0(a1) +; RV32-NEXT: lw a1, 8(a1) +; RV32-NEXT: sb a2, 4(a0) +; RV32-NEXT: sb a3, 0(a0) +; RV32-NEXT: sb a1, 8(a0) +; RV32-NEXT: srli a1, a2, 24 +; RV32-NEXT: sb a1, 7(a0) +; RV32-NEXT: srli a1, a2, 16 +; RV32-NEXT: sb a1, 6(a0) +; RV32-NEXT: srli a2, a2, 8 +; RV32-NEXT: sb a2, 5(a0) +; RV32-NEXT: srli a1, a3, 24 +; RV32-NEXT: sb a1, 3(a0) +; RV32-NEXT: srli a1, a3, 16 +; RV32-NEXT: sb a1, 2(a0) +; RV32-NEXT: srli a3, a3, 8 +; RV32-NEXT: sb a3, 1(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: memset_9: +; RV64: # %bb.0: +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: sb a2, 8(a0) +; RV64-NEXT: srli a2, a1, 56 +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: srli a2, a1, 48 +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: srli a2, a1, 40 +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: srli a2, a1, 32 +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: srli a2, a1, 24 +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: srli a2, a1, 16 +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: srli a1, a1, 8 +; RV64-NEXT: sb a1, 1(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memset_9: +; RV32-FAST: # %bb.0: +; RV32-FAST-NEXT: lw a2, 4(a1) +; RV32-FAST-NEXT: lw a3, 0(a1) +; RV32-FAST-NEXT: lw a1, 8(a1) +; RV32-FAST-NEXT: sw a2, 4(a0) +; RV32-FAST-NEXT: sw a3, 0(a0) +; RV32-FAST-NEXT: sb a1, 8(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memset_9: +; RV64-FAST: # %bb.0: +; RV64-FAST-NEXT: sb a2, 8(a0) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 9, i1 0) + ret void +} + +define void @memset_16(ptr %a, i128 %value) nounwind { +; RV32-LABEL: memset_16: +; RV32: # %bb.0: +; RV32-NEXT: lw a2, 12(a1) +; RV32-NEXT: lw a3, 8(a1) +; RV32-NEXT: lw a4, 4(a1) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sb a2, 12(a0) +; RV32-NEXT: sb a3, 8(a0) +; RV32-NEXT: sb a4, 4(a0) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: srli a5, a2, 24 +; RV32-NEXT: sb a5, 15(a0) +; RV32-NEXT: srli a5, a2, 16 +; RV32-NEXT: sb a5, 14(a0) +; RV32-NEXT: srli a2, a2, 8 +; RV32-NEXT: sb a2, 13(a0) +; RV32-NEXT: srli a2, a3, 24 +; RV32-NEXT: sb a2, 11(a0) +; RV32-NEXT: srli a2, a3, 16 +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: srli a3, a3, 8 +; RV32-NEXT: sb a3, 9(a0) +; RV32-NEXT: srli a2, a4, 24 +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: srli a2, a4, 16 +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: srli a4, a4, 8 +; RV32-NEXT: sb a4, 5(a0) +; RV32-NEXT: srli a2, a1, 24 +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: srli a2, a1, 16 +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: srli a1, a1, 8 +; RV32-NEXT: sb a1, 1(a0) +; RV32-NEXT: 
ret +; +; RV64-LABEL: memset_16: +; RV64: # %bb.0: +; RV64-NEXT: sb a2, 8(a0) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: srli a3, a2, 56 +; RV64-NEXT: sb a3, 15(a0) +; RV64-NEXT: srli a3, a2, 48 +; RV64-NEXT: sb a3, 14(a0) +; RV64-NEXT: srli a3, a2, 40 +; RV64-NEXT: sb a3, 13(a0) +; RV64-NEXT: srli a3, a2, 32 +; RV64-NEXT: sb a3, 12(a0) +; RV64-NEXT: srli a3, a2, 24 +; RV64-NEXT: sb a3, 11(a0) +; RV64-NEXT: srli a3, a2, 16 +; RV64-NEXT: sb a3, 10(a0) +; RV64-NEXT: srli a2, a2, 8 +; RV64-NEXT: sb a2, 9(a0) +; RV64-NEXT: srli a2, a1, 56 +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: srli a2, a1, 48 +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: srli a2, a1, 40 +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: srli a2, a1, 32 +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: srli a2, a1, 24 +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: srli a2, a1, 16 +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: srli a1, a1, 8 +; RV64-NEXT: sb a1, 1(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memset_16: +; RV32-FAST: # %bb.0: +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: lw a3, 8(a1) +; RV32-FAST-NEXT: lw a4, 4(a1) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: sw a4, 4(a0) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memset_16: +; RV64-FAST: # %bb.0: +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: ret + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 16, i1 0) + ret void +} + +define void @memset_17(ptr %a, i128 %value) nounwind { +; RV32-LABEL: memset_17: +; RV32: # %bb.0: +; RV32-NEXT: lw a2, 12(a1) +; RV32-NEXT: lw a3, 8(a1) +; RV32-NEXT: lw a4, 4(a1) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: sb a2, 12(a0) +; RV32-NEXT: sb a3, 8(a0) +; RV32-NEXT: sb a4, 4(a0) +; RV32-NEXT: sb a1, 0(a0) +; RV32-NEXT: sb a1, 16(a0) +; RV32-NEXT: srli a5, a2, 24 +; RV32-NEXT: sb a5, 15(a0) +; RV32-NEXT: srli a5, a2, 16 +; RV32-NEXT: sb a5, 14(a0) +; RV32-NEXT: srli a2, a2, 8 +; RV32-NEXT: sb a2, 13(a0) +; RV32-NEXT: srli a2, a3, 24 +; RV32-NEXT: sb a2, 11(a0) +; RV32-NEXT: srli a2, a3, 16 +; RV32-NEXT: sb a2, 10(a0) +; RV32-NEXT: srli a3, a3, 8 +; RV32-NEXT: sb a3, 9(a0) +; RV32-NEXT: srli a2, a4, 24 +; RV32-NEXT: sb a2, 7(a0) +; RV32-NEXT: srli a2, a4, 16 +; RV32-NEXT: sb a2, 6(a0) +; RV32-NEXT: srli a4, a4, 8 +; RV32-NEXT: sb a4, 5(a0) +; RV32-NEXT: srli a2, a1, 24 +; RV32-NEXT: sb a2, 3(a0) +; RV32-NEXT: srli a2, a1, 16 +; RV32-NEXT: sb a2, 2(a0) +; RV32-NEXT: srli a1, a1, 8 +; RV32-NEXT: sb a1, 1(a0) +; RV32-NEXT: ret +; +; RV64-LABEL: memset_17: +; RV64: # %bb.0: +; RV64-NEXT: sb a2, 8(a0) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: sb a1, 16(a0) +; RV64-NEXT: srli a3, a2, 56 +; RV64-NEXT: sb a3, 15(a0) +; RV64-NEXT: srli a3, a2, 48 +; RV64-NEXT: sb a3, 14(a0) +; RV64-NEXT: srli a3, a2, 40 +; RV64-NEXT: sb a3, 13(a0) +; RV64-NEXT: srli a3, a2, 32 +; RV64-NEXT: sb a3, 12(a0) +; RV64-NEXT: srli a3, a2, 24 +; RV64-NEXT: sb a3, 11(a0) +; RV64-NEXT: srli a3, a2, 16 +; RV64-NEXT: sb a3, 10(a0) +; RV64-NEXT: srli a2, a2, 8 +; RV64-NEXT: sb a2, 9(a0) +; RV64-NEXT: srli a2, a1, 56 +; RV64-NEXT: sb a2, 7(a0) +; RV64-NEXT: srli a2, a1, 48 +; RV64-NEXT: sb a2, 6(a0) +; RV64-NEXT: srli a2, a1, 40 +; RV64-NEXT: sb a2, 5(a0) +; RV64-NEXT: srli a2, a1, 32 +; RV64-NEXT: sb a2, 4(a0) +; RV64-NEXT: srli a2, a1, 24 +; RV64-NEXT: sb a2, 3(a0) +; RV64-NEXT: srli a2, a1, 16 +; RV64-NEXT: sb a2, 2(a0) +; RV64-NEXT: srli a1, a1, 8 +; RV64-NEXT: sb a1, 1(a0) +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memset_17: +; RV32-FAST: 
# %bb.0: +; RV32-FAST-NEXT: lw a2, 12(a1) +; RV32-FAST-NEXT: lw a3, 8(a1) +; RV32-FAST-NEXT: lw a4, 4(a1) +; RV32-FAST-NEXT: lw a1, 0(a1) +; RV32-FAST-NEXT: sw a2, 12(a0) +; RV32-FAST-NEXT: sw a3, 8(a0) +; RV32-FAST-NEXT: sw a4, 4(a0) +; RV32-FAST-NEXT: sw a1, 0(a0) +; RV32-FAST-NEXT: sb a1, 16(a0) +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memset_17: +; RV64-FAST: # %bb.0: +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: sb a1, 16(a0) +; RV64-FAST-NEXT: ret + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 17, i1 0) + ret void +} + diff --git a/llvm/test/Transforms/LoopIdiom/memset-pattern-intrinsic.ll b/llvm/test/Transforms/LoopIdiom/memset-pattern-intrinsic.ll new file mode 100644 index 0000000000000..d20c1a1523786 --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/memset-pattern-intrinsic.ll @@ -0,0 +1,141 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes="loop-idiom" -loop-idiom-enable-memset-pattern-intrinsic < %s -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +target triple = "x86_64-apple-darwin10.0.0" + + +define dso_local void @double_memset(ptr nocapture %p) { +; CHECK-LABEL: @double_memset( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.memset_pattern.p0.i128.i64(ptr [[P:%.*]], i128 85118011523600494056561698149391631982, i64 128, i1 false), !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[I_07]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %ptr1 = getelementptr inbounds double, ptr %p, i64 %i.07 + store double 3.14159e+00, ptr %ptr1, align 1, !tbaa !5 + %inc = add nuw nsw i64 %i.07, 1 + %exitcond.not = icmp eq i64 %inc, 16 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + + +define dso_local void @struct_memset(ptr nocapture %p) { +; CHECK-LABEL: @struct_memset( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.memset_pattern.p0.i128.i64(ptr [[P:%.*]], i128 85118011523600494056561698149391631982, i64 128, i1 false), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[I_07]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %ptr1 = getelementptr inbounds double, ptr %p, i64 %i.07 + store double 3.14159e+00, ptr %ptr1, align 1, !tbaa !10 + %inc = add nuw nsw i64 %i.07, 1 + %exitcond.not = icmp eq i64 %inc, 16 + br i1 %exitcond.not, label 
%for.cond.cleanup, label %for.body +} + +define dso_local void @var_memset(ptr nocapture %p, i64 %len) { +; CHECK-LABEL: @var_memset( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = shl nuw i64 [[LEN:%.*]], 3 +; CHECK-NEXT: call void @llvm.memset_pattern.p0.i128.i64(ptr [[P:%.*]], i128 85118011523600494056561698149391631982, i64 [[TMP0]], i1 false) +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[I_07]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[LEN]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %ptr1 = getelementptr inbounds double, ptr %p, i64 %i.07 + store double 3.14159e+00, ptr %ptr1, align 1, !tbaa !10 + %inc = add nuw nsw i64 %i.07, 1 + %exitcond.not = icmp eq i64 %inc, %len + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define void @test11_pattern(ptr nocapture %P) nounwind ssp { +; CHECK-LABEL: @test11_pattern( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.memset_pattern.p0.i128.i64(ptr [[P:%.*]], i128 79228162532711081671548469249, i64 40000, i1 false) +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i32, ptr [[P]], i64 [[INDVAR]] +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], 10000 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ] + %arrayidx = getelementptr i32, ptr %P, i64 %indvar + store i32 1, ptr %arrayidx, align 4 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, 10000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +!5 = !{!6, !6, i64 0} +!6 = !{!"double", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C++ TBAA"} + +!15 = !{!8, i64 0, !"omnipotent char"} +!17 = !{!15, i64 8, !"double"} +!9 = !{!15, i64 32, !"_ZTS1A", !17, i64 0, i64 8, !17, i64 8, i64 8, !17, i64 16, i64 8, !17, i64 24, i64 8} +!10 = !{!9, !17, i64 0, i64 1} + +!18 = !{!19, !20, i64 0} +!19 = !{!"A", !20, i64 0, !22, i64 8} +!20 = !{!"any pointer", !7, i64 0} +!21 = !{!22, !20, i64 0} +!22 = !{!"B", !20, i64 0} diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/lit.local.cfg b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/lit.local.cfg new file mode 100644 index 0000000000000..bb982488eb15e --- /dev/null +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/lit.local.cfg @@ -0,0 +1,2 @@ +if not "PowerPC" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll new file mode 100644 index 0000000000000..8434ca1c9016b --- /dev/null +++ 
b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll @@ -0,0 +1,8 @@ +; RUN: not opt -mtriple=powerpc64 -passes=pre-isel-intrinsic-lowering -S -o - %s 2>&1 | FileCheck %s + +; CHECK: LLVM ERROR: memset_pattern.inline expansion not currently implemented for big-endian targets + +define void @memset_pattern_x(ptr %a, i128 %value, i64 %x) nounwind { + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 %x, i1 0) + ret void +} diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg new file mode 100644 index 0000000000000..17351748513d9 --- /dev/null +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg @@ -0,0 +1,2 @@ +if not "RISCV" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern-non-power-of-two-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern-non-power-of-two-pattern.ll new file mode 100644 index 0000000000000..ce4ae0cf14c9e --- /dev/null +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern-non-power-of-two-pattern.ll @@ -0,0 +1,8 @@ +; RUN: not opt -mtriple=riscv64 -passes=pre-isel-intrinsic-lowering -S -o - %s 2>&1 | FileCheck %s + +; CHECK: LLVM ERROR: Pattern width for memset_pattern must be a power of 2 + +define void @memset_pattern_i127_x(ptr %a, i127 %value, i64 %x) nounwind { + tail call void @llvm.memset_pattern.p0.i64.i127(ptr %a, i127 %value, i64 %x, i1 0) + ret void +} diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll new file mode 100644 index 0000000000000..74ca4c4422a75 --- /dev/null +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll @@ -0,0 +1,234 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=riscv64 -passes=pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s + +define void @memset_pattern_i128_1(ptr %a, i128 %value) nounwind { +; CHECK-LABEL: define void @memset_pattern_i128_1( +; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: br i1 true, label %[[REMCHECK:.*]], label %[[STORELOOP:.*]] +; CHECK: [[STORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0]] ], [ [[TMP4:%.*]], %[[STORELOOP]] ] +; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP3]] = getelementptr inbounds i128, ptr [[TMP1]], i64 16 +; CHECK-NEXT: [[TMP4]] = sub i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[STORELOOP]], label %[[REMCHECK]] +; CHECK: [[REMCHECK]]: +; CHECK-NEXT: [[TMP6:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP3]], %[[STORELOOP]] ] +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]] +; CHECK: [[REMAINDERLOOP]]: +; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP11:%.*]], %[[REMAINDERLOOP]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP12:%.*]], %[[REMAINDERLOOP]] ] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i128 [[TMP8]] to i8 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP7]] +; CHECK-NEXT: store i8 [[TMP9]], ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11]] = add i64 [[TMP7]], 1 +; 
CHECK-NEXT: [[TMP12]] = lshr i128 [[TMP8]], 8 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ult i64 [[TMP11]], 1 +; CHECK-NEXT: br i1 [[TMP13]], label %[[REMAINDERLOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 1, i1 0) + ret void +} + +define void @memset_pattern_i128_3(ptr %a, i128 %value) nounwind { +; CHECK-LABEL: define void @memset_pattern_i128_3( +; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: br i1 true, label %[[REMCHECK:.*]], label %[[STORELOOP:.*]] +; CHECK: [[STORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0]] ], [ [[TMP4:%.*]], %[[STORELOOP]] ] +; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP3]] = getelementptr inbounds i128, ptr [[TMP1]], i64 16 +; CHECK-NEXT: [[TMP4]] = sub i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[STORELOOP]], label %[[REMCHECK]] +; CHECK: [[REMCHECK]]: +; CHECK-NEXT: [[TMP6:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP3]], %[[STORELOOP]] ] +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]] +; CHECK: [[REMAINDERLOOP]]: +; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP11:%.*]], %[[REMAINDERLOOP]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP12:%.*]], %[[REMAINDERLOOP]] ] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i128 [[TMP8]] to i8 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP7]] +; CHECK-NEXT: store i8 [[TMP9]], ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11]] = add i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP12]] = lshr i128 [[TMP8]], 8 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ult i64 [[TMP11]], 3 +; CHECK-NEXT: br i1 [[TMP13]], label %[[REMAINDERLOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 3, i1 0) + ret void +} + +define void @memset_pattern_i128_14(ptr %a, i128 %value) nounwind { +; CHECK-LABEL: define void @memset_pattern_i128_14( +; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: br i1 true, label %[[REMCHECK:.*]], label %[[STORELOOP:.*]] +; CHECK: [[STORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0]] ], [ [[TMP4:%.*]], %[[STORELOOP]] ] +; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP3]] = getelementptr inbounds i128, ptr [[TMP1]], i64 16 +; CHECK-NEXT: [[TMP4]] = sub i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[STORELOOP]], label %[[REMCHECK]] +; CHECK: [[REMCHECK]]: +; CHECK-NEXT: [[TMP6:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP3]], %[[STORELOOP]] ] +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]] +; CHECK: [[REMAINDERLOOP]]: +; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP11:%.*]], %[[REMAINDERLOOP]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP12:%.*]], %[[REMAINDERLOOP]] ] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i128 [[TMP8]] to i8 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP7]] +; CHECK-NEXT: store i8 [[TMP9]], ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11]] = add i64 [[TMP7]], 1 +; 
CHECK-NEXT: [[TMP12]] = lshr i128 [[TMP8]], 8 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ult i64 [[TMP11]], 14 +; CHECK-NEXT: br i1 [[TMP13]], label %[[REMAINDERLOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 14, i1 0) + ret void +} + +define void @memset_pattern_i128_16(ptr %a, i128 %value) nounwind { +; CHECK-LABEL: define void @memset_pattern_i128_16( +; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: br i1 false, label %[[REMCHECK:.*]], label %[[STORELOOP:.*]] +; CHECK: [[STORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 1, [[TMP0]] ], [ [[TMP4:%.*]], %[[STORELOOP]] ] +; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP3]] = getelementptr inbounds i128, ptr [[TMP1]], i64 16 +; CHECK-NEXT: [[TMP4]] = sub i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[STORELOOP]], label %[[REMCHECK]] +; CHECK: [[REMCHECK]]: +; CHECK-NEXT: [[TMP6:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP3]], %[[STORELOOP]] ] +; CHECK-NEXT: br i1 true, label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]] +; CHECK: [[REMAINDERLOOP]]: +; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP11:%.*]], %[[REMAINDERLOOP]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP12:%.*]], %[[REMAINDERLOOP]] ] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i128 [[TMP8]] to i8 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP7]] +; CHECK-NEXT: store i8 [[TMP9]], ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11]] = add i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP12]] = lshr i128 [[TMP8]], 8 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ult i64 [[TMP11]], 0 +; CHECK-NEXT: br i1 [[TMP13]], label %[[REMAINDERLOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 16, i1 0) + ret void +} + +define void @memset_pattern_i128_38(ptr %a, i128 %value) nounwind { +; CHECK-LABEL: define void @memset_pattern_i128_38( +; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: br i1 false, label %[[REMCHECK:.*]], label %[[STORELOOP:.*]] +; CHECK: [[STORELOOP]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 2, [[TMP0]] ], [ [[TMP4:%.*]], %[[STORELOOP]] ] +; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP3]] = getelementptr inbounds i128, ptr [[TMP1]], i64 16 +; CHECK-NEXT: [[TMP4]] = sub i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[STORELOOP]], label %[[REMCHECK]] +; CHECK: [[REMCHECK]]: +; CHECK-NEXT: [[TMP6:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP3]], %[[STORELOOP]] ] +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]] +; CHECK: [[REMAINDERLOOP]]: +; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP11:%.*]], %[[REMAINDERLOOP]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP12:%.*]], %[[REMAINDERLOOP]] ] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i128 [[TMP8]] to i8 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 [[TMP7]] +; CHECK-NEXT: store i8 [[TMP9]], ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11]] = add i64 [[TMP7]], 1 +; 
CHECK-NEXT: [[TMP12]] = lshr i128 [[TMP8]], 8 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ult i64 [[TMP11]], 6 +; CHECK-NEXT: br i1 [[TMP13]], label %[[REMAINDERLOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 38, i1 0) + ret void +} + +define void @memset_pattern_i128_x(ptr %a, i128 %value, i64 %x) nounwind { +; CHECK-LABEL: define void @memset_pattern_i128_x( +; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[X]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[X]], 15 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 0, [[TMP1]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[REMCHECK:.*]], label %[[STORELOOP:.*]] +; CHECK: [[STORELOOP]]: +; CHECK-NEXT: [[TMP4:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[STORELOOP]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i64 [ [[TMP1]], [[TMP0]] ], [ [[TMP7:%.*]], %[[STORELOOP]] ] +; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP6]] = getelementptr inbounds i128, ptr [[TMP4]], i64 16 +; CHECK-NEXT: [[TMP7]] = sub i64 [[TMP5]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label %[[STORELOOP]], label %[[REMCHECK]] +; CHECK: [[REMCHECK]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP6]], %[[STORELOOP]] ] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP10]], label %[[SPLIT:.*]], label %[[REMAINDERLOOP:.*]] +; CHECK: [[REMAINDERLOOP]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP15:%.*]], %[[REMAINDERLOOP]] ] +; CHECK-NEXT: [[TMP12:%.*]] = phi i128 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP16:%.*]], %[[REMAINDERLOOP]] ] +; CHECK-NEXT: [[TMP13:%.*]] = trunc i128 [[TMP12]] to i8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP11]] +; CHECK-NEXT: store i8 [[TMP13]], ptr [[TMP14]], align 1 +; CHECK-NEXT: [[TMP15]] = add i64 [[TMP11]], 1 +; CHECK-NEXT: [[TMP16]] = lshr i128 [[TMP12]], 8 +; CHECK-NEXT: [[TMP17:%.*]] = icmp ult i64 [[TMP15]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP17]], label %[[REMAINDERLOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + tail call void @llvm.memset_pattern.p0.i64.i128(ptr %a, i128 %value, i64 %x, i1 0) + ret void +} + +define void @memset_pattern_i256_x(ptr %a, i256 %value, i64 %x) nounwind { +; CHECK-LABEL: define void @memset_pattern_i256_x( +; CHECK-SAME: ptr [[A:%.*]], i256 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[X]], 5 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[X]], 31 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 0, [[TMP1]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[REMCHECK:.*]], label %[[STORELOOP:.*]] +; CHECK: [[STORELOOP]]: +; CHECK-NEXT: [[TMP4:%.*]] = phi ptr [ [[A]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[STORELOOP]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i64 [ [[TMP1]], [[TMP0]] ], [ [[TMP7:%.*]], %[[STORELOOP]] ] +; CHECK-NEXT: store i256 [[VALUE]], ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP6]] = getelementptr inbounds i256, ptr [[TMP4]], i64 32 +; CHECK-NEXT: [[TMP7]] = sub i64 [[TMP5]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label %[[STORELOOP]], label %[[REMCHECK]] +; CHECK: [[REMCHECK]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi ptr [ [[A]], [[TMP0]] ], [ [[TMP6]], %[[STORELOOP]] ] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP10]], label %[[SPLIT:.*]], label 
%[[REMAINDERLOOP:.*]] +; CHECK: [[REMAINDERLOOP]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ 0, %[[REMCHECK]] ], [ [[TMP15:%.*]], %[[REMAINDERLOOP]] ] +; CHECK-NEXT: [[TMP12:%.*]] = phi i256 [ [[VALUE]], %[[REMCHECK]] ], [ [[TMP16:%.*]], %[[REMAINDERLOOP]] ] +; CHECK-NEXT: [[TMP13:%.*]] = trunc i256 [[TMP12]] to i8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP11]] +; CHECK-NEXT: store i8 [[TMP13]], ptr [[TMP14]], align 1 +; CHECK-NEXT: [[TMP15]] = add i64 [[TMP11]], 1 +; CHECK-NEXT: [[TMP16]] = lshr i256 [[TMP12]], 8 +; CHECK-NEXT: [[TMP17:%.*]] = icmp ult i64 [[TMP15]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP17]], label %[[REMAINDERLOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + tail call void @llvm.memset_pattern.p0.i64.i256(ptr %a, i256 %value, i64 %x, i1 0) + ret void +} diff --git a/llvm/test/Verifier/intrinsic-immarg.ll b/llvm/test/Verifier/intrinsic-immarg.ll index b1b9f7ee4be11..37745c6a9acee 100644 --- a/llvm/test/Verifier/intrinsic-immarg.ll +++ b/llvm/test/Verifier/intrinsic-immarg.ll @@ -72,6 +72,16 @@ define void @memset_inline_is_volatile(ptr %dest, i8 %value, i1 %is.volatile) { } +declare void @llvm.memset_pattern.p0.i32.i32(ptr nocapture, i32, i32, i1) +define void @memset_pattern_is_volatile(ptr %dest, i32 %value, i1 %is.volatile) { + ; CHECK: immarg operand has non-immediate parameter + ; CHECK-NEXT: i1 %is.volatile + ; CHECK-NEXT: call void @llvm.memset_pattern.p0.i32.i32(ptr %dest, i32 %value, i32 8, i1 %is.volatile) + call void @llvm.memset_pattern.p0.i32.i32(ptr %dest, i32 %value, i32 8, i1 %is.volatile) + ret void +} + + declare i64 @llvm.objectsize.i64.p0(ptr, i1, i1, i1) define void @objectsize(ptr %ptr, i1 %a, i1 %b, i1 %c) { ; CHECK: immarg operand has non-immediate parameter diff --git a/llvm/test/Verifier/memset-pattern-inline.ll b/llvm/test/Verifier/memset-pattern-inline.ll new file mode 100644 index 0000000000000..7f2e01ef99ea5 --- /dev/null +++ b/llvm/test/Verifier/memset-pattern-inline.ll @@ -0,0 +1,9 @@ +; RUN: not opt -passes=verify < %s 2>&1 | FileCheck %s + +; CHECK: alignment is not a power of two + +define void @foo(ptr %P, i32 %value) { + call void @llvm.memset_pattern.p0.i32.i32(ptr align 3 %P, i32 %value, i32 4, i1 false) + ret void +} +declare void @llvm.memset_pattern.p0.i32.i32(ptr nocapture, i32, i32, i1) nounwind