diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index a956e6869bf90..abba51770693f 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15434,6 +15434,63 @@ The behavior of '``llvm.memset.inline.*``' is equivalent to the behavior of
 '``llvm.memset.*``', but the generated code is guaranteed not to call any
 external functions.
 
+.. _int_experimental_memset_pattern:
+
+'``llvm.experimental.memset.pattern``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use
+``llvm.experimental.memset.pattern`` on any integer bit width and for
+different address spaces. Not all targets support all bit widths, however.
+
+::
+
+      declare void @llvm.experimental.memset.pattern.p0.i128.i64(ptr <dest>, i128 <value>,
+                                                                 i64 <count>, i1 <isvolatile>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.memset.pattern.*``' intrinsics fill a block of memory
+with a given pattern value. This may be expanded to an inline loop, a sequence
+of stores, or a libcall depending on what is available for the target and the
+expected performance and code size impact.
+
+Arguments:
+""""""""""
+
+The first argument is a pointer to the destination to fill, the second is the
+value with which to fill it, the third is an integer specifying the number of
+times to store the value, and the fourth is a boolean indicating a volatile
+access.
+
+The :ref:`align <attr_align>` parameter attribute can be provided for the
+first argument.
+
+If the ``isvolatile`` parameter is ``true``, the
+``llvm.experimental.memset.pattern`` call is a :ref:`volatile operation
+<volatile>`. The detailed access behavior is not very cleanly specified and it
+is unwise to depend on it.
+
+Semantics:
+""""""""""
+
+The '``llvm.experimental.memset.pattern*``' intrinsic fills memory starting at
+the destination location, writing the given pattern ``<count>`` times and
+incrementing the destination by the allocation size of the pattern type after
+each store. The stores follow the usual semantics of store instructions,
+including with respect to endianness and padding. If the destination is known
+to be aligned to some boundary, this can be specified as an attribute on the
+first argument.
+
+If ``<count>`` is 0, it is a no-op modulo the behavior of attributes attached
+to the arguments.
+If ``<count>`` is not a well-defined value, the behavior is undefined.
+If ``<count>`` is not zero, ``<value>`` should be well-defined, otherwise the
+behavior is undefined.
+
 .. _int_sqrt:
 '``llvm.sqrt.*``' Intrinsic
diff --git a/llvm/include/llvm/IR/InstVisitor.h b/llvm/include/llvm/IR/InstVisitor.h
index 311e0ac47ddfa..5fc6fbfd0f28e 100644
--- a/llvm/include/llvm/IR/InstVisitor.h
+++ b/llvm/include/llvm/IR/InstVisitor.h
@@ -208,6 +208,9 @@ class InstVisitor {
   RetTy visitDbgInfoIntrinsic(DbgInfoIntrinsic &I){ DELEGATE(IntrinsicInst); }
   RetTy visitMemSetInst(MemSetInst &I)            { DELEGATE(MemIntrinsic); }
   RetTy visitMemSetInlineInst(MemSetInlineInst &I){ DELEGATE(MemSetInst); }
+  RetTy visitMemSetPatternInst(MemSetPatternInst &I) {
+    DELEGATE(IntrinsicInst);
+  }
   RetTy visitMemCpyInst(MemCpyInst &I)            { DELEGATE(MemTransferInst); }
   RetTy visitMemCpyInlineInst(MemCpyInlineInst &I){ DELEGATE(MemCpyInst); }
   RetTy visitMemMoveInst(MemMoveInst &I)          { DELEGATE(MemTransferInst); }
@@ -295,6 +298,8 @@ class InstVisitor {
     case Intrinsic::memset:        DELEGATE(MemSetInst);
     case Intrinsic::memset_inline: DELEGATE(MemSetInlineInst);
+    case Intrinsic::experimental_memset_pattern:
+      DELEGATE(MemSetPatternInst);
     case Intrinsic::vastart:       DELEGATE(VAStartInst);
     case Intrinsic::vaend:         DELEGATE(VAEndInst);
     case Intrinsic::vacopy:        DELEGATE(VACopyInst);
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 920eed01374c8..3436216d478e3 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -1263,6 +1263,41 @@ class MemSetInlineInst : public MemSetInst {
   }
 };
 
+/// This is the base class for llvm.experimental.memset.pattern
+class MemSetPatternIntrinsic : public MemIntrinsicBase<MemSetPatternIntrinsic> {
+private:
+  enum { ARG_VOLATILE = 3 };
+
+public:
+  ConstantInt *getVolatileCst() const {
+    return cast<ConstantInt>(const_cast<Value *>(getArgOperand(ARG_VOLATILE)));
+  }
+
+  bool isVolatile() const { return !getVolatileCst()->isZero(); }
+
+  void setVolatile(Constant *V) { setArgOperand(ARG_VOLATILE, V); }
+
+  // Methods for support of type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::experimental_memset_pattern;
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+};
+
+/// This class wraps the llvm.experimental.memset.pattern intrinsic.
+class MemSetPatternInst : public MemSetBase<MemSetPatternIntrinsic> {
+public:
+  // Methods for support of type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::experimental_memset_pattern;
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+};
+
 /// This class wraps the llvm.memcpy/memmove intrinsics.
 class MemTransferInst : public MemTransferBase<MemIntrinsic> {
 public:
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 8ed57f818d600..e46335688065a 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1006,6 +1006,17 @@ def int_memset_inline
                        NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
                        ImmArg<ArgIndex<3>>]>;
 
+// Memset variant that writes a given pattern.
+def int_experimental_memset_pattern
+    : Intrinsic<[],
+      [llvm_anyptr_ty, // Destination.
+       llvm_anyint_ty, // Pattern value.
+       llvm_anyint_ty, // Count (number of times to fill value).
+       llvm_i1_ty],    // IsVolatile.
+      [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, IntrNoFree, IntrNoCallback,
+       NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
+       ImmArg<ArgIndex<3>>]>;
+
 // FIXME: Add version of these floating point intrinsics which allow non-default
 // rounding modes and FP exception handling.
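For illustration only (this snippet is not part of the patch), a direct IR use of the intrinsic defined above could look as follows. The function name `@fill_with_pattern` and the constant are invented; the overload suffix encodes the destination address space, pattern type, and count type, in that order.

```llvm
; Hypothetical example: store the 32-bit pattern 0xDEADBEEF sixteen times,
; starting at %buf and advancing by 4 bytes (the allocation size of i32)
; after each store.
define void @fill_with_pattern(ptr %buf) {
  call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr align 4 %buf, i32 3735928559, i64 16, i1 0)
  ret void
}

declare void @llvm.experimental.memset.pattern.p0.i32.i64(ptr, i32, i64, i1)
```

With this patch, PreISelIntrinsicLowering (changed below) expands such a call into a simple store loop; targets may later provide better lowerings.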
diff --git a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
index 314435324b473..1007d282b2ac5 100644
--- a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -25,6 +25,7 @@ class Instruction;
 class MemCpyInst;
 class MemMoveInst;
 class MemSetInst;
+class MemSetPatternInst;
 class ScalarEvolution;
 class TargetTransformInfo;
 class Value;
@@ -57,6 +58,9 @@ bool expandMemMoveAsLoop(MemMoveInst *MemMove, const TargetTransformInfo &TTI);
 /// Expand \p MemSet as a loop. \p MemSet is not deleted.
 void expandMemSetAsLoop(MemSetInst *MemSet);
 
+/// Expand \p MemSet as a loop. \p MemSet is not deleted.
+void expandMemSetPatternAsLoop(MemSetPatternInst *MemSet);
+
 /// Expand \p AtomicMemCpy as a loop. \p AtomicMemCpy is not deleted.
 void expandAtomicMemCpyAsLoop(AtomicMemCpyInst *AtomicMemCpy,
                               const TargetTransformInfo &TTI,
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 3373b76edb268..4a3d1673c2a7c 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -320,6 +320,13 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
       Memset->eraseFromParent();
       break;
     }
+    case Intrinsic::experimental_memset_pattern: {
+      auto *Memset = cast<MemSetPatternInst>(Inst);
+      expandMemSetPatternAsLoop(Memset);
+      Changed = true;
+      Memset->eraseFromParent();
+      break;
+    }
     default:
       llvm_unreachable("unhandled intrinsic");
     }
@@ -339,6 +346,7 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
     case Intrinsic::memmove:
     case Intrinsic::memset:
     case Intrinsic::memset_inline:
+    case Intrinsic::experimental_memset_pattern:
       Changed |= expandMemIntrinsicUses(F);
       break;
     case Intrinsic::load_relative:
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 85e60452b75c3..791391698ac27 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5519,7 +5519,8 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
   case Intrinsic::memcpy_inline:
   case Intrinsic::memmove:
   case Intrinsic::memset:
-  case Intrinsic::memset_inline: {
+  case Intrinsic::memset_inline:
+  case Intrinsic::experimental_memset_pattern: {
     break;
   }
   case Intrinsic::memcpy_element_unordered_atomic:
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 546217093550a..91291b429ea43 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -970,6 +970,15 @@ void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
                    Memset->isVolatile());
 }
 
+void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) {
+  createMemSetLoop(/* InsertBefore=*/Memset,
+                   /* DstAddr=*/Memset->getRawDest(),
+                   /* CopyLen=*/Memset->getLength(),
+                   /* SetValue=*/Memset->getValue(),
+                   /* Alignment=*/Memset->getDestAlign().valueOrOne(),
+                   Memset->isVolatile());
+}
+
 void llvm::expandAtomicMemCpyAsLoop(AtomicMemCpyInst *AtomicMemcpy,
                                     const TargetTransformInfo &TTI,
                                     ScalarEvolution *SE) {
diff --git a/llvm/test/CodeGen/RISCV/memset-pattern.ll b/llvm/test/CodeGen/RISCV/memset-pattern.ll
new file mode 100644
index 0000000000000..14bdad0a88af4
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/memset-pattern.ll
@@ -0,0 +1,297 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m
\ +; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+m \ +; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64 +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+unaligned-scalar-mem \ +; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+unaligned-scalar-mem \ +; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST + +; TODO: Due to the initial naive lowering implementation of memset.pattern in +; PreISelIntrinsicLowering, the generated code is not good. + +define void @memset_1(ptr %a, i128 %value) nounwind { +; RV32-BOTH-LABEL: memset_1: +; RV32-BOTH: # %bb.0: # %loadstoreloop.preheader +; RV32-BOTH-NEXT: lw a2, 0(a1) +; RV32-BOTH-NEXT: lw a3, 4(a1) +; RV32-BOTH-NEXT: lw a4, 8(a1) +; RV32-BOTH-NEXT: lw a1, 12(a1) +; RV32-BOTH-NEXT: li a5, 0 +; RV32-BOTH-NEXT: li a6, 0 +; RV32-BOTH-NEXT: .LBB0_1: # %loadstoreloop +; RV32-BOTH-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-BOTH-NEXT: slli a7, a5, 4 +; RV32-BOTH-NEXT: add a7, a0, a7 +; RV32-BOTH-NEXT: addi a5, a5, 1 +; RV32-BOTH-NEXT: seqz t0, a5 +; RV32-BOTH-NEXT: add a6, a6, t0 +; RV32-BOTH-NEXT: or t0, a5, a6 +; RV32-BOTH-NEXT: sw a2, 0(a7) +; RV32-BOTH-NEXT: sw a3, 4(a7) +; RV32-BOTH-NEXT: sw a4, 8(a7) +; RV32-BOTH-NEXT: sw a1, 12(a7) +; RV32-BOTH-NEXT: beqz t0, .LBB0_1 +; RV32-BOTH-NEXT: # %bb.2: # %split +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: memset_1: +; RV64-BOTH: # %bb.0: # %loadstoreloop.preheader +; RV64-BOTH-NEXT: addi a3, a0, 16 +; RV64-BOTH-NEXT: .LBB0_1: # %loadstoreloop +; RV64-BOTH-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: sd a2, 8(a0) +; RV64-BOTH-NEXT: addi a0, a0, 16 +; RV64-BOTH-NEXT: bne a0, a3, .LBB0_1 +; RV64-BOTH-NEXT: # %bb.2: # %split +; RV64-BOTH-NEXT: ret + tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 1, i1 0) + ret void +} + +define void @memset_1_noalign(ptr %a, i128 %value) nounwind { +; RV32-LABEL: memset_1_noalign: +; RV32: # %bb.0: # %loadstoreloop.preheader +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: li a2, 0 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: lw a4, 4(a1) +; RV32-NEXT: lw a5, 0(a1) +; RV32-NEXT: lw a6, 8(a1) +; RV32-NEXT: lw a1, 12(a1) +; RV32-NEXT: srli a7, a4, 24 +; RV32-NEXT: srli t0, a4, 16 +; RV32-NEXT: srli t1, a4, 8 +; RV32-NEXT: srli t2, a5, 24 +; RV32-NEXT: srli t3, a5, 16 +; RV32-NEXT: srli t4, a5, 8 +; RV32-NEXT: srli t5, a6, 24 +; RV32-NEXT: srli t6, a6, 16 +; RV32-NEXT: srli s0, a6, 8 +; RV32-NEXT: srli s1, a1, 24 +; RV32-NEXT: srli s2, a1, 16 +; RV32-NEXT: srli s3, a1, 8 +; RV32-NEXT: .LBB1_1: # %loadstoreloop +; RV32-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: slli s4, a2, 4 +; RV32-NEXT: add s4, a0, s4 +; RV32-NEXT: sb a4, 4(s4) +; RV32-NEXT: sb t1, 5(s4) +; RV32-NEXT: sb t0, 6(s4) +; RV32-NEXT: sb a7, 7(s4) +; RV32-NEXT: sb a5, 0(s4) +; RV32-NEXT: sb t4, 1(s4) +; RV32-NEXT: sb t3, 2(s4) +; RV32-NEXT: sb t2, 3(s4) +; RV32-NEXT: sb a6, 8(s4) +; RV32-NEXT: sb s0, 9(s4) +; RV32-NEXT: sb t6, 10(s4) +; RV32-NEXT: sb t5, 11(s4) +; RV32-NEXT: addi a2, a2, 1 +; RV32-NEXT: seqz s5, a2 +; RV32-NEXT: add a3, a3, s5 +; RV32-NEXT: or s5, a2, a3 +; RV32-NEXT: sb a1, 12(s4) +; RV32-NEXT: sb s3, 13(s4) +; 
RV32-NEXT: sb s2, 14(s4) +; RV32-NEXT: sb s1, 15(s4) +; RV32-NEXT: beqz s5, .LBB1_1 +; RV32-NEXT: # %bb.2: # %split +; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 8(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: ret +; +; RV64-LABEL: memset_1_noalign: +; RV64: # %bb.0: # %loadstoreloop.preheader +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: sd s0, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: addi a3, a0, 16 +; RV64-NEXT: srli a4, a1, 56 +; RV64-NEXT: srli a5, a1, 48 +; RV64-NEXT: srli a6, a1, 40 +; RV64-NEXT: srli a7, a1, 32 +; RV64-NEXT: srli t0, a1, 24 +; RV64-NEXT: srli t1, a1, 16 +; RV64-NEXT: srli t2, a1, 8 +; RV64-NEXT: srli t3, a2, 56 +; RV64-NEXT: srli t4, a2, 48 +; RV64-NEXT: srli t5, a2, 40 +; RV64-NEXT: srli t6, a2, 32 +; RV64-NEXT: srli s0, a2, 24 +; RV64-NEXT: srli s1, a2, 16 +; RV64-NEXT: srli s2, a2, 8 +; RV64-NEXT: .LBB1_1: # %loadstoreloop +; RV64-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64-NEXT: sb a7, 4(a0) +; RV64-NEXT: sb a6, 5(a0) +; RV64-NEXT: sb a5, 6(a0) +; RV64-NEXT: sb a4, 7(a0) +; RV64-NEXT: sb a1, 0(a0) +; RV64-NEXT: sb t2, 1(a0) +; RV64-NEXT: sb t1, 2(a0) +; RV64-NEXT: sb t0, 3(a0) +; RV64-NEXT: sb t6, 12(a0) +; RV64-NEXT: sb t5, 13(a0) +; RV64-NEXT: sb t4, 14(a0) +; RV64-NEXT: sb t3, 15(a0) +; RV64-NEXT: sb a2, 8(a0) +; RV64-NEXT: sb s2, 9(a0) +; RV64-NEXT: sb s1, 10(a0) +; RV64-NEXT: sb s0, 11(a0) +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: bne a0, a3, .LBB1_1 +; RV64-NEXT: # %bb.2: # %split +; RV64-NEXT: ld s0, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: ret +; +; RV32-FAST-LABEL: memset_1_noalign: +; RV32-FAST: # %bb.0: # %loadstoreloop.preheader +; RV32-FAST-NEXT: lw a2, 0(a1) +; RV32-FAST-NEXT: lw a3, 4(a1) +; RV32-FAST-NEXT: lw a4, 8(a1) +; RV32-FAST-NEXT: lw a1, 12(a1) +; RV32-FAST-NEXT: li a5, 0 +; RV32-FAST-NEXT: li a6, 0 +; RV32-FAST-NEXT: .LBB1_1: # %loadstoreloop +; RV32-FAST-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-FAST-NEXT: slli a7, a5, 4 +; RV32-FAST-NEXT: add a7, a0, a7 +; RV32-FAST-NEXT: addi a5, a5, 1 +; RV32-FAST-NEXT: seqz t0, a5 +; RV32-FAST-NEXT: add a6, a6, t0 +; RV32-FAST-NEXT: or t0, a5, a6 +; RV32-FAST-NEXT: sw a2, 0(a7) +; RV32-FAST-NEXT: sw a3, 4(a7) +; RV32-FAST-NEXT: sw a4, 8(a7) +; RV32-FAST-NEXT: sw a1, 12(a7) +; RV32-FAST-NEXT: beqz t0, .LBB1_1 +; RV32-FAST-NEXT: # %bb.2: # %split +; RV32-FAST-NEXT: ret +; +; RV64-FAST-LABEL: memset_1_noalign: +; RV64-FAST: # %bb.0: # %loadstoreloop.preheader +; RV64-FAST-NEXT: addi a3, a0, 16 +; RV64-FAST-NEXT: .LBB1_1: # %loadstoreloop +; RV64-FAST-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64-FAST-NEXT: sd a1, 0(a0) +; RV64-FAST-NEXT: sd a2, 8(a0) +; RV64-FAST-NEXT: addi a0, a0, 16 +; RV64-FAST-NEXT: bne a0, a3, .LBB1_1 +; RV64-FAST-NEXT: # %bb.2: # %split +; RV64-FAST-NEXT: ret + tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 1, i1 0) + ret void +} + +define void @memset_4(ptr %a, i128 %value) nounwind { +; RV32-BOTH-LABEL: memset_4: +; RV32-BOTH: # %bb.0: # %loadstoreloop.preheader +; RV32-BOTH-NEXT: lw a2, 0(a1) +; RV32-BOTH-NEXT: lw a3, 4(a1) +; RV32-BOTH-NEXT: lw a4, 
8(a1) +; RV32-BOTH-NEXT: lw a1, 12(a1) +; RV32-BOTH-NEXT: li a5, 0 +; RV32-BOTH-NEXT: li a6, 0 +; RV32-BOTH-NEXT: .LBB2_1: # %loadstoreloop +; RV32-BOTH-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-BOTH-NEXT: slli a7, a5, 4 +; RV32-BOTH-NEXT: add a7, a0, a7 +; RV32-BOTH-NEXT: addi a5, a5, 1 +; RV32-BOTH-NEXT: seqz t0, a5 +; RV32-BOTH-NEXT: add a6, a6, t0 +; RV32-BOTH-NEXT: seqz t0, a6 +; RV32-BOTH-NEXT: sltiu t1, a5, 4 +; RV32-BOTH-NEXT: and t0, t0, t1 +; RV32-BOTH-NEXT: sw a2, 0(a7) +; RV32-BOTH-NEXT: sw a3, 4(a7) +; RV32-BOTH-NEXT: sw a4, 8(a7) +; RV32-BOTH-NEXT: sw a1, 12(a7) +; RV32-BOTH-NEXT: bnez t0, .LBB2_1 +; RV32-BOTH-NEXT: # %bb.2: # %split +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: memset_4: +; RV64-BOTH: # %bb.0: # %loadstoreloop.preheader +; RV64-BOTH-NEXT: addi a3, a0, 64 +; RV64-BOTH-NEXT: .LBB2_1: # %loadstoreloop +; RV64-BOTH-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: sd a2, 8(a0) +; RV64-BOTH-NEXT: addi a0, a0, 16 +; RV64-BOTH-NEXT: bne a0, a3, .LBB2_1 +; RV64-BOTH-NEXT: # %bb.2: # %split +; RV64-BOTH-NEXT: ret + tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 4, i1 0) + ret void +} + +define void @memset_x(ptr %a, i128 %value, i64 %x) nounwind { +; RV32-BOTH-LABEL: memset_x: +; RV32-BOTH: # %bb.0: +; RV32-BOTH-NEXT: or a4, a2, a3 +; RV32-BOTH-NEXT: beqz a4, .LBB3_5 +; RV32-BOTH-NEXT: # %bb.1: # %loadstoreloop.preheader +; RV32-BOTH-NEXT: lw a4, 0(a1) +; RV32-BOTH-NEXT: lw a5, 4(a1) +; RV32-BOTH-NEXT: lw a6, 8(a1) +; RV32-BOTH-NEXT: lw a1, 12(a1) +; RV32-BOTH-NEXT: li a7, 0 +; RV32-BOTH-NEXT: li t0, 0 +; RV32-BOTH-NEXT: j .LBB3_3 +; RV32-BOTH-NEXT: .LBB3_2: # %loadstoreloop +; RV32-BOTH-NEXT: # in Loop: Header=BB3_3 Depth=1 +; RV32-BOTH-NEXT: sltu t1, t0, a3 +; RV32-BOTH-NEXT: beqz t1, .LBB3_5 +; RV32-BOTH-NEXT: .LBB3_3: # %loadstoreloop +; RV32-BOTH-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-BOTH-NEXT: slli t1, a7, 4 +; RV32-BOTH-NEXT: add t1, a0, t1 +; RV32-BOTH-NEXT: addi a7, a7, 1 +; RV32-BOTH-NEXT: seqz t2, a7 +; RV32-BOTH-NEXT: add t0, t0, t2 +; RV32-BOTH-NEXT: sw a4, 0(t1) +; RV32-BOTH-NEXT: sw a5, 4(t1) +; RV32-BOTH-NEXT: sw a6, 8(t1) +; RV32-BOTH-NEXT: sw a1, 12(t1) +; RV32-BOTH-NEXT: bne t0, a3, .LBB3_2 +; RV32-BOTH-NEXT: # %bb.4: # in Loop: Header=BB3_3 Depth=1 +; RV32-BOTH-NEXT: sltu t1, a7, a2 +; RV32-BOTH-NEXT: bnez t1, .LBB3_3 +; RV32-BOTH-NEXT: .LBB3_5: # %split +; RV32-BOTH-NEXT: ret +; +; RV64-BOTH-LABEL: memset_x: +; RV64-BOTH: # %bb.0: +; RV64-BOTH-NEXT: beqz a3, .LBB3_3 +; RV64-BOTH-NEXT: # %bb.1: # %loadstoreloop.preheader +; RV64-BOTH-NEXT: li a4, 0 +; RV64-BOTH-NEXT: .LBB3_2: # %loadstoreloop +; RV64-BOTH-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: sd a2, 8(a0) +; RV64-BOTH-NEXT: addi a4, a4, 1 +; RV64-BOTH-NEXT: addi a0, a0, 16 +; RV64-BOTH-NEXT: bltu a4, a3, .LBB3_2 +; RV64-BOTH-NEXT: .LBB3_3: # %split +; RV64-BOTH-NEXT: ret + tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 %x, i1 0) + ret void +} diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/lit.local.cfg b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/lit.local.cfg new file mode 100644 index 0000000000000..bb982488eb15e --- /dev/null +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/lit.local.cfg @@ -0,0 +1,2 @@ +if not "PowerPC" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll 
b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll new file mode 100644 index 0000000000000..1f77c4a605106 --- /dev/null +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=powerpc64 -passes=pre-isel-intrinsic-lowering -S -o - %s 2>&1 | FileCheck %s + +; Simple smoke test that memset.pattern is still expanded on big endian +; targets. + +define void @memset.pattern(ptr %a, i128 %value, i64 %x) nounwind { +; CHECK-LABEL: define void @memset.pattern( +; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 0, [[X]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[X]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 %x, i1 0) + ret void +} diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg new file mode 100644 index 0000000000000..17351748513d9 --- /dev/null +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/lit.local.cfg @@ -0,0 +1,2 @@ +if not "RISCV" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll new file mode 100644 index 0000000000000..d3ef9fe4cefbd --- /dev/null +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=riscv64 -passes=pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s + +define void @memset_pattern_i128_1(ptr %a, i128 %value) nounwind { +; CHECK-LABEL: define void @memset_pattern_i128_1( +; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP3]] = add i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 1, i1 0) + ret void +} + +define void @memset_pattern_i128_16(ptr %a, i128 %value) nounwind { +; CHECK-LABEL: define void @memset_pattern_i128_16( +; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], 
%[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP3]] = add i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16 +; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 16, i1 0) + ret void +} + +define void @memset_pattern_i127_x(ptr %a, i127 %value, i64 %x) nounwind { +; CHECK-LABEL: define void @memset_pattern_i127_x( +; CHECK-SAME: ptr [[A:%.*]], i127 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 0, [[X]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i127, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: store i127 [[VALUE]], ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[X]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + tail call void @llvm.experimental.memset.pattern(ptr %a, i127 %value, i64 %x, i1 0) + ret void +} + +define void @memset_pattern_i128_x(ptr %a, i128 %value, i64 %x) nounwind { +; CHECK-LABEL: define void @memset_pattern_i128_x( +; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 0, [[X]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP6]] = add i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP6]], [[X]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 %x, i1 0) + ret void +} + +define void @memset_pattern_i256_x(ptr %a, i256 %value, i64 %x) nounwind { +; CHECK-LABEL: define void @memset_pattern_i256_x( +; CHECK-SAME: ptr [[A:%.*]], i256 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 0, [[X]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i256, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: store i256 [[VALUE]], ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP6]] = add i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP6]], [[X]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: ret void +; + tail call void @llvm.experimental.memset.pattern(ptr %a, i256 %value, i64 %x, i1 0) + ret void +} + +; The common alignment of the allocation of the pattern stride (its allocation +; size) and the destination pointer should be used. 
+define void @memset_pattern_i15_x_alignment(ptr %a, i15 %value, i64 %x) nounwind { +; CHECK-LABEL: define void @memset_pattern_i15_x_alignment( +; CHECK-SAME: ptr [[A:%.*]], i15 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 0, [[X]] +; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]] +; CHECK: [[LOADSTORELOOP]]: +; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i15, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: store i15 [[VALUE]], ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[X]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]] +; CHECK: [[SPLIT]]: +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 0, [[X]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[SPLIT1:.*]], label %[[LOADSTORELOOP2:.*]] +; CHECK: [[LOADSTORELOOP2]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ 0, %[[SPLIT]] ], [ [[TMP9:%.*]], %[[LOADSTORELOOP2]] ] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i15, ptr [[A]], i64 [[TMP11]] +; CHECK-NEXT: store i15 [[VALUE]], ptr [[TMP8]], align 2 +; CHECK-NEXT: [[TMP9]] = add i64 [[TMP11]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[TMP9]], [[X]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[LOADSTORELOOP2]], label %[[SPLIT1]] +; CHECK: [[SPLIT1]]: +; CHECK-NEXT: ret void +; + call void @llvm.experimental.memset.pattern(ptr align 1 %a, i15 %value, i64 %x, i1 0) + call void @llvm.experimental.memset.pattern(ptr align 2 %a, i15 %value, i64 %x, i1 0) + ret void +} diff --git a/llvm/test/Verifier/intrinsic-immarg.ll b/llvm/test/Verifier/intrinsic-immarg.ll index e310cd2e0b781..ab1286e8a3d76 100644 --- a/llvm/test/Verifier/intrinsic-immarg.ll +++ b/llvm/test/Verifier/intrinsic-immarg.ll @@ -63,6 +63,14 @@ define void @memset_inline_is_volatile(ptr %dest, i8 %value, i1 %is.volatile) { ret void } +declare void @llvm.experimental.memset.pattern.p0.i32.i32(ptr nocapture, i32, i32, i1) +define void @memset_pattern_is_volatile(ptr %dest, i32 %value, i1 %is.volatile) { + ; CHECK: immarg operand has non-immediate parameter + ; CHECK-NEXT: i1 %is.volatile + ; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i32.i32(ptr %dest, i32 %value, i32 8, i1 %is.volatile) + call void @llvm.experimental.memset.pattern.p0.i32.i32(ptr %dest, i32 %value, i32 8, i1 %is.volatile) + ret void +} declare i64 @llvm.objectsize.i64.p0(ptr, i1, i1, i1) define void @objectsize(ptr %ptr, i1 %a, i1 %b, i1 %c) { diff --git a/llvm/test/Verifier/memset-pattern.ll b/llvm/test/Verifier/memset-pattern.ll new file mode 100644 index 0000000000000..7f5301976b749 --- /dev/null +++ b/llvm/test/Verifier/memset-pattern.ll @@ -0,0 +1,9 @@ +; RUN: not opt -passes=verify < %s 2>&1 | FileCheck %s + +; CHECK: alignment is not a power of two + +define void @foo(ptr %P, i32 %value) { + call void @llvm.experimental.memset.pattern.p0.i32.i32(ptr align 3 %P, i32 %value, i32 4, i1 false) + ret void +} +declare void @llvm.experimental.memset.pattern.p0.i32.i32(ptr nocapture, i32, i32, i1) nounwind
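As a usage sketch only (not included in this patch), downstream C++ code could consume the new `MemSetPatternInst` wrapper through the `visitMemSetPatternInst` hook added to `InstVisitor.h` above; the visitor name `CountPatternSets` is invented for illustration.

```cpp
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

namespace {
// Illustrative visitor: counts non-volatile llvm.experimental.memset.pattern
// calls in whatever IR it is run over.
struct CountPatternSets : public InstVisitor<CountPatternSets> {
  unsigned NumPatternSets = 0;

  void visitMemSetPatternInst(MemSetPatternInst &I) {
    // isVolatile() comes from MemSetPatternIntrinsic; getRawDest()/getLength()
    // and getValue() are inherited via MemIntrinsicBase and MemSetBase.
    if (!I.isVolatile())
      ++NumPatternSets;
  }
};
} // namespace
```

Calling `visit(F)` on an instance dispatches each such intrinsic call through the new `DELEGATE(MemSetPatternInst)` case in `InstVisitor`.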