Reapply "[X86] For minsize memset/memcpy, use byte or double-word accesses (#87003)" #111393

Merged: 1 commit into llvm:main on Oct 8, 2024

Conversation

@AZero13 (Contributor) commented Oct 7, 2024

Restore old Val if bytes are left over to prevent an assertion failure.

@llvmbot (Member) commented Oct 7, 2024

@llvm/pr-subscribers-backend-x86

Author: Rose (AreaZR)

Changes

Restore old Val if bytes are left over to prevent an assertion failure.
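For intuition, here is a minimal standalone sketch (plain C++ with illustrative values, not the LLVM code itself) of the block/leftover split and byte-splat arithmetic that emitConstantSizeRepstos in the patch below performs; the assertion fired because the widened splat constant, rather than the original byte value, was handed to the tail memset for the leftover bytes:

// Standalone sketch (not LLVM code): a constant memset is split into
// BlockCount wide stores done by REP STOS plus BytesLeft leftover bytes.
// The tail must be written with the original byte value (ValRaw); reusing
// the widened splat constant is what tripped the assertion this PR fixes.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Size = 255;     // memset length; 255 = 63 * 4 + 3
  const uint64_t BlockBytes = 4; // rep;stosl stores 4 bytes per iteration
  const uint8_t ValRaw = 0x41;   // original i8 fill value

  // Splat the byte across the 32-bit block type: 0x41 -> 0x41414141.
  uint64_t Value = ValRaw;
  Value = (Value << 8) | Value;
  Value = (Value << 16) | Value;

  const uint64_t BlockCount = Size / BlockBytes; // 63 dword stores
  const uint64_t BytesLeft = Size % BlockBytes;  // 3 tail bytes, use ValRaw

  std::printf("splat=0x%llx blocks=%llu leftover=%llu\n",
              (unsigned long long)Value, (unsigned long long)BlockCount,
              (unsigned long long)BytesLeft);
  return 0;
}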


Patch is 22.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/111393.diff

5 Files Affected:

  • (modified) llvm/lib/Target/X86/X86SelectionDAGInfo.cpp (+151-101)
  • (modified) llvm/test/CodeGen/X86/memcpy-struct-by-value.ll (+4-4)
  • (modified) llvm/test/CodeGen/X86/memcpy.ll (+12-8)
  • (modified) llvm/test/CodeGen/X86/memset-minsize.ll (+30-42)
  • (modified) llvm/test/CodeGen/X86/memset-vs-memset-inline.ll (+11)
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
index 055466ac660ccc..eb245e9e6a510b 100644
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -28,6 +28,23 @@ static cl::opt<bool>
     UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                      cl::desc("Use fast short rep mov in memcpy lowering"));
 
+/// Returns the best type to use with repmovs/repstos depending on alignment.
+static MVT getOptimalRepType(const X86Subtarget &Subtarget, Align Alignment) {
+  uint64_t Align = Alignment.value();
+  assert((Align != 0) && "Align is normalized");
+  assert(isPowerOf2_64(Align) && "Align is a power of 2");
+  switch (Align) {
+  case 1:
+    return MVT::i8;
+  case 2:
+    return MVT::i16;
+  case 4:
+    return MVT::i32;
+  default:
+    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
+  }
+}
+
 bool X86SelectionDAGInfo::isBaseRegConflictPossible(
     SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
   // We cannot use TRI->hasBasePointer() until *after* we select all basic
@@ -44,92 +61,120 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
   return llvm::is_contained(ClobberSet, TRI->getBaseRegister());
 }
 
-SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
-    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
-    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
-    MachinePointerInfo DstPtrInfo) const {
-  // If to a segment-relative address space, use the default lowering.
-  if (DstPtrInfo.getAddrSpace() >= 256)
-    return SDValue();
+/// Emit a single REP STOS{B,W,D,Q} instruction.
+static SDValue emitRepstos(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                           const SDLoc &dl, SDValue Chain, SDValue Dst,
+                           SDValue Val, SDValue Size, MVT AVT) {
+  const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
+  unsigned AX = X86::AL;
+  switch (AVT.getSizeInBits()) {
+  case 8:
+    AX = X86::AL;
+    break;
+  case 16:
+    AX = X86::AX;
+    break;
+  case 32:
+    AX = X86::EAX;
+    break;
+  default:
+    AX = X86::RAX;
+    break;
+  }
 
-  // If the base register might conflict with our physical registers, bail out.
-  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
-                                  X86::ECX, X86::EAX, X86::EDI};
-  if (isBaseRegConflictPossible(DAG, ClobberSet))
-    return SDValue();
+  const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
+  const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
 
-  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
-  const X86Subtarget &Subtarget =
-      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+  SDValue InGlue;
+  Chain = DAG.getCopyToReg(Chain, dl, AX, Val, InGlue);
+  InGlue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
+  InGlue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
+  InGlue = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
+  return DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+}
+
+/// Emit a single REP STOSB instruction for a particular constant size.
+static SDValue emitRepstosB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                            const SDLoc &dl, SDValue Chain, SDValue Dst,
+                            SDValue Val, uint64_t Size) {
+  return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
+                     DAG.getIntPtrConstant(Size, dl), MVT::i8);
+}
+
+/// Returns a REP STOS instruction, possibly with a few load/stores to implement
+/// a constant size memory set. In some cases where we know REP STOS is
+/// inefficient we return an empty SDValue so the calling code can either
+/// generate a store sequence or call the runtime memset function.
+static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget,
+                                       const SDLoc &dl, SDValue Chain,
+                                       SDValue Dst, SDValue Val, uint64_t Size,
+                                       EVT SizeVT, Align Alignment,
+                                       bool isVolatile, bool AlwaysInline,
+                                       MachinePointerInfo DstPtrInfo) {
+  /// In case we optimize for size, we use repstosb even if it's less efficient
+  /// so we can save the loads/stores of the leftover bytes.
+  if (DAG.getMachineFunction().getFunction().hasMinSize()) {
+    if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
+      // Special case 0 because otherwise we get large literals,
+      // which causes larger encoding.
+      if ((Size & 31) == 0 && (ValC->getZExtValue() & 255) == 0) {
+        MVT BlockType = MVT::i32;
+        const uint64_t BlockBits = BlockType.getSizeInBits();
+        const uint64_t BlockBytes = BlockBits / 8;
+        const uint64_t BlockCount = Size / BlockBytes;
+
+        Val = DAG.getConstant(0, dl, BlockType);
+        // repstosd is same size as repstosb
+        return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
+                           DAG.getIntPtrConstant(BlockCount, dl), BlockType);
+      }
+    }
+    return emitRepstosB(Subtarget, DAG, dl, Chain, Dst, Val, Size);
+  }
+
+  if (Size > Subtarget.getMaxInlineSizeThreshold())
+    return SDValue();
 
   // If not DWORD aligned or size is more than the threshold, call the library.
   // The libc version is likely to be faster for these cases. It can use the
   // address value and run time information about the CPU.
-  if (Alignment < Align(4) || !ConstantSize ||
-      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
+  if (Alignment < Align(4))
     return SDValue();
 
-  uint64_t SizeVal = ConstantSize->getZExtValue();
-  SDValue InGlue;
-  EVT AVT;
-  SDValue Count;
-  unsigned BytesLeft = 0;
+  MVT BlockType = MVT::i8;
+  uint64_t BlockCount = Size;
+  uint64_t BytesLeft = 0;
+
+  SDValue ValRaw = Val;
   if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
-    unsigned ValReg;
-    uint64_t Val = ValC->getZExtValue() & 255;
-
-    // If the value is a constant, then we can potentially use larger sets.
-    if (Alignment >= Align(4)) {
-      // DWORD aligned
-      AVT = MVT::i32;
-      ValReg = X86::EAX;
-      Val = (Val << 8)  | Val;
-      Val = (Val << 16) | Val;
-      if (Subtarget.is64Bit() && Alignment >= Align(8)) { // QWORD aligned
-        AVT = MVT::i64;
-        ValReg = X86::RAX;
-        Val = (Val << 32) | Val;
-      }
-    } else if (Alignment == Align(2)) {
-      // WORD aligned
-      AVT = MVT::i16;
-      ValReg = X86::AX;
-      Val = (Val << 8) | Val;
-    } else {
-      // Byte aligned
-      AVT = MVT::i8;
-      ValReg = X86::AL;
-      Count = DAG.getIntPtrConstant(SizeVal, dl);
-    }
+    BlockType = getOptimalRepType(Subtarget, Alignment);
+    uint64_t Value = ValC->getZExtValue() & 255;
+    const uint64_t BlockBits = BlockType.getSizeInBits();
 
-    if (AVT.bitsGT(MVT::i8)) {
-      unsigned UBytes = AVT.getSizeInBits() / 8;
-      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
-      BytesLeft = SizeVal % UBytes;
-    }
+    if (BlockBits >= 16)
+      Value = (Value << 8) | Value;
 
-    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
-                             InGlue);
-    InGlue = Chain.getValue(1);
-  } else {
-    AVT = MVT::i8;
-    Count  = DAG.getIntPtrConstant(SizeVal, dl);
-    Chain  = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InGlue);
-    InGlue = Chain.getValue(1);
-  }
+    if (BlockBits >= 32)
+      Value = (Value << 16) | Value;
 
-  bool Use64BitRegs = Subtarget.isTarget64BitLP64();
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
-                           Count, InGlue);
-  InGlue = Chain.getValue(1);
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
-                           Dst, InGlue);
-  InGlue = Chain.getValue(1);
+    if (BlockBits >= 64)
+      Value = (Value << 32) | Value;
 
-  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
-  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
-  SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+    const uint64_t BlockBytes = BlockBits / 8;
+    BlockCount = Size / BlockBytes;
+    BytesLeft = Size % BlockBytes;
+    Val = DAG.getConstant(Value, dl, BlockType);
+  }
 
+  SDValue RepStos =
+      emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
+                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);
   /// RepStos can process the whole length.
   if (BytesLeft == 0)
     return RepStos;
@@ -137,21 +182,45 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
   // Handle the last 1 - 7 bytes.
   SmallVector<SDValue, 4> Results;
   Results.push_back(RepStos);
-  unsigned Offset = SizeVal - BytesLeft;
+  unsigned Offset = Size - BytesLeft;
   EVT AddrVT = Dst.getValueType();
-  EVT SizeVT = Size.getValueType();
 
   Results.push_back(
       DAG.getMemset(Chain, dl,
                     DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                 DAG.getConstant(Offset, dl, AddrVT)),
-                    Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
+                    ValRaw, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
                     isVolatile, AlwaysInline,
                     /* CI */ nullptr, DstPtrInfo.getWithOffset(Offset)));
 
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
 }
 
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo) const {
+  // If to a segment-relative address space, use the default lowering.
+  if (DstPtrInfo.getAddrSpace() >= 256)
+    return SDValue();
+
+  // If the base register might conflict with our physical registers, bail out.
+  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
+                                  X86::ECX, X86::EAX, X86::EDI};
+  if (isBaseRegConflictPossible(DAG, ClobberSet))
+    return SDValue();
+
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+
+  const X86Subtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+  return emitConstantSizeRepstos(
+      DAG, Subtarget, dl, Chain, Dst, Val, ConstantSize->getZExtValue(),
+      Size.getValueType(), Alignment, isVolatile, AlwaysInline, DstPtrInfo);
+}
+
 /// Emit a single REP MOVS{B,W,D,Q} instruction.
 static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                            const SDLoc &dl, SDValue Chain, SDValue Dst,
@@ -182,24 +251,6 @@ static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                      DAG.getIntPtrConstant(Size, dl), MVT::i8);
 }
 
-/// Returns the best type to use with repmovs depending on alignment.
-static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget,
-                                 Align Alignment) {
-  uint64_t Align = Alignment.value();
-  assert((Align != 0) && "Align is normalized");
-  assert(isPowerOf2_64(Align) && "Align is a power of 2");
-  switch (Align) {
-  case 1:
-    return MVT::i8;
-  case 2:
-    return MVT::i16;
-  case 4:
-    return MVT::i32;
-  default:
-    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
-  }
-}
-
 /// Returns a REP MOVS instruction, possibly with a few load/stores to implement
 /// a constant size memory copy. In some cases where we know REP MOVS is
 /// inefficient we return an empty SDValue so the calling code can either
@@ -209,6 +260,10 @@ static SDValue emitConstantSizeRepmov(
     SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
     Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
+  /// In case we optimize for size, we use repmovsb even if it's less efficient
+  /// so we can save the loads/stores of the leftover bytes.
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
 
   /// TODO: Revisit next line: big copy with ERMSB on march >= haswell are very
   /// efficient.
@@ -222,10 +277,10 @@ static SDValue emitConstantSizeRepmov(
   assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
   /// We assume runtime memcpy will do a better job for unaligned copies when
   /// ERMS is not present.
-  if (!AlwaysInline && (Alignment.value() & 3) != 0)
+  if (!AlwaysInline && (Alignment < Align(4)))
     return SDValue();
 
-  const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
+  const MVT BlockType = getOptimalRepType(Subtarget, Alignment);
   const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
   const uint64_t BlockCount = Size / BlockBytes;
   const uint64_t BytesLeft = Size % BlockBytes;
@@ -239,11 +294,6 @@ static SDValue emitConstantSizeRepmov(
 
   assert(BytesLeft && "We have leftover at this point");
 
-  /// In case we optimize for size we use repmovsb even if it's less efficient
-  /// so we can save the loads/stores of the leftover.
-  if (DAG.getMachineFunction().getFunction().hasMinSize())
-    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
-
   // Handle the last 1 - 7 bytes.
   SmallVector<SDValue, 4> Results;
   Results.push_back(RepMovs);
@@ -282,7 +332,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
   if (UseFSRMForMemcpy && Subtarget.hasFSRM())
     return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);
 
-  /// Handle constant sizes,
+  /// Handle constant sizes
   if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
     return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
                                   ConstantSize->getZExtValue(),
diff --git a/llvm/test/CodeGen/X86/memcpy-struct-by-value.ll b/llvm/test/CodeGen/X86/memcpy-struct-by-value.ll
index 8bc4098b0f7c60..f6b1e487000976 100644
--- a/llvm/test/CodeGen/X86/memcpy-struct-by-value.ll
+++ b/llvm/test/CodeGen/X86/memcpy-struct-by-value.ll
@@ -78,9 +78,9 @@ define void @test2(ptr nocapture %x) nounwind minsize {
 ; NOFAST32-NEXT:    pushl %esi
 ; NOFAST32-NEXT:    subl $4100, %esp # imm = 0x1004
 ; NOFAST32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; NOFAST32-NEXT:    movl $1024, %ecx # imm = 0x400
+; NOFAST32-NEXT:    movl $4096, %ecx # imm = 0x1000
 ; NOFAST32-NEXT:    movl %esp, %edi
-; NOFAST32-NEXT:    rep;movsl (%esi), %es:(%edi)
+; NOFAST32-NEXT:    rep;movsb (%esi), %es:(%edi)
 ; NOFAST32-NEXT:    calll foo@PLT
 ; NOFAST32-NEXT:    addl $4100, %esp # imm = 0x1004
 ; NOFAST32-NEXT:    popl %esi
@@ -106,9 +106,9 @@ define void @test2(ptr nocapture %x) nounwind minsize {
 ; NOFAST:       # %bb.0:
 ; NOFAST-NEXT:    subq $4104, %rsp # imm = 0x1008
 ; NOFAST-NEXT:    movq %rdi, %rsi
-; NOFAST-NEXT:    movl $512, %ecx # imm = 0x200
+; NOFAST-NEXT:    movl $4096, %ecx # imm = 0x1000
 ; NOFAST-NEXT:    movq %rsp, %rdi
-; NOFAST-NEXT:    rep;movsq (%rsi), %es:(%rdi)
+; NOFAST-NEXT:    rep;movsb (%rsi), %es:(%rdi)
 ; NOFAST-NEXT:    callq foo@PLT
 ; NOFAST-NEXT:    addq $4104, %rsp # imm = 0x1008
 ; NOFAST-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/memcpy.ll b/llvm/test/CodeGen/X86/memcpy.ll
index 6ec9b20163051b..ff026b142ecf3c 100644
--- a/llvm/test/CodeGen/X86/memcpy.ll
+++ b/llvm/test/CodeGen/X86/memcpy.ll
@@ -202,14 +202,16 @@ define void @test3_minsize(ptr nocapture %A, ptr nocapture %B) nounwind minsize
 ; DARWIN-LABEL: test3_minsize:
 ; DARWIN:       ## %bb.0:
 ; DARWIN-NEXT:    pushq $64
-; DARWIN-NEXT:    popq %rdx
-; DARWIN-NEXT:    jmp _memcpy ## TAILCALL
+; DARWIN-NEXT:    popq %rcx
+; DARWIN-NEXT:    rep;movsb (%rsi), %es:(%rdi)
+; DARWIN-NEXT:    retq
 ;
 ; LINUX-LABEL: test3_minsize:
 ; LINUX:       # %bb.0:
 ; LINUX-NEXT:    pushq $64
-; LINUX-NEXT:    popq %rdx
-; LINUX-NEXT:    jmp memcpy@PLT # TAILCALL
+; LINUX-NEXT:    popq %rcx
+; LINUX-NEXT:    rep;movsb (%rsi), %es:(%rdi)
+; LINUX-NEXT:    retq
 ;
 ; LINUX-SKL-LABEL: test3_minsize:
 ; LINUX-SKL:       # %bb.0:
@@ -249,14 +251,16 @@ define void @test3_minsize_optsize(ptr nocapture %A, ptr nocapture %B) nounwind
 ; DARWIN-LABEL: test3_minsize_optsize:
 ; DARWIN:       ## %bb.0:
 ; DARWIN-NEXT:    pushq $64
-; DARWIN-NEXT:    popq %rdx
-; DARWIN-NEXT:    jmp _memcpy ## TAILCALL
+; DARWIN-NEXT:    popq %rcx
+; DARWIN-NEXT:    rep;movsb (%rsi), %es:(%rdi)
+; DARWIN-NEXT:    retq
 ;
 ; LINUX-LABEL: test3_minsize_optsize:
 ; LINUX:       # %bb.0:
 ; LINUX-NEXT:    pushq $64
-; LINUX-NEXT:    popq %rdx
-; LINUX-NEXT:    jmp memcpy@PLT # TAILCALL
+; LINUX-NEXT:    popq %rcx
+; LINUX-NEXT:    rep;movsb (%rsi), %es:(%rdi)
+; LINUX-NEXT:    retq
 ;
 ; LINUX-SKL-LABEL: test3_minsize_optsize:
 ; LINUX-SKL:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/memset-minsize.ll b/llvm/test/CodeGen/X86/memset-minsize.ll
index cc0f2156262bba..d66500ea31a0d6 100644
--- a/llvm/test/CodeGen/X86/memset-minsize.ll
+++ b/llvm/test/CodeGen/X86/memset-minsize.ll
@@ -27,11 +27,9 @@ entry:
 define void @medium_memset_to_rep_stos(ptr %ptr) minsize nounwind {
 ; CHECK-LABEL: medium_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $512, %edx # imm = 0x200
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    movl $128, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosl %eax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   call void @llvm.memset.p0.i32(ptr align 4 %ptr, i8 0, i32 512, i1 false)
@@ -41,11 +39,9 @@ entry:
 define void @large_memset_to_rep_stos(ptr %ptr) minsize nounwind {
 ; CHECK-LABEL: large_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $4096, %edx # imm = 0x1000
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    movl $1024, %ecx # imm = 0x400
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosl %eax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   call void @llvm.memset.p0.i32(ptr align 4 %ptr, i8 0, i32 4096, i1 false)
@@ -55,11 +51,9 @@ entry:
 define void @huge_memset_to_rep_stos(ptr %ptr) minsize nounwind {
 ; CHECK-LABEL: huge_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $8192, %edx # imm = 0x2000
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    movl $2048, %ecx # imm = 0x800
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosl %eax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   call void @llvm.memset.p0.i32(ptr align 4 %ptr, i8 0, i32 8192, i1 false)
@@ -69,11 +63,9 @@ entry:
 define void @odd_length_memset_to_rep_stos(ptr %ptr) minsize nounwind {
 ; CHECK-LABEL: odd_length_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $255, %edx
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    movl $255, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosb %al, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   call void @llvm.memset.p0.i32(ptr align 4 %ptr, i8 0, i32 255, i1 false)
@@ -83,11 +75,10 @@ entry:
 define void @align_1_memset_to_rep_stos(ptr %ptr) minsize nounwind {
 ; CHECK-LABEL: align_1_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $256, %edx # imm = 0x100
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    pushq $64
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosl %eax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   call void @llvm.memset.p0.i32(ptr align 1 %ptr, i8 0, i32 256, i1 false)
@@ -97,11 +88,10 @@ entry:
 define void @align_2_memset_to_rep_stos(ptr %ptr) minsize nounwind {
 ; CHECK-LABEL: align_2_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $256,...
[truncated]
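The memset-minsize.ll updates above all follow the same rule from emitConstantSizeRepstos. As a rough standalone model (plain C++; it only covers the minsize path for a constant zero fill and ignores the alignment and threshold checks), the selection works like this:

// Simplified model (not LLVM code) of the minsize rep;stos choice: zero
// fills whose length is a multiple of 32 use rep;stosl, which the patch
// notes encodes in the same number of bytes as rep;stosb but runs a quarter
// of the iterations; any other minsize memset falls back to rep;stosb so no
// leftover loads/stores are needed.
#include <cstdint>
#include <cstdio>

struct RepChoice {
  const char *Insn;
  uint64_t Count;
};

static RepChoice pickMinSizeRepStos(uint64_t Size, uint8_t FillByte) {
  if ((Size & 31) == 0 && FillByte == 0)
    return {"rep;stosl", Size / 4}; // dword stores, no tail bytes
  return {"rep;stosb", Size};      // byte stores handle any length
}

int main() {
  const uint64_t Sizes[] = {512, 4096, 8192, 255};
  for (uint64_t Size : Sizes) {
    RepChoice C = pickMinSizeRepStos(Size, 0);
    std::printf("memset(ptr, 0, %llu) -> %s, count=%llu\n",
                (unsigned long long)Size, C.Insn,
                (unsigned long long)C.Count);
  }
  return 0;
}

This reproduces the CHECK lines above: 512, 4096, and 8192 bytes lower to rep;stosl with counts 128, 1024, and 2048, while the odd 255-byte memset uses rep;stosb with count 255.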

Reapply "[X86] For minsize memset/memcpy, use byte or double-word accesses (llvm#87003)"

Restore old Val if bytes are left over to prevent an assertion failure.
@AZero13 (Contributor, Author) commented Oct 8, 2024

@phoebewang Thoughts?

@phoebewang phoebewang merged commit adc6a9e into llvm:main Oct 8, 2024
8 checks passed
@AZero13 AZero13 deleted the memset2 branch October 8, 2024 16:37