From 72b0f4b88059d71a3ec8b524f719ec0ec228e589 Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <Rohit.Aggarwal@amd.com>
Date: Wed, 12 Mar 2025 11:17:47 +0530
Subject: [PATCH 1/4] [X86] Update the value of base and index of masked
 gather for better codegen

on-behalf-of: @AMD Rohit.Aggarwal@amd.com
---
 llvm/include/llvm/CodeGen/TargetLowering.h  |   7 +
 .../SelectionDAG/SelectionDAGBuilder.cpp    |   5 +
 .../CodeGen/SelectionDAG/TargetLowering.cpp |   8 +
 llvm/lib/Target/X86/X86ISelLowering.cpp     | 107 ++++++++
 llvm/lib/Target/X86/X86ISelLowering.h       |   7 +
 llvm/test/CodeGen/X86/gatherBaseIndexFix.ll | 249 ++++++++++++++++++
 6 files changed, 383 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/gatherBaseIndexFix.ll
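This change re-derives a scalar base and a narrow scaled index for a masked
gather/scatter whose address was lowered as a vector of 64-bit pointers
(broadcast base + zero-extended 32-bit indices scaled by the element size).
With a scalar base recovered, X86 can select a single dword-indexed
vgatherdps instead of splitting the operation into two qword-indexed
vgatherqps halves. A minimal standalone sketch of the addressing identity
the rewrite relies on follows (the struct mirrors %struct.pt from the test;
the file and variable names are illustrative only, not part of the patch):

  // addr_identity.cpp -- scalar model of the gather address rewrite.
  #include <cassert>
  #include <cstddef>
  #include <cstdint>

  struct pt { float x, y, z; int w; }; // 16 bytes, as in the test IR
  static_assert(sizeof(pt) == 16, "the test assumes 16-byte elements");

  int main() {
    pt arr[64] = {};
    const uint32_t jlist[4] = {0, 5, 17, 63};
    for (uint32_t j : jlist) {
      uint32_t idx = j & 536870911u; // the AND from the test IR
      // Per-lane pointer that the <16 x i64> GEP used to materialize.
      char *wide = reinterpret_cast<char *>(arr) +
                   (static_cast<uint64_t>(idx) << 4) + offsetof(pt, y);
      // Scalar base + 32-bit scaled index + displacement, i.e. the
      // vgatherdps 4(%base,%index) form produced after this patch.
      char *narrow = reinterpret_cast<char *>(arr) + (idx << 4) + 4u;
      assert(wide == narrow);
    }
    return 0;
  }

The identity is only legal when the 32-bit index is provably non-negative,
which is what the computeKnownBits()/ComputeNumSignBits() guards in
updateBaseAndIndex() below establish before the DAG is rewritten.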
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2089d47e9cbc8..46b28b7f5813d 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5127,6 +5127,13 @@ class TargetLowering : public TargetLoweringBase {
                                            SmallVectorImpl<SDValue> &Ops,
                                            SelectionDAG &DAG) const;
 
+  // A target may override this function to decide whether it wants to update
+  // the base and index value of a non-uniform gep.
+  virtual bool updateBaseAndIndex(const Value *Ptr, SDValue &Base,
+                                  SDValue &Index, const SDLoc &DL,
+                                  const SDValue &Gep, SelectionDAG &DAG,
+                                  const BasicBlock *CurBB) const;
+
   //===--------------------------------------------------------------------===//
   // Div utility functions
   //
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 14bb1d943d2d6..3cbc59cc7387c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4905,6 +4905,11 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
     Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
   }
 
+  if (!UniformBase) {
+    TLI.updateBaseAndIndex(Ptr, Base, Index, getCurSDLoc(), getValue(Ptr), DAG,
+                           I.getParent());
+  }
+
   EVT IdxVT = Index.getValueType();
   EVT EltTy = IdxVT.getVectorElementType();
   if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index bd72718c49031..eb2ac6044cb6b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5655,6 +5655,14 @@ void TargetLowering::CollectTargetIntrinsicOperands(
     const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
 }
 
+// By default, this function is disabled. An overriding target can enable it.
+bool TargetLowering::updateBaseAndIndex(const Value *Ptr, SDValue &Base,
+                                        SDValue &Index, const SDLoc &DL,
+                                        const SDValue &Gep, SelectionDAG &DAG,
+                                        const BasicBlock *CurBB) const {
+  return false;
+}
+
 std::pair<unsigned, const TargetRegisterClass *>
 TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI,
                                              StringRef Constraint,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 24e5d8bfc404c..9395988cf2177 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -126,6 +126,11 @@ static cl::opt<bool> MulConstantOptimization(
              "SHIFT, LEA, etc."),
     cl::Hidden);
 
+static cl::opt<bool>
+    EnableBaseIndexUpdate("update-baseIndex", cl::init(true),
+                          cl::desc("Update the value of base and index"),
+                          cl::Hidden);
+
 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                      const X86Subtarget &STI)
     : TargetLowering(TM), Subtarget(STI) {
@@ -61619,3 +61624,105 @@ Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
     return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
   return TargetLowering::getPrefLoopAlignment();
 }
+
+// The target overrides this function to decide whether it wants to update the
+// base and index value of a non-uniform gep.
+bool X86TargetLowering::updateBaseAndIndex(const Value *Ptr, SDValue &Base,
+                                           SDValue &Index, const SDLoc &DL,
+                                           const SDValue &Gep,
+                                           SelectionDAG &DAG,
+                                           const BasicBlock *CurBB) const {
+  if (!EnableBaseIndexUpdate)
+    return false;
+
+  const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  if (GEP && GEP->getParent() != CurBB)
+    return false;
+
+  SDValue nbase;
+  bool valid = true;
+  /* For the gep instruction, we are trying to properly assign the base and
+     index value. We go through the lowered code and iterate backward.
+   */
+  if (Gep.getOpcode() == ISD::ADD) {
+    SDValue Op0 = Gep.getOperand(0); // base or add
+    SDValue Op1 = Gep.getOperand(1); // build vector or SHL
+    nbase = Op0;
+    SDValue Idx = Op1;
+    auto Flags = Gep->getFlags();
+
+    if (Op0->getOpcode() == ISD::ADD) { // add t15(base), t18(Idx)
+      SDValue Op00 = Op0.getOperand(0); // Base
+      nbase = Op00;
+      Idx = Op0.getOperand(1);
+    } else if (!(Op0->getOpcode() == ISD::BUILD_VECTOR &&
+                 Op0.getOperand(0).getOpcode() == ISD::CopyFromReg)) {
+      return false;
+    }
+    SDValue nIndex;
+    if (Idx.getOpcode() == ISD::SHL) { // shl zext, BV
+      SDValue Op10 = Idx.getOperand(0); // Zext or Sext value
+      SDValue Op11 = Idx.getOperand(1); // Build vector of constant
+
+      unsigned IndexWidth = Op10.getScalarValueSizeInBits();
+      if ((Op10.getOpcode() == ISD::SIGN_EXTEND ||
+           Op10.getOpcode() == ISD::ZERO_EXTEND) &&
+          IndexWidth > 32 &&
+          Op10.getOperand(0).getScalarValueSizeInBits() <= 32 &&
+          DAG.ComputeNumSignBits(Op10) > (IndexWidth - 32) &&
+          Op11.getOpcode() == ISD::BUILD_VECTOR) {
+
+        KnownBits ExtKnown = DAG.computeKnownBits(Op10);
+        bool ExtIsNonNegative = ExtKnown.isNonNegative();
+        KnownBits ExtOpKnown = DAG.computeKnownBits(Op10.getOperand(0));
+        bool ExtOpIsNonNegative = ExtOpKnown.isNonNegative();
+        if (!(ExtIsNonNegative && ExtOpIsNonNegative))
+          return false;
+
+        SDValue newOp10 =
+            Op10.getOperand(0); // Get the operand zero from the ext.
+        EVT VT = newOp10.getValueType(); // Use the pre-extension value type.
+
+        auto *ConstEltNo = dyn_cast<ConstantSDNode>(Op11.getOperand(0));
+        if (!ConstEltNo) {
+          return false;
+        }
+        SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(),
+                                     DAG.getConstant(ConstEltNo->getZExtValue(),
+                                                     DL, VT.getScalarType()));
+        nIndex = DAG.getNode(ISD::SHL, DL, VT, newOp10,
+                             DAG.getBuildVector(VT, DL, Ops));
+      } else {
+        return false;
+      }
+    } else {
+      return false;
+    }
+    if (Op0 != nbase) {
+      LLVM_DEBUG(Op1.dump());
+      LLVM_DEBUG(dbgs() << "nIndex.getValueType()" << nIndex.getValueType()
+                        << "\n";
+                 nIndex.dump());
+      // SmallVector<SDValue, 16> Ops(Op1->op_values());
+      auto *ConstEltNo = dyn_cast<ConstantSDNode>(Op1.getOperand(0));
+      if (!ConstEltNo) {
+        return false;
+      }
+      SmallVector<SDValue, 16> Ops(
+          nIndex.getValueType().getVectorNumElements(),
+          DAG.getConstant(ConstEltNo->getZExtValue(), DL,
+                          nIndex.getValueType().getScalarType()));
+      nIndex = DAG.getNode(ISD::ADD, DL, nIndex.getValueType(), nIndex,
+                           DAG.getBuildVector(nIndex.getValueType(), DL, Ops),
+                           Flags);
+    }
+    Base = nbase.getOperand(0);
+    Index = nIndex;
+    LLVM_DEBUG(dbgs() << "Successful in updating the non-uniform gep "
+                         "information\n";
+               dbgs() << "updated base "; Base.dump();
+               dbgs() << "updated Index "; Index.dump(););
+    return true;
+  }
+  return false;
+}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 4a2b35e9efe7c..c092055329c58 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1671,6 +1671,13 @@ namespace llvm {
       return TargetLoweringBase::getTypeToTransformTo(Context, VT);
     }
 
+    // The target overrides this function to decide whether it wants to update
+    // the base and index value of a non-uniform gep.
+    bool updateBaseAndIndex(const Value *Ptr, SDValue &Base, SDValue &Index,
+                            const SDLoc &DL, const SDValue &Gep,
+                            SelectionDAG &DAG,
+                            const BasicBlock *CurBB) const override;
+
   protected:
     std::pair<const TargetRegisterClass *, uint8_t>
     findRepresentativeClass(const TargetRegisterInfo *TRI,
diff --git a/llvm/test/CodeGen/X86/gatherBaseIndexFix.ll b/llvm/test/CodeGen/X86/gatherBaseIndexFix.ll
new file mode 100644
index 0000000000000..9a45e5e2d9c82
--- /dev/null
+++ 
b/llvm/test/CodeGen/X86/gatherBaseIndexFix.ll @@ -0,0 +1,249 @@ +; RUN: llc -enable-masked-gather-sequence=false -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -mcpu=znver5 < %s | FileCheck %s +; RUN: llc -update-baseIndex -enable-masked-gather-sequence=false -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -mcpu=znver5 < %s | FileCheck %s +; RUN: llc -update-baseIndex=false -enable-masked-gather-sequence=false -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -mcpu=znver5 < %s | FileCheck %s -check-prefix=OLD + +; ModuleID = 'qwdemo.c' +source_filename = "qwdemo.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.pt = type { float, float, float, i32 } + +; Function Attrs: nofree nosync nounwind memory(argmem: readwrite) uwtable +define dso_local i32 @foo(float noundef %cut_coulsq, ptr noalias nocapture noundef readonly %jlist, i32 noundef %jnum, ptr noalias nocapture noundef readonly %x, ptr noalias nocapture noundef writeonly %trsq, ptr noalias nocapture noundef writeonly %tdelx, ptr noalias nocapture noundef writeonly %tdely, ptr noalias nocapture noundef writeonly %tdelz, ptr noalias nocapture noundef writeonly %tjtype, ptr noalias nocapture noundef writeonly %tj, ptr noalias nocapture noundef readnone %tx, ptr noalias nocapture noundef readnone %ty, ptr noalias nocapture noundef readnone %tz) local_unnamed_addr #0 { +entry: + %0 = load float, ptr %x, align 4, !tbaa !5 + %y = getelementptr inbounds %struct.pt, ptr %x, i64 0, i32 1 + %1 = load float, ptr %y, align 4, !tbaa !11 + %z = getelementptr inbounds %struct.pt, ptr %x, i64 0, i32 2 + %2 = load float, ptr %z, align 4, !tbaa !12 + %cmp62 = icmp sgt i32 %jnum, 0 + br i1 %cmp62, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %jnum to i64 + %min.iters.check = icmp ult i32 %jnum, 16 + br i1 %min.iters.check, label %for.body.preheader75, label %vector.ph + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i64 %wide.trip.count, 4294967280 + %broadcast.splatinsert = insertelement <16 x float> poison, float %0, i64 0 + %broadcast.splat = shufflevector <16 x float> %broadcast.splatinsert, <16 x float> poison, <16 x i32> zeroinitializer + %broadcast.splatinsert67 = insertelement <16 x float> poison, float %1, i64 0 + %broadcast.splat68 = shufflevector <16 x float> %broadcast.splatinsert67, <16 x float> poison, <16 x i32> zeroinitializer + %broadcast.splatinsert70 = insertelement <16 x float> poison, float %2, i64 0 + %broadcast.splat71 = shufflevector <16 x float> %broadcast.splatinsert70, <16 x float> poison, <16 x i32> zeroinitializer + %broadcast.splatinsert72 = insertelement <16 x float> poison, float %cut_coulsq, i64 0 + %broadcast.splat73 = shufflevector <16 x float> %broadcast.splatinsert72, <16 x float> poison, <16 x i32> zeroinitializer + br label %vector.body + +; CHECK-LABEL: .LBB0_6: +; CHECK: vmovdqu64 (%rdi,%r13), %zmm11 +; CHECK-NEXT: vxorps %xmm13, %xmm13, %xmm13 +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vxorps %xmm15, %xmm15, %xmm15 +; CHECK-NEXT: vxorps %xmm16, %xmm16, %xmm16 +; CHECK-NEXT: cltq +; CHECK-NEXT: vpslld $4, %zmm11, %zmm12 +; CHECK-NEXT: vgatherdps (%rdx,%zmm12), %zmm13 {%k1} +; CHECK-NEXT: vorps %zmm8, %zmm12, %zmm14 +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vorps %zmm9, %zmm12, %zmm17 +; CHECK-NEXT: vpord %zmm10, %zmm12, %zmm12 
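+; (vpslld $4 clears the low four bits of each lane, so the field offsets
+; inside %struct.pt can be folded into the index with vorps/vpord instead
+; of an add; zmm8, zmm9 and zmm10 presumably hold the splatted 4-, 8- and
+; 12-byte offsets.)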
+; CHECK-NEXT: vgatherdps (%rdx,%zmm14), %zmm15 {%k1}
+; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: vgatherdps (%rdx,%zmm17), %zmm16 {%k1}
+
+; OLD-LABEL: .LBB0_6:
+
+; OLD: vmovdqu64 (%rdi,%r13), %zmm10
+; OLD-NEXT: vxorps %xmm15, %xmm15, %xmm15
+; OLD-NEXT: kxnorw %k0, %k0, %k1
+; OLD-NEXT: vxorps %xmm16, %xmm16, %xmm16
+; OLD-NEXT: cltq
+; OLD-NEXT: vpandd %zmm8, %zmm10, %zmm11
+; OLD-NEXT: vextracti64x4 $1, %zmm11, %ymm12
+; OLD-NEXT: vpmovzxdq %ymm11, %zmm11 # zmm11 = ymm11[0],zero,ymm11[1],zero,ymm11[2],zero,ymm11[3],zero,ymm11[4],zero,ymm11[5],zero,ymm11[6],zero,ymm11[7],zero
+; OLD-NEXT: vpmovzxdq %ymm12, %zmm12 # zmm12 = ymm12[0],zero,ymm12[1],zero,ymm12[2],zero,ymm12[3],zero,ymm12[4],zero,ymm12[5],zero,ymm12[6],zero,ymm12[7],zero
+; OLD-NEXT: vpsllq $4, %zmm11, %zmm11
+; OLD-NEXT: vpsllq $4, %zmm12, %zmm12
+; OLD-NEXT: vpaddq %zmm11, %zmm9, %zmm13
+; OLD-NEXT: vgatherqps (%rdx,%zmm12), %ymm15 {%k1}
+; OLD-NEXT: vpaddq %zmm12, %zmm9, %zmm14
+; OLD-NEXT: vxorps %xmm12, %xmm12, %xmm12
+; OLD-NEXT: kxnorw %k0, %k0, %k1
+; OLD-NEXT: vgatherqps (%rdx,%zmm11), %ymm12 {%k1}
+; OLD-NEXT: kxnorw %k0, %k0, %k1
+; OLD-NEXT: vinsertf64x4 $1, %ymm15, %zmm12, %zmm11
+; OLD-NEXT: vxorps %xmm12, %xmm12, %xmm12
+; OLD-NEXT: vxorps %xmm15, %xmm15, %xmm15
+; OLD-NEXT: vgatherqps 4(,%zmm14), %ymm12 {%k1}
+; OLD-NEXT: kxnorw %k0, %k0, %k1
+; OLD-NEXT: vgatherqps 4(,%zmm13), %ymm15 {%k1}
+; OLD-NEXT: kxnorw %k0, %k0, %k1
+; OLD-NEXT: vsubps %zmm11, %zmm4, %zmm11
+; OLD-NEXT: vinsertf64x4 $1, %ymm12, %zmm15, %zmm12
+; OLD-NEXT: vxorps %xmm15, %xmm15, %xmm15
+; OLD-NEXT: vgatherqps 8(,%zmm14), %ymm15 {%k1}
+; OLD-NEXT: kxnorw %k0, %k0, %k1
+; OLD-NEXT: vsubps %zmm12, %zmm5, %zmm12
+; OLD-NEXT: vgatherqps 8(,%zmm13), %ymm16 {%k1}
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %pred.index = phi i32 [ 0, %vector.ph ], [ %predphi, %vector.body ]
+  %3 = getelementptr inbounds i32, ptr %jlist, i64 %index
+  %wide.load = load <16 x i32>, ptr %3, align 4, !tbaa !13
+  %4 = and <16 x i32> %wide.load, splat (i32 536870911)
+  %5 = zext <16 x i32> %4 to <16 x i64>
+  %6 = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %5
+  %wide.masked.gather = tail call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %6, i32 4, <16 x i1> splat (i1 true), <16 x float> poison), !tbaa !5
+  %7 = fsub <16 x float> %broadcast.splat, %wide.masked.gather
+  %8 = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %5, i32 1
+  %wide.masked.gather66 = tail call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %8, i32 4, <16 x i1> splat (i1 true), <16 x float> poison), !tbaa !11
+  %9 = fsub <16 x float> %broadcast.splat68, %wide.masked.gather66
+  %10 = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %5, i32 2
+  %wide.masked.gather69 = tail call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %10, i32 4, <16 x i1> splat (i1 true), <16 x float> poison), !tbaa !12
+  %11 = fsub <16 x float> %broadcast.splat71, %wide.masked.gather69
+  %12 = fmul <16 x float> %9, %9
+  %13 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %7, <16 x float> %7, <16 x float> %12)
+  %14 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %11, <16 x float> %11, <16 x float> %13)
+  %15 = fcmp olt <16 x float> %14, %broadcast.splat73
+  %16 = sext i32 %pred.index to i64
+  %17 = getelementptr float, ptr %trsq, i64 %16
+  tail call void @llvm.masked.compressstore.v16f32(<16 x float> %14, ptr %17, <16 x i1> %15), !tbaa !14
+  %18 = getelementptr float, ptr %tdelx, i64 %16
+  tail call void 
@llvm.masked.compressstore.v16f32(<16 x float> %7, ptr %18, <16 x i1> %15), !tbaa !14 + %19 = getelementptr float, ptr %tdely, i64 %16 + tail call void @llvm.masked.compressstore.v16f32(<16 x float> %9, ptr %19, <16 x i1> %15), !tbaa !14 + %20 = getelementptr float, ptr %tdelz, i64 %16 + tail call void @llvm.masked.compressstore.v16f32(<16 x float> %11, ptr %20, <16 x i1> %15), !tbaa !14 + %21 = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %5, i32 3 + %wide.masked.gather74 = tail call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %21, i32 4, <16 x i1> %15, <16 x i32> poison), !tbaa !15 + %22 = getelementptr i32, ptr %tjtype, i64 %16 + tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %wide.masked.gather74, ptr %22, <16 x i1> %15), !tbaa !13 + %23 = getelementptr i32, ptr %tj, i64 %16 + tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %wide.load, ptr %23, <16 x i1> %15), !tbaa !13 + %24 = bitcast <16 x i1> %15 to i16 + %mask.popcnt = tail call i16 @llvm.ctpop.i16(i16 %24), !range !16 + %popcnt.cmp.not = icmp eq i16 %24, 0 + %narrow = select i1 %popcnt.cmp.not, i16 0, i16 %mask.popcnt + %popcnt.inc = zext i16 %narrow to i32 + %predphi = add i32 %pred.index, %popcnt.inc + %index.next = add nuw i64 %index, 16 + %25 = icmp eq i64 %index.next, %n.vec + br i1 %25, label %middle.block, label %vector.body, !llvm.loop !17 + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i64 %n.vec, %wide.trip.count + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader75 + +for.body.preheader75: ; preds = %for.body.preheader, %middle.block + %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + %ej.064.ph = phi i32 [ 0, %for.body.preheader ], [ %predphi, %middle.block ] + br label %for.body + +for.cond.cleanup: ; preds = %if.end, %middle.block, %entry + %ej.0.lcssa = phi i32 [ 0, %entry ], [ %predphi, %middle.block ], [ %ej.1, %if.end ] + ret i32 %ej.0.lcssa + +for.body: ; preds = %for.body.preheader75, %if.end + %indvars.iv = phi i64 [ %indvars.iv.next, %if.end ], [ %indvars.iv.ph, %for.body.preheader75 ] + %ej.064 = phi i32 [ %ej.1, %if.end ], [ %ej.064.ph, %for.body.preheader75 ] + %arrayidx4 = getelementptr inbounds i32, ptr %jlist, i64 %indvars.iv + %26 = load i32, ptr %arrayidx4, align 4, !tbaa !13 + %and = and i32 %26, 536870911 + %idxprom5 = zext i32 %and to i64 + %arrayidx6 = getelementptr inbounds %struct.pt, ptr %x, i64 %idxprom5 + %27 = load float, ptr %arrayidx6, align 4, !tbaa !5 + %sub = fsub float %0, %27 + %y10 = getelementptr inbounds %struct.pt, ptr %x, i64 %idxprom5, i32 1 + %28 = load float, ptr %y10, align 4, !tbaa !11 + %sub11 = fsub float %1, %28 + %z14 = getelementptr inbounds %struct.pt, ptr %x, i64 %idxprom5, i32 2 + %29 = load float, ptr %z14, align 4, !tbaa !12 + %sub15 = fsub float %2, %29 + %mul16 = fmul float %sub11, %sub11 + %30 = tail call float @llvm.fmuladd.f32(float %sub, float %sub, float %mul16) + %31 = tail call float @llvm.fmuladd.f32(float %sub15, float %sub15, float %30) + %cmp17 = fcmp olt float %31, %cut_coulsq + br i1 %cmp17, label %if.then, label %if.end + +if.then: ; preds = %for.body + %idxprom18 = sext i32 %ej.064 to i64 + %arrayidx19 = getelementptr inbounds float, ptr %trsq, i64 %idxprom18 + store float %31, ptr %arrayidx19, align 4, !tbaa !14 + %arrayidx21 = getelementptr inbounds float, ptr %tdelx, i64 %idxprom18 + store float %sub, ptr %arrayidx21, align 4, !tbaa !14 + %arrayidx23 = getelementptr inbounds float, ptr %tdely, i64 %idxprom18 + store float %sub11, ptr 
%arrayidx23, align 4, !tbaa !14 + %arrayidx25 = getelementptr inbounds float, ptr %tdelz, i64 %idxprom18 + store float %sub15, ptr %arrayidx25, align 4, !tbaa !14 + %w = getelementptr inbounds %struct.pt, ptr %x, i64 %idxprom5, i32 3 + %32 = load i32, ptr %w, align 4, !tbaa !15 + %arrayidx29 = getelementptr inbounds i32, ptr %tjtype, i64 %idxprom18 + store i32 %32, ptr %arrayidx29, align 4, !tbaa !13 + %arrayidx33 = getelementptr inbounds i32, ptr %tj, i64 %idxprom18 + store i32 %26, ptr %arrayidx33, align 4, !tbaa !13 + %inc = add nsw i32 %ej.064, 1 + br label %if.end + +if.end: ; preds = %if.then, %for.body + %ej.1 = phi i32 [ %inc, %if.then ], [ %ej.064, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !21 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.fmuladd.f32(float, float, float) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read) +declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32 immarg, <16 x i1>, <16 x float>) #2 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare <16 x float> @llvm.fmuladd.v16f32(<16 x float>, <16 x float>, <16 x float>) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr nocapture, <16 x i1>) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read) +declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32 immarg, <16 x i1>, <16 x i32>) #2 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr nocapture, <16 x i1>) #4 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i16 @llvm.ctpop.i16(i16) #3 + +attributes #0 = { nofree nosync nounwind memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="znver5" "target-features"="+adx,+aes,+avx,+avx2,+avx512bf16,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512ifma,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,+avx512vp2intersect,+avx512vpopcntdq,+avxvnni,+bmi,+bmi2,+clflushopt,+clwb,+clzero,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+gfni,+invpcid,+lzcnt,+mmx,+movbe,+movdir64b,+movdiri,+mwaitx,+pclmul,+pku,+popcnt,+prefetchi,+prfchw,+rdpid,+rdpru,+rdrnd,+rdseed,+sahf,+sha,+shstk,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,+ssse3,+vaes,+vpclmulqdq,+wbnoinvd,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind willreturn memory(read) } +attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 8, !"PIC Level", i32 2} +!2 = !{i32 7, !"PIE Level", i32 2} +!3 = !{i32 7, !"uwtable", i32 2} +!4 = !{!"clang version 17.0.6 (CLANG: Unknown-Revision)"} +!5 = !{!6, !7, i64 0} +!6 = !{!"pt", !7, i64 0, !7, i64 4, !7, i64 8, !10, i64 12} +!7 = !{!"float", 
!8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C/C++ TBAA"}
+!10 = !{!"int", !8, i64 0}
+!11 = !{!6, !7, i64 4}
+!12 = !{!6, !7, i64 8}
+!13 = !{!10, !10, i64 0}
+!14 = !{!7, !7, i64 0}
+!15 = !{!6, !10, i64 12}
+!16 = !{i16 0, i16 17}
+!17 = distinct !{!17, !18, !19, !20}
+!18 = !{!"llvm.loop.mustprogress"}
+!19 = !{!"llvm.loop.isvectorized", i32 1}
+!20 = !{!"llvm.loop.unroll.runtime.disable"}
+!21 = distinct !{!21, !18, !20, !19}

From a231c963f4dbbdbc28e69c15d9a782c11a8093b2 Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <Rohit.Aggarwal@amd.com>
Date: Wed, 12 Mar 2025 11:17:47 +0530
Subject: [PATCH 2/4] [X86] Update the value of base and index of masked
 gather for better codegen

on-behalf-of: @AMD Rohit.Aggarwal@amd.com
---
 llvm/include/llvm/CodeGen/TargetLowering.h  |   7 +
 .../SelectionDAG/SelectionDAGBuilder.cpp    |   5 +
 .../CodeGen/SelectionDAG/TargetLowering.cpp |   8 +
 llvm/lib/Target/X86/X86ISelLowering.cpp     | 100 +++++++++
 llvm/lib/Target/X86/X86ISelLowering.h       |   7 +
 llvm/test/CodeGen/X86/gatherBaseIndexFix.ll | 211 ++++++++++++++++++
 6 files changed, 338 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/gatherBaseIndexFix.ll
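This revision hooks visitMaskedGather() (the first revision hooked
visitMaskedScatter()), drops the debug dumps and the unused local, and
condenses the test to check only the gather instructions. The before/after
instruction forms can be modeled in scalar code as follows (a sketch only;
gather_qps/gather_dps and all other names are illustrative, not part of the
patch):

  // gather_model.cpp -- scalar model of the two instruction forms the
  // test checks: vgatherqps (64-bit lanes holding absolute addresses)
  // before the patch vs vgatherdps (scalar base + 32-bit scaled indices
  // + displacement) after it.
  #include <cassert>
  #include <cstdint>
  #include <vector>

  static float gather_qps(const uint64_t *addrs, int lane) {
    // Each 64-bit lane already is the address of the element.
    return *reinterpret_cast<const float *>(addrs[lane]);
  }

  static float gather_dps(const char *base, const int32_t *idx, int lane,
                          int32_t disp) {
    // The 32-bit index lane is sign-extended, scaled addressing form.
    return *reinterpret_cast<const float *>(base + int64_t{idx[lane]} + disp);
  }

  int main() {
    struct pt { float x, y, z; int w; };
    std::vector<pt> x(32);
    for (int i = 0; i < 32; ++i) x[i].y = float(i);

    const int32_t jlist[4] = {1, 2, 30, 7};
    uint64_t addrs[4];
    int32_t scaled[4];
    for (int l = 0; l < 4; ++l) {
      uint32_t j = uint32_t(jlist[l]) & 536870911u;
      addrs[l] = reinterpret_cast<uint64_t>(&x[j].y); // old: pointer per lane
      scaled[l] = int32_t(j << 4);                    // new: vpslld $4 result
    }
    const char *base = reinterpret_cast<const char *>(x.data());
    for (int l = 0; l < 4; ++l)
      assert(gather_qps(addrs, l) == gather_dps(base, scaled, l, 4));
    return 0;
  }

The 4-byte displacement in gather_dps corresponds to the 4(%rdi,%zmm0)
addressing visible in the updated CHECK lines.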
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2089d47e9cbc8..46b28b7f5813d 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5127,6 +5127,13 @@ class TargetLowering : public TargetLoweringBase {
                                            SmallVectorImpl<SDValue> &Ops,
                                            SelectionDAG &DAG) const;
 
+  // A target may override this function to decide whether it wants to update
+  // the base and index value of a non-uniform gep.
+  virtual bool updateBaseAndIndex(const Value *Ptr, SDValue &Base,
+                                  SDValue &Index, const SDLoc &DL,
+                                  const SDValue &Gep, SelectionDAG &DAG,
+                                  const BasicBlock *CurBB) const;
+
   //===--------------------------------------------------------------------===//
   // Div utility functions
   //
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 14bb1d943d2d6..6cc4864008c8e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5024,6 +5024,11 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
     Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
   }
 
+  if (!UniformBase) {
+    TLI.updateBaseAndIndex(Ptr, Base, Index, getCurSDLoc(), getValue(Ptr), DAG,
+                           I.getParent());
+  }
+
   EVT IdxVT = Index.getValueType();
   EVT EltTy = IdxVT.getVectorElementType();
   if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index bd72718c49031..eb2ac6044cb6b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5655,6 +5655,14 @@ void TargetLowering::CollectTargetIntrinsicOperands(
     const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
 }
 
+// By default, this function is disabled. An overriding target can enable it.
+bool TargetLowering::updateBaseAndIndex(const Value *Ptr, SDValue &Base,
+                                        SDValue &Index, const SDLoc &DL,
+                                        const SDValue &Gep, SelectionDAG &DAG,
+                                        const BasicBlock *CurBB) const {
+  return false;
+}
+
 std::pair<unsigned, const TargetRegisterClass *>
 TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI,
                                              StringRef Constraint,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 24e5d8bfc404c..6c1d279d20a43 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -126,6 +126,11 @@ static cl::opt<bool> MulConstantOptimization(
              "SHIFT, LEA, etc."),
     cl::Hidden);
 
+static cl::opt<bool>
+    EnableBaseIndexUpdate("update-baseIndex", cl::init(true),
+                          cl::desc("Update the value of base and index"),
+                          cl::Hidden);
+
 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                      const X86Subtarget &STI)
     : TargetLowering(TM), Subtarget(STI) {
@@ -61619,3 +61624,98 @@ Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
     return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
   return TargetLowering::getPrefLoopAlignment();
 }
+
+// The target overrides this function to decide whether it wants to update the
+// base and index value of a non-uniform gep.
+bool X86TargetLowering::updateBaseAndIndex(const Value *Ptr, SDValue &Base,
+                                           SDValue &Index, const SDLoc &DL,
+                                           const SDValue &Gep,
+                                           SelectionDAG &DAG,
+                                           const BasicBlock *CurBB) const {
+  if (!EnableBaseIndexUpdate)
+    return false;
+
+  const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  if (GEP && GEP->getParent() != CurBB)
+    return false;
+
+  SDValue nbase;
+  /* For the gep instruction, we are trying to properly assign the base and
+     index value. We go through the lowered code and iterate backward.
+   */
+  if (Gep.getOpcode() == ISD::ADD) {
+    SDValue Op0 = Gep.getOperand(0); // base or add
+    SDValue Op1 = Gep.getOperand(1); // build vector or SHL
+    nbase = Op0;
+    SDValue Idx = Op1;
+    auto Flags = Gep->getFlags();
+
+    if (Op0->getOpcode() == ISD::ADD) { // add t15(base), t18(Idx)
+      SDValue Op00 = Op0.getOperand(0); // Base
+      nbase = Op00;
+      Idx = Op0.getOperand(1);
+    } else if (!(Op0->getOpcode() == ISD::BUILD_VECTOR &&
+                 Op0.getOperand(0).getOpcode() == ISD::CopyFromReg)) {
+      return false;
+    }
+    SDValue nIndex;
+    if (Idx.getOpcode() == ISD::SHL) { // shl zext, BV
+      SDValue Op10 = Idx.getOperand(0); // Zext or Sext value
+      SDValue Op11 = Idx.getOperand(1); // Build vector of constant
+
+      unsigned IndexWidth = Op10.getScalarValueSizeInBits();
+      if ((Op10.getOpcode() == ISD::SIGN_EXTEND ||
+           Op10.getOpcode() == ISD::ZERO_EXTEND) &&
+          IndexWidth > 32 &&
+          Op10.getOperand(0).getScalarValueSizeInBits() <= 32 &&
+          DAG.ComputeNumSignBits(Op10) > (IndexWidth - 32) &&
+          Op11.getOpcode() == ISD::BUILD_VECTOR) {
+        KnownBits ExtKnown = DAG.computeKnownBits(Op10);
+        bool ExtIsNonNegative = ExtKnown.isNonNegative();
+        KnownBits ExtOpKnown = DAG.computeKnownBits(Op10.getOperand(0));
+        bool ExtOpIsNonNegative = ExtOpKnown.isNonNegative();
+        if (!(ExtIsNonNegative && ExtOpIsNonNegative))
+          return false;
+
+        SDValue newOp10 =
+            Op10.getOperand(0); // Get the operand zero from the ext.
+        EVT VT = newOp10.getValueType(); // Use the pre-extension value type.
+
+        auto *ConstEltNo = dyn_cast<ConstantSDNode>(Op11.getOperand(0));
+        if (!ConstEltNo) {
+          return false;
+        }
+        SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(),
+                                     DAG.getConstant(ConstEltNo->getZExtValue(),
+                                                     DL, VT.getScalarType()));
+        nIndex = DAG.getNode(ISD::SHL, DL, VT, newOp10,
+                             DAG.getBuildVector(VT, DL, Ops));
+      } else {
+        return false;
+      }
+    } else {
+      return false;
+    }
+    if (Op0 != nbase) {
+      auto *ConstEltNo = dyn_cast<ConstantSDNode>(Op1.getOperand(0));
+      if (!ConstEltNo) {
+        return false;
+      }
+      SmallVector<SDValue, 16> Ops(
+          nIndex.getValueType().getVectorNumElements(),
+          DAG.getConstant(ConstEltNo->getZExtValue(), DL,
+                          nIndex.getValueType().getScalarType()));
+      nIndex = DAG.getNode(ISD::ADD, DL, nIndex.getValueType(), nIndex,
+                           DAG.getBuildVector(nIndex.getValueType(), DL, Ops),
+                           Flags);
+    }
+    Base = nbase.getOperand(0);
+    Index = nIndex;
+    LLVM_DEBUG(dbgs() << "Successful in updating the non-uniform gep "
+                         "information\n";
+               dbgs() << "updated base "; Base.dump();
+               dbgs() << "updated Index "; Index.dump(););
+    return true;
+  }
+  return false;
+}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 4a2b35e9efe7c..c092055329c58 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1671,6 +1671,13 @@ namespace llvm {
       return TargetLoweringBase::getTypeToTransformTo(Context, VT);
     }
 
+    // The target overrides this function to decide whether it wants to update
+    // the base and index value of a non-uniform gep.
+    bool updateBaseAndIndex(const Value *Ptr, SDValue &Base, SDValue &Index,
+                            const SDLoc &DL, const SDValue &Gep,
+                            SelectionDAG &DAG,
+                            const BasicBlock *CurBB) const override;
+
   protected:
     std::pair<const TargetRegisterClass *, uint8_t>
     findRepresentativeClass(const TargetRegisterInfo *TRI,
diff --git a/llvm/test/CodeGen/X86/gatherBaseIndexFix.ll b/llvm/test/CodeGen/X86/gatherBaseIndexFix.ll
new file mode 100644
index 0000000000000..95b74c5783e15
--- /dev/null
+++ b/llvm/test/CodeGen/X86/gatherBaseIndexFix.ll
@@ -0,0 +1,211 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -mcpu=znver5 < %s | FileCheck %s
+; RUN: llc -update-baseIndex -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -mcpu=znver5 < %s | FileCheck %s
+; RUN: llc -update-baseIndex=false -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -mcpu=znver5 < %s | FileCheck %s -check-prefix=OLD
+
+; ModuleID = 'qwdemo.c'
+source_filename = "qwdemo.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.pt = type { float, float, float, i32 }
+
+; Function Attrs: nofree nosync nounwind memory(argmem: readwrite) uwtable
+define dso_local i32 @foo(float noundef %cut_coulsq, ptr noalias nocapture noundef readonly %jlist, i32 noundef %jnum, ptr noalias nocapture noundef readonly %x, ptr noalias nocapture noundef writeonly %trsq, ptr noalias nocapture noundef writeonly %tdelx, ptr noalias nocapture noundef writeonly %tdely, ptr noalias nocapture noundef writeonly %tdelz, ptr noalias nocapture noundef writeonly %tjtype, ptr noalias nocapture noundef writeonly %tj, ptr noalias nocapture noundef readnone %tx, ptr noalias nocapture noundef readnone %ty, ptr noalias nocapture noundef readnone %tz) local_unnamed_addr #0 {
+entry:
+  %0 = load float, ptr %x, align 4, !tbaa !5
+  %y = getelementptr inbounds %struct.pt, ptr %x, i64 0, i32 1
+  %1 = load float, ptr %y, align 4, !tbaa !11
+  %z = getelementptr inbounds %struct.pt, ptr %x, i64 0, i32 2
+  %2 = load float, ptr %z, align 4, !tbaa !12
+  %cmp62 = icmp sgt i32 %jnum, 0
+  br i1 %cmp62, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %jnum to i64
+  %min.iters.check = icmp ult i32 %jnum, 16
+  br i1 %min.iters.check, label %for.body.preheader75, label %vector.ph
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i64 %wide.trip.count, 4294967280
+  %broadcast.splatinsert = insertelement <16 x float> poison, float %0, i64 0
+  %broadcast.splat = shufflevector <16 x float> %broadcast.splatinsert, <16 x float> poison, <16 x i32> zeroinitializer
+  %broadcast.splatinsert67 = insertelement <16 x float> poison, float %1, i64 0
+  %broadcast.splat68 = shufflevector <16 x float> %broadcast.splatinsert67, <16 x float> poison, <16 x i32> zeroinitializer
+  %broadcast.splatinsert70 = insertelement <16 x float> poison, float %2, i64 0
+  %broadcast.splat71 = shufflevector <16 x float> %broadcast.splatinsert70, <16 x float> poison, <16 x i32> zeroinitializer
+  %broadcast.splatinsert72 = insertelement <16 x float> poison, float %cut_coulsq, i64 0
+  %broadcast.splat73 = shufflevector <16 x float> %broadcast.splatinsert72, <16 x float> poison, <16 x i32> zeroinitializer
+  br label %vector.body
+
+; CHECK-LABEL: .LBB0_6:
+; CHECK: vgatherdps (%rdx,%zmm12), %zmm13 {%k1}
+; CHECK: vgatherdps (%rdx,%zmm14), %zmm15 {%k1}
+; CHECK: vgatherdps (%rdx,%zmm17), %zmm16 {%k1}
+
+; OLD-LABEL: .LBB0_6:
+
+; OLD: vgatherqps (%rdx,%zmm12), %ymm15 {%k1}
+; OLD: vgatherqps (%rdx,%zmm11), %ymm12 {%k1}
+; OLD: vgatherqps 4(,%zmm14), %ymm12 {%k1}
+; OLD: vgatherqps 4(,%zmm13), %ymm15 {%k1}
+; OLD: vgatherqps 8(,%zmm14), %ymm15 {%k1}
+; OLD: vgatherqps 8(,%zmm13), %ymm16 {%k1}
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %pred.index = phi i32 [ 0, %vector.ph ], [ %predphi, %vector.body ]
+  %3 = getelementptr inbounds i32, ptr %jlist, i64 %index
+  %wide.load = load <16 x i32>, ptr %3, align 4, !tbaa !13
+  %4 = and <16 x i32> %wide.load, splat (i32 536870911)
+  %5 = zext <16 x i32> %4 to <16 x i64>
+  %6 = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %5
+  %wide.masked.gather = tail call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %6, i32 4, <16 x i1> splat (i1 true), <16 x float> poison), !tbaa !5
+  %7 = fsub <16 x float> %broadcast.splat, %wide.masked.gather
+  %8 = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %5, i32 1
+  %wide.masked.gather66 = tail call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %8, i32 4, <16 x i1> splat (i1 true), <16 x float> poison), !tbaa !11
+  %9 = fsub <16 x float> %broadcast.splat68, %wide.masked.gather66
+  %10 = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %5, i32 2
+  %wide.masked.gather69 = tail call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %10, i32 4, <16 x i1> splat (i1 true), <16 x float> poison), !tbaa !12
+  %11 = fsub <16 x float> %broadcast.splat71, %wide.masked.gather69
+  %12 = fmul <16 x float> %9, %9
+  %13 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %7, <16 x float> %7, <16 x float> %12)
+  %14 = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %11, <16 x float> %11, <16 x float> %13)
+  %15 = fcmp olt <16 x float> %14, %broadcast.splat73
+  %16 = sext i32 %pred.index to i64
+  %17 = getelementptr float, ptr %trsq, i64 %16
+  tail call void @llvm.masked.compressstore.v16f32(<16 x float> %14, ptr %17, <16 x i1> %15), !tbaa !14
+  %18 = getelementptr float, ptr %tdelx, i64 %16
+  tail call void @llvm.masked.compressstore.v16f32(<16 x float> %7, ptr %18, <16 x i1> %15), !tbaa !14
+  %19 = getelementptr float, ptr %tdely, i64 %16
+  tail call void @llvm.masked.compressstore.v16f32(<16 x float> %9, ptr %19, <16 x i1> %15), !tbaa !14
+  %20 = getelementptr float, ptr %tdelz, i64 %16
+  tail call void @llvm.masked.compressstore.v16f32(<16 x float> %11, ptr 
%20, <16 x i1> %15), !tbaa !14 + %21 = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %5, i32 3 + %wide.masked.gather74 = tail call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %21, i32 4, <16 x i1> %15, <16 x i32> poison), !tbaa !15 + %22 = getelementptr i32, ptr %tjtype, i64 %16 + tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %wide.masked.gather74, ptr %22, <16 x i1> %15), !tbaa !13 + %23 = getelementptr i32, ptr %tj, i64 %16 + tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %wide.load, ptr %23, <16 x i1> %15), !tbaa !13 + %24 = bitcast <16 x i1> %15 to i16 + %mask.popcnt = tail call i16 @llvm.ctpop.i16(i16 %24), !range !16 + %popcnt.cmp.not = icmp eq i16 %24, 0 + %narrow = select i1 %popcnt.cmp.not, i16 0, i16 %mask.popcnt + %popcnt.inc = zext i16 %narrow to i32 + %predphi = add i32 %pred.index, %popcnt.inc + %index.next = add nuw i64 %index, 16 + %25 = icmp eq i64 %index.next, %n.vec + br i1 %25, label %middle.block, label %vector.body, !llvm.loop !17 + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i64 %n.vec, %wide.trip.count + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader75 + +for.body.preheader75: ; preds = %for.body.preheader, %middle.block + %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + %ej.064.ph = phi i32 [ 0, %for.body.preheader ], [ %predphi, %middle.block ] + br label %for.body + +for.cond.cleanup: ; preds = %if.end, %middle.block, %entry + %ej.0.lcssa = phi i32 [ 0, %entry ], [ %predphi, %middle.block ], [ %ej.1, %if.end ] + ret i32 %ej.0.lcssa + +for.body: ; preds = %for.body.preheader75, %if.end + %indvars.iv = phi i64 [ %indvars.iv.next, %if.end ], [ %indvars.iv.ph, %for.body.preheader75 ] + %ej.064 = phi i32 [ %ej.1, %if.end ], [ %ej.064.ph, %for.body.preheader75 ] + %arrayidx4 = getelementptr inbounds i32, ptr %jlist, i64 %indvars.iv + %26 = load i32, ptr %arrayidx4, align 4, !tbaa !13 + %and = and i32 %26, 536870911 + %idxprom5 = zext i32 %and to i64 + %arrayidx6 = getelementptr inbounds %struct.pt, ptr %x, i64 %idxprom5 + %27 = load float, ptr %arrayidx6, align 4, !tbaa !5 + %sub = fsub float %0, %27 + %y10 = getelementptr inbounds %struct.pt, ptr %x, i64 %idxprom5, i32 1 + %28 = load float, ptr %y10, align 4, !tbaa !11 + %sub11 = fsub float %1, %28 + %z14 = getelementptr inbounds %struct.pt, ptr %x, i64 %idxprom5, i32 2 + %29 = load float, ptr %z14, align 4, !tbaa !12 + %sub15 = fsub float %2, %29 + %mul16 = fmul float %sub11, %sub11 + %30 = tail call float @llvm.fmuladd.f32(float %sub, float %sub, float %mul16) + %31 = tail call float @llvm.fmuladd.f32(float %sub15, float %sub15, float %30) + %cmp17 = fcmp olt float %31, %cut_coulsq + br i1 %cmp17, label %if.then, label %if.end + +if.then: ; preds = %for.body + %idxprom18 = sext i32 %ej.064 to i64 + %arrayidx19 = getelementptr inbounds float, ptr %trsq, i64 %idxprom18 + store float %31, ptr %arrayidx19, align 4, !tbaa !14 + %arrayidx21 = getelementptr inbounds float, ptr %tdelx, i64 %idxprom18 + store float %sub, ptr %arrayidx21, align 4, !tbaa !14 + %arrayidx23 = getelementptr inbounds float, ptr %tdely, i64 %idxprom18 + store float %sub11, ptr %arrayidx23, align 4, !tbaa !14 + %arrayidx25 = getelementptr inbounds float, ptr %tdelz, i64 %idxprom18 + store float %sub15, ptr %arrayidx25, align 4, !tbaa !14 + %w = getelementptr inbounds %struct.pt, ptr %x, i64 %idxprom5, i32 3 + %32 = load i32, ptr %w, align 4, !tbaa !15 + %arrayidx29 = getelementptr inbounds i32, ptr %tjtype, i64 %idxprom18 + store i32 %32, ptr 
%arrayidx29, align 4, !tbaa !13 + %arrayidx33 = getelementptr inbounds i32, ptr %tj, i64 %idxprom18 + store i32 %26, ptr %arrayidx33, align 4, !tbaa !13 + %inc = add nsw i32 %ej.064, 1 + br label %if.end + +if.end: ; preds = %if.then, %for.body + %ej.1 = phi i32 [ %inc, %if.then ], [ %ej.064, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !21 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.fmuladd.f32(float, float, float) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read) +declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32 immarg, <16 x i1>, <16 x float>) #2 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare <16 x float> @llvm.fmuladd.v16f32(<16 x float>, <16 x float>, <16 x float>) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr nocapture, <16 x i1>) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read) +declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32 immarg, <16 x i1>, <16 x i32>) #2 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr nocapture, <16 x i1>) #4 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i16 @llvm.ctpop.i16(i16) #3 + +attributes #0 = { nofree nosync nounwind memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="znver5" "target-features"="+adx,+aes,+avx,+avx2,+avx512bf16,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512ifma,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,+avx512vp2intersect,+avx512vpopcntdq,+avxvnni,+bmi,+bmi2,+clflushopt,+clwb,+clzero,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+gfni,+invpcid,+lzcnt,+mmx,+movbe,+movdir64b,+movdiri,+mwaitx,+pclmul,+pku,+popcnt,+prefetchi,+prfchw,+rdpid,+rdpru,+rdrnd,+rdseed,+sahf,+sha,+shstk,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,+ssse3,+vaes,+vpclmulqdq,+wbnoinvd,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind willreturn memory(read) } +attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 8, !"PIC Level", i32 2} +!2 = !{i32 7, !"PIE Level", i32 2} +!3 = !{i32 7, !"uwtable", i32 2} +!4 = !{!"clang version 17.0.6 (CLANG: Unknown-Revision)"} +!5 = !{!6, !7, i64 0} +!6 = !{!"pt", !7, i64 0, !7, i64 4, !7, i64 8, !10, i64 12} +!7 = !{!"float", !8, i64 0} +!8 = !{!"omnipotent char", !9, i64 0} +!9 = !{!"Simple C/C++ TBAA"} +!10 = !{!"int", !8, i64 0} +!11 = !{!6, !7, i64 4} +!12 = !{!6, !7, i64 8} +!13 = !{!10, !10, i64 0} +!14 = !{!7, !7, i64 0} +!15 = !{!6, !10, i64 12} +!16 = !{i16 0, i16 17} +!17 = distinct !{!17, !18, !19, !20} +!18 = !{!"llvm.loop.mustprogress"} +!19 = !{!"llvm.loop.isvectorized", i32 1} 
+!20 = !{!"llvm.loop.unroll.runtime.disable"} +!21 = distinct !{!21, !18, !20, !19} From f516be22d55226d5c31d60dfdc42278bdf329468 Mon Sep 17 00:00:00 2001 From: Rohit Aggarwal Date: Wed, 16 Apr 2025 13:55:35 +0530 Subject: [PATCH 3/4] Update the masked_gather_scatter.ll --- .../test/CodeGen/X86/masked_gather_scatter.ll | 211 ++++-------------- 1 file changed, 40 insertions(+), 171 deletions(-) diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 46e589b7b1be9..8bb2ae416b6b9 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -4819,18 +4819,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero -; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-KNL-NEXT: vpsllq $4, %zmm0, %zmm0 -; X64-KNL-NEXT: vpsllq $4, %zmm2, %zmm2 -; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 -; X64-KNL-NEXT: kshiftrw $8, %k1, %k2 -; X64-KNL-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2} -; X64-KNL-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1} -; X64-KNL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0 +; X64-KNL-NEXT: vpslld $4, (%rsi), %zmm0 +; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0), %zmm1 {%k1} +; X64-KNL-NEXT: vmovaps %zmm1, %zmm0 ; X64-KNL-NEXT: retq ; ; X86-KNL-LABEL: test_gather_structpt_16f32_mask_index: @@ -4845,44 +4836,15 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 ; X86-KNL-NEXT: vmovaps %zmm1, %zmm0 ; X86-KNL-NEXT: retl ; -; X64-SKX-SMALL-LABEL: test_gather_structpt_16f32_mask_index: -; X64-SKX-SMALL: # %bb.0: -; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero -; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm2, %zmm2 -; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 -; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2 -; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2} -; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1} -; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0 -; X64-SKX-SMALL-NEXT: retq -; -; X64-SKX-LARGE-LABEL: test_gather_structpt_16f32_mask_index: -; X64-SKX-LARGE: # %bb.0: -; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, 
%zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero -; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm2, %zmm2 -; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3 -; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2 -; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2} -; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1} -; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0 -; X64-SKX-LARGE-NEXT: retq +; X64-SKX-LABEL: test_gather_structpt_16f32_mask_index: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0 +; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0 +; X64-SKX-NEXT: vpmovd2m %zmm0, %k1 +; X64-SKX-NEXT: vpslld $4, (%rsi), %zmm0 +; X64-SKX-NEXT: vgatherdps (%rdi,%zmm0), %zmm1 {%k1} +; X64-SKX-NEXT: vmovaps %zmm1, %zmm0 +; X64-SKX-NEXT: retq ; ; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index: ; X86-SKX: # %bb.0: @@ -4909,18 +4871,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero -; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-KNL-NEXT: vpsllq $4, %zmm0, %zmm0 -; X64-KNL-NEXT: vpsllq $4, %zmm2, %zmm2 -; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 -; X64-KNL-NEXT: kshiftrw $8, %k1, %k2 -; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2} -; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1} -; X64-KNL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0 +; X64-KNL-NEXT: vpslld $4, (%rsi), %zmm0 +; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0), %zmm1 {%k1} +; X64-KNL-NEXT: vmovaps %zmm1, %zmm0 ; X64-KNL-NEXT: retq ; ; X86-KNL-LABEL: test_gather_structpt_16f32_mask_index_offset: @@ -4935,44 +4888,15 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a ; X86-KNL-NEXT: vmovaps %zmm1, %zmm0 ; X86-KNL-NEXT: retl ; -; X64-SKX-SMALL-LABEL: test_gather_structpt_16f32_mask_index_offset: -; X64-SKX-SMALL: # %bb.0: -; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero -; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm2, %zmm2 -; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 -; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2 -; X64-SKX-SMALL-NEXT: 
vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2} -; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1} -; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0 -; X64-SKX-SMALL-NEXT: retq -; -; X64-SKX-LARGE-LABEL: test_gather_structpt_16f32_mask_index_offset: -; X64-SKX-LARGE: # %bb.0: -; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero -; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm2, %zmm2 -; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3 -; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2 -; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2} -; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1} -; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0 -; X64-SKX-LARGE-NEXT: retq +; X64-SKX-LABEL: test_gather_structpt_16f32_mask_index_offset: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0 +; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0 +; X64-SKX-NEXT: vpmovd2m %zmm0, %k1 +; X64-SKX-NEXT: vpslld $4, (%rsi), %zmm0 +; X64-SKX-NEXT: vgatherdps 4(%rdi,%zmm0), %zmm1 {%k1} +; X64-SKX-NEXT: vmovaps %zmm1, %zmm0 +; X64-SKX-NEXT: retq ; ; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index_offset: ; X86-SKX: # %bb.0: @@ -4999,25 +4923,11 @@ define {<16 x float>, <16 x float>} @test_gather_16f32_mask_index_pair(ptr %x, p ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero -; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-KNL-NEXT: vpsllq $4, %zmm0, %zmm3 -; X64-KNL-NEXT: vpsllq $4, %zmm2, %zmm2 -; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm4 -; X64-KNL-NEXT: kshiftrw $8, %k1, %k2 -; X64-KNL-NEXT: kmovw %k2, %k3 -; X64-KNL-NEXT: vmovaps %ymm4, %ymm0 -; X64-KNL-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3} -; X64-KNL-NEXT: vmovaps %ymm1, %ymm5 -; X64-KNL-NEXT: kmovw %k1, %k3 -; X64-KNL-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3} -; X64-KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0 -; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2} -; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1} -; X64-KNL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1 +; X64-KNL-NEXT: vpslld $4, (%rsi), %zmm2 +; X64-KNL-NEXT: kmovw %k1, %k2 +; X64-KNL-NEXT: vmovaps %zmm1, %zmm0 +; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2), %zmm0 {%k2} +; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm2), %zmm1 {%k1} ; X64-KNL-NEXT: retq ; ; X86-KNL-LABEL: test_gather_16f32_mask_index_pair: @@ -5034,58 +4944,17 @@ define {<16 x float>, <16 x float>} @test_gather_16f32_mask_index_pair(ptr %x, p ; X86-KNL-NEXT: vgatherdps 
4(%eax,%zmm2), %zmm1 {%k1} ; X86-KNL-NEXT: retl ; -; X64-SKX-SMALL-LABEL: test_gather_16f32_mask_index_pair: -; X64-SKX-SMALL: # %bb.0: -; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero -; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm0, %zmm3 -; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm2, %zmm2 -; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm4 -; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2 -; X64-SKX-SMALL-NEXT: kmovw %k2, %k3 -; X64-SKX-SMALL-NEXT: vmovaps %ymm4, %ymm0 -; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3} -; X64-SKX-SMALL-NEXT: vmovaps %ymm1, %ymm5 -; X64-SKX-SMALL-NEXT: kmovw %k1, %k3 -; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3} -; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0 -; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2} -; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1} -; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1 -; X64-SKX-SMALL-NEXT: retq -; -; X64-SKX-LARGE-LABEL: test_gather_16f32_mask_index_pair: -; X64-SKX-LARGE: # %bb.0: -; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero -; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm0, %zmm3 -; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm2, %zmm2 -; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm4 -; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2 -; X64-SKX-LARGE-NEXT: vmovaps %ymm4, %ymm0 -; X64-SKX-LARGE-NEXT: kmovw %k2, %k3 -; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3} -; X64-SKX-LARGE-NEXT: vmovaps %ymm1, %ymm5 -; X64-SKX-LARGE-NEXT: kmovw %k1, %k3 -; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3} -; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0 -; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2} -; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1} -; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1 -; X64-SKX-LARGE-NEXT: retq +; X64-SKX-LABEL: test_gather_16f32_mask_index_pair: +; X64-SKX: # %bb.0: +; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0 +; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0 +; X64-SKX-NEXT: vpmovd2m %zmm0, %k1 +; X64-SKX-NEXT: vpslld $4, (%rsi), %zmm2 +; X64-SKX-NEXT: kmovw %k1, %k2 +; X64-SKX-NEXT: vmovaps %zmm1, %zmm0 +; X64-SKX-NEXT: vgatherdps (%rdi,%zmm2), %zmm0 {%k2} +; X64-SKX-NEXT: vgatherdps 4(%rdi,%zmm2), %zmm1 {%k1} +; X64-SKX-NEXT: retq ; ; X86-SKX-LABEL: test_gather_16f32_mask_index_pair: ; X86-SKX: # %bb.0: From 
c2848c2cea0a237ce69ca5602570ad7cf67f7f55 Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <Rohit.Aggarwal@amd.com>
Date: Wed, 16 Apr 2025 15:01:33 +0530
Subject: [PATCH 4/4] Remove redundant gatherBaseIndexFix.ll

---
 llvm/test/CodeGen/X86/gatherBaseIndexFix.ll | 38 ---------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 llvm/test/CodeGen/X86/gatherBaseIndexFix.ll

diff --git a/llvm/test/CodeGen/X86/gatherBaseIndexFix.ll b/llvm/test/CodeGen/X86/gatherBaseIndexFix.ll
deleted file mode 100644
index 5983066c1f13f..0000000000000
--- a/llvm/test/CodeGen/X86/gatherBaseIndexFix.ll
+++ /dev/null
@@ -1,38 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -mcpu=znver5 < %s | FileCheck %s
-
-%struct.pt = type { float, float, float, i32 }
-
-define <16 x float> @test_gather_16f32_1(ptr %x, ptr %arr, <16 x i1> %mask, <16 x float> %src0) {
-; CHECK-LABEL: test_gather_16f32_1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpslld $4, (%rsi), %zmm2
-; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
-; CHECK-NEXT: vpmovb2m %xmm0, %k1
-; CHECK-NEXT: vgatherdps (%rdi,%zmm2), %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
-  %wide.load = load <16 x i32>, ptr %arr, align 4
-  %and = and <16 x i32> %wide.load, splat (i32 536870911)
-  %zext = zext <16 x i32> %and to <16 x i64>
-  %ptrs = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %zext
-  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
-  ret <16 x float> %res
- }
-
-define <16 x float> @test_gather_16f32_2(ptr %x, ptr %arr, <16 x i1> %mask, <16 x float> %src0) {
-; CHECK-LABEL: test_gather_16f32_2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpslld $4, (%rsi), %zmm2
-; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
-; CHECK-NEXT: vpmovb2m %xmm0, %k1
-; CHECK-NEXT: vgatherdps 4(%rdi,%zmm2), %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
-  %wide.load = load <16 x i32>, ptr %arr, align 4
-  %and = and <16 x i32> %wide.load, splat (i32 536870911)
-  %zext = zext <16 x i32> %and to <16 x i64>
-  %ptrs = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %zext, i32 1
-  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
-  ret <16 x float> %res
- }
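A closing note on the legality guards: updateBaseAndIndex() only fires when
computeKnownBits()/ComputeNumSignBits() prove the pre-extension index is
non-negative, because the narrowed dword index is sign-extended by the
gather instruction while the original gep zero-extends it. A standalone
sketch of that condition (illustrative only, not part of the patch):

  // sign_guard.cpp -- why the rewrite requires provably non-negative
  // indices: sext and zext agree exactly when the i32 sign bit is clear.
  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t ok = 42u & 536870911u;  // masked index, sign bit clear
    assert(int64_t(int32_t(ok)) == int64_t(uint64_t(ok)));    // sext == zext

    uint32_t bad = 0x80000001u;      // sign bit set
    assert(int64_t(int32_t(bad)) != int64_t(uint64_t(bad)));  // they diverge
    return 0;
  }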