diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
index f1e974f973cbe..a71e6bbb93638 100644
--- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -63,8 +63,7 @@ class RISCVGatherScatterLowering : public FunctionPass {
   }
 
 private:
-  bool tryCreateStridedLoadStore(IntrinsicInst *II, Type *DataType, Value *Ptr,
-                                 Value *AlignOp);
+  bool tryCreateStridedLoadStore(IntrinsicInst *II);
 
   std::pair<Value *, Value *> determineBaseAndStride(Instruction *Ptr,
                                                      IRBuilderBase &Builder);
@@ -483,12 +482,46 @@ RISCVGatherScatterLowering::determineBaseAndStride(Instruction *Ptr,
   return P;
 }
 
-bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II,
-                                                           Type *DataType,
-                                                           Value *Ptr,
-                                                           Value *AlignOp) {
+bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II) {
+  VectorType *DataType;
+  Value *StoreVal = nullptr, *Ptr, *Mask, *EVL = nullptr;
+  MaybeAlign MA;
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::masked_gather:
+    DataType = cast<VectorType>(II->getType());
+    Ptr = II->getArgOperand(0);
+    MA = cast<ConstantInt>(II->getArgOperand(1))->getMaybeAlignValue();
+    Mask = II->getArgOperand(2);
+    break;
+  case Intrinsic::vp_gather:
+    DataType = cast<VectorType>(II->getType());
+    Ptr = II->getArgOperand(0);
+    MA = II->getParamAlign(0).value_or(
+        DL->getABITypeAlign(DataType->getElementType()));
+    Mask = II->getArgOperand(1);
+    EVL = II->getArgOperand(2);
+    break;
+  case Intrinsic::masked_scatter:
+    DataType = cast<VectorType>(II->getArgOperand(0)->getType());
+    StoreVal = II->getArgOperand(0);
+    Ptr = II->getArgOperand(1);
+    MA = cast<ConstantInt>(II->getArgOperand(2))->getMaybeAlignValue();
+    Mask = II->getArgOperand(3);
+    break;
+  case Intrinsic::vp_scatter:
+    DataType = cast<VectorType>(II->getArgOperand(0)->getType());
+    StoreVal = II->getArgOperand(0);
+    Ptr = II->getArgOperand(1);
+    MA = II->getParamAlign(1).value_or(
+        DL->getABITypeAlign(DataType->getElementType()));
+    Mask = II->getArgOperand(2);
+    EVL = II->getArgOperand(3);
+    break;
+  default:
+    llvm_unreachable("Unexpected intrinsic");
+  }
+
   // Make sure the operation will be supported by the backend.
-  MaybeAlign MA = cast<ConstantInt>(AlignOp)->getMaybeAlignValue();
   EVT DataTypeVT = TLI->getValueType(*DL, DataType);
   if (!MA || !TLI->isLegalStridedLoadStore(DataTypeVT, *MA))
     return false;
@@ -514,23 +547,27 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II,
 
   Builder.SetInsertPoint(II);
 
-  Value *EVL = Builder.CreateElementCount(
-      IntegerType::get(Ctx, 32), cast<VectorType>(DataType)->getElementCount());
+  if (!EVL)
+    EVL = Builder.CreateElementCount(
+        Builder.getInt32Ty(), cast<VectorType>(DataType)->getElementCount());
 
   CallInst *Call;
-  if (II->getIntrinsicID() == Intrinsic::masked_gather) {
+
+  if (!StoreVal) {
     Call = Builder.CreateIntrinsic(
         Intrinsic::experimental_vp_strided_load,
         {DataType, BasePtr->getType(), Stride->getType()},
-        {BasePtr, Stride, II->getArgOperand(2), EVL});
-    Call = Builder.CreateIntrinsic(
-        Intrinsic::vp_select, {DataType},
-        {II->getOperand(2), Call, II->getArgOperand(3), EVL});
+        {BasePtr, Stride, Mask, EVL});
+
+    // Merge llvm.masked.gather's passthru
+    if (II->getIntrinsicID() == Intrinsic::masked_gather)
+      Call = Builder.CreateIntrinsic(Intrinsic::vp_select, {DataType},
+                                     {Mask, Call, II->getArgOperand(3), EVL});
   } else
     Call = Builder.CreateIntrinsic(
         Intrinsic::experimental_vp_strided_store,
         {DataType, BasePtr->getType(), Stride->getType()},
-        {II->getArgOperand(0), BasePtr, Stride, II->getArgOperand(3), EVL});
+        {StoreVal, BasePtr, Stride, Mask, EVL});
 
   Call->takeName(II);
   II->replaceAllUsesWith(Call);
@@ -558,30 +595,31 @@ bool RISCVGatherScatterLowering::runOnFunction(Function &F) {
 
   StridedAddrs.clear();
 
-  SmallVector<IntrinsicInst *> Gathers;
-  SmallVector<IntrinsicInst *> Scatters;
+  SmallVector<IntrinsicInst *> Worklist;
 
   bool Changed = false;
 
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
      IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
-      if (II && II->getIntrinsicID() == Intrinsic::masked_gather) {
-        Gathers.push_back(II);
-      } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter) {
-        Scatters.push_back(II);
+      if (!II)
+        continue;
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::masked_gather:
+      case Intrinsic::masked_scatter:
+      case Intrinsic::vp_gather:
+      case Intrinsic::vp_scatter:
+        Worklist.push_back(II);
+        break;
+      default:
+        break;
      }
    }
  }
 
   // Rewrite gather/scatter to form strided load/store if possible.
-  for (auto *II : Gathers)
-    Changed |= tryCreateStridedLoadStore(
-        II, II->getType(), II->getArgOperand(0), II->getArgOperand(1));
-  for (auto *II : Scatters)
-    Changed |=
-        tryCreateStridedLoadStore(II, II->getArgOperand(0)->getType(),
-                                  II->getArgOperand(1), II->getArgOperand(2));
+  for (auto *II : Worklist)
+    Changed |= tryCreateStridedLoadStore(II);
 
   // Remove any dead phis.
   while (!MaybeDeadPHIs.empty()) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
index 2cbbfc019ab4d..83a9b23a387d2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
@@ -1030,3 +1030,114 @@ vector.body:                                      ; preds = %vector.body, %entry
 for.cond.cleanup:                                 ; preds = %vector.body
   ret void
 }
+
+define void @vp_gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
+; CHECK-LABEL: @vp_gather(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR1]]
+; CHECK-NEXT:    [[ELEMS:%.*]] = sub i64 1024, [[VEC_IND_SCALAR]]
+; CHECK-NEXT:    [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 32, i1 false)
+; CHECK-NEXT:    [[ODD:%.*]] = and <32 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT:    [[MASK:%.*]] = icmp ne <32 x i64> [[ODD]], zeroinitializer
+; CHECK-NEXT:    [[WIDE_VP_GATHER:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> [[MASK]], i32 [[EVL]])
+; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
+; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_VP_GATHER]]
+; CHECK-NEXT:    store <32 x i8> [[I4]], ptr [[I2]], align 1
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], 160
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <32 x i64> [[VEC_IND]], splat (i64 32)
+; CHECK-NEXT:    [[I6:%.*]] = icmp eq i64 [[VEC_IND_NEXT_SCALAR]], 1024
+; CHECK-NEXT:    br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
+  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
+  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
+
+  %elems = sub i64 1024, %index
+  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %elems, i32 32, i1 false)
+
+  %odd = and <32 x i64> %vec.ind, splat (i64 1)
+  %mask = icmp ne <32 x i64> %odd, splat (i64 0)
+
+  %wide.vp.gather = call <32 x i8> @llvm.vp.gather(<32 x ptr> %i1, <32 x i1> %mask, i32 %evl)
+  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
+  %wide.load = load <32 x i8>, ptr %i2, align 1
+  %i4 = add <32 x i8> %wide.load, %wide.vp.gather
+  store <32 x i8> %i4, ptr %i2, align 1
+  %index.next = add nuw i64 %index, 32
+  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
+  %i6 = icmp eq i64 %index.next, 1024
+  br i1 %i6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+define void @vp_scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
+; CHECK-LABEL: @vp_scatter(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[I:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]]
+; CHECK-NEXT:    [[ELEMS:%.*]] = sub i64 1024, [[VEC_IND_SCALAR]]
+; CHECK-NEXT:    [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 32, i1 false)
+; CHECK-NEXT:    [[ODD:%.*]] = and <32 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT:    [[MASK:%.*]] = icmp ne <32 x i64> [[ODD]], zeroinitializer
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> [[MASK]], i32 [[EVL]])
+; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v32i8.p0.i64(<32 x i8> [[I4]], ptr [[TMP0]], i64 5, <32 x i1> [[MASK]], i32 [[EVL]])
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], 32
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], 160
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <32 x i64> [[VEC_IND]], splat (i64 32)
+; CHECK-NEXT:    [[I5:%.*]] = icmp eq i64 [[VEC_IND_NEXT_SCALAR]], 1024
+; CHECK-NEXT:    br i1 [[I5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
+  %i = getelementptr inbounds i8, ptr %B, i64 %index
+  %wide.load = load <32 x i8>, ptr %i, align 1
+  %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
+  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
+
+
+  %elems = sub i64 1024, %index
+  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %elems, i32 32, i1 false)
+
+  %odd = and <32 x i64> %vec.ind, splat (i64 1)
+  %mask = icmp ne <32 x i64> %odd, splat (i64 0)
+
+  %wide.masked.gather = call <32 x i8> @llvm.vp.gather(<32 x ptr> %i3, <32 x i1> %mask, i32 %evl)
+  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
+  call void @llvm.vp.scatter(<32 x i8> %i4, <32 x ptr> %i3, <32 x i1> %mask, i32 %evl)
+  %index.next = add nuw i64 %index, 32
+  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
+  %i5 = icmp eq i64 %index.next, 1024
+  br i1 %i5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
index b1ece9fa8272d..7c1fab9bfe91a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
@@ -398,3 +398,126 @@ define <vscale x 1 x i64> @vector_base_vector_offset(ptr %p,
 declare i64 @llvm.vscale.i64()
 declare void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64>, <vscale x 1 x ptr>, i32, <vscale x 1 x i1>)
 declare <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr>, i32, <vscale x 1 x i1>, <vscale x 1 x i64>)
+
+
+; TODO: Make the step loop variant to reflect what the loop vectorizer will emit
+; in an EVL tail folding configuration.
+
+define <vscale x 1 x i64> @vp_gather(ptr %a, i32 %len) {
+; CHECK-LABEL: @vp_gather(
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP0]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACCUM:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]]
+; CHECK-NEXT:    [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true)
+; CHECK-NEXT:    [[ODD:%.*]] = and <vscale x 1 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT:    [[MASK:%.*]] = icmp ne <vscale x 1 x i64> [[ODD]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP2]], i64 16, <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; CHECK-NEXT:    [[ACCUM_NEXT]] = add <vscale x 1 x i64> [[ACCUM]], [[GATHER]]
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[TMP0]]
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[TMP0]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret <vscale x 1 x i64> [[ACCUM_NEXT]]
+;
+vector.ph:
+  %wide.trip.count = zext i32 %len to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+  %.splatinsert = insertelement <vscale x 1 x i64> poison, i64 %0, i64 0
+  %.splat = shufflevector <vscale x 1 x i64> %.splatinsert, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <vscale x 1 x i64> [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %accum = phi <vscale x 1 x i64> [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ]
+
+  %elems = sub i64 %wide.trip.count, %index
+  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %elems, i32 1, i1 true)
+
+  %odd = and <vscale x 1 x i64> %vec.ind, splat (i64 1)
+  %mask = icmp ne <vscale x 1 x i64> %odd, splat (i64 0)
+
+  %2 = getelementptr inbounds %struct.foo, ptr %a, <vscale x 1 x i64> %vec.ind, i32 3
+  %gather = call <vscale x 1 x i64> @llvm.vp.gather(<vscale x 1 x ptr> %2, <vscale x 1 x i1> %mask, i32 %evl)
+  %accum.next = add <vscale x 1 x i64> %accum, %gather
+  %index.next = add nuw i64 %index, %0
+  %vec.ind.next = add <vscale x 1 x i64> %vec.ind, %.splat
+  %3 = icmp ne i64 %index.next, %wide.trip.count
+  br i1 %3, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret <vscale x 1 x i64> %accum.next
+}
+
+; TODO: Make the step loop variant to reflect what the loop vectorizer will emit
+; in an EVL tail folding configuration.
+
+define void @vp_scatter(ptr %a, i32 %len) {
+; CHECK-LABEL: @vp_scatter(
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP0]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]]
+; CHECK-NEXT:    [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true)
+; CHECK-NEXT:    [[ODD:%.*]] = and <vscale x 1 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT:    [[MASK:%.*]] = icmp ne <vscale x 1 x i64> [[ODD]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i64(<vscale x 1 x i64> zeroinitializer, ptr [[TMP2]], i64 16, <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[TMP0]]
+; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[TMP0]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+vector.ph:
+  %wide.trip.count = zext i32 %len to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+  %.splatinsert = insertelement <vscale x 1 x i64> poison, i64 %0, i64 0
+  %.splat = shufflevector <vscale x 1 x i64> %.splatinsert, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <vscale x 1 x i64> [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ]
+
+  %elems = sub i64 %wide.trip.count, %index
+  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %elems, i32 1, i1 true)
+
+  %odd = and <vscale x 1 x i64> %vec.ind, splat (i64 1)
+  %mask = icmp ne <vscale x 1 x i64> %odd, splat (i64 0)
+
+  %2 = getelementptr inbounds %struct.foo, ptr %a, <vscale x 1 x i64> %vec.ind, i32 3
+  tail call void @llvm.vp.scatter(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x ptr> %2, <vscale x 1 x i1> %mask, i32 %evl)
+  %index.next = add nuw i64 %index, %0
+  %vec.ind.next = add <vscale x 1 x i64> %vec.ind, %.splat
+  %3 = icmp ne i64 %index.next, %wide.trip.count
+  br i1 %3, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}