From cd9276319c08be8bf16b1bae7d4b69b9e52551b3 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 9 Jan 2025 18:17:28 +0800 Subject: [PATCH 1/4] Precommit test --- .../CodeGen/RISCV/rvv/strided-load-store.ll | 228 +++++++++++++++++- 1 file changed, 216 insertions(+), 12 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll index f777c450bc106..023372dc19d22 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll @@ -105,6 +105,112 @@ for.cond.cleanup: ; preds = %vector.body ret %accum.next } +define @gather_non_invariant_step(ptr %a, ptr %b, i32 %len) { +; CHECK-LABEL: @gather_non_invariant_step( +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.stepvector.nxv1i64() +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND]], i32 3 +; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[TMP1]], i32 8, splat (i1 true), undef) +; CHECK-NEXT: [[ACCUM_NEXT]] = add [[ACCUM]], [[GATHER]] +; CHECK-NEXT: [[B:%.*]] = getelementptr i64, ptr [[B1:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[STEP:%.*]] = load i64, ptr [[B]], align 8 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[STEP]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[STEP]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; 
CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret [[ACCUM_NEXT]] +; +vector.ph: + %wide.trip.count = zext i32 %len to i64 + %1 = tail call @llvm.stepvector.nxv1i64() + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] + %accum = phi [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ] + %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind, i32 3 + %gather = call @llvm.masked.gather.nxv1i64.nxv1p0( %2, i32 8, splat (i1 true), undef) + %accum.next = add %accum, %gather + + %b.gep = getelementptr i64, ptr %b, i64 %index + %step = load i64, ptr %b.gep + %index.next = add nuw i64 %index, %step + %.splatinsert = insertelement poison, i64 %step, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %vec.ind.next = add %vec.ind, %.splat + %3 = icmp ne i64 %index.next, %wide.trip.count + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret %accum.next +} + +define @gather_non_invariant_step_shl(ptr %a, ptr %b, i32 %len) { +; CHECK-LABEL: @gather_non_invariant_step_shl( +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.stepvector.nxv1i64() +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ 
[[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_ADD:%.*]] = add [[VEC_IND]], splat (i64 42) +; CHECK-NEXT: [[VEC_IND_SHL:%.*]] = shl [[VEC_IND_ADD]], splat (i64 2) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND_SHL]], i32 3 +; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[TMP1]], i32 8, splat (i1 true), undef) +; CHECK-NEXT: [[ACCUM_NEXT]] = add [[ACCUM]], [[GATHER]] +; CHECK-NEXT: [[B:%.*]] = getelementptr i64, ptr [[B1:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[STEP:%.*]] = load i64, ptr [[B]], align 8 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[STEP]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[STEP]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret [[ACCUM_NEXT]] +; +vector.ph: + %wide.trip.count = zext i32 %len to i64 + %1 = tail call @llvm.stepvector.nxv1i64() + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] + %accum = phi [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ] + + %vec.ind.add = add %vec.ind, splat (i64 42) + %vec.ind.shl = shl %vec.ind.add, splat (i64 2) + + %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind.shl, i32 3 + %gather = call @llvm.masked.gather.nxv1i64.nxv1p0( %2, i32 8, splat (i1 true), undef) + %accum.next = add %accum, %gather + + %b.gep = getelementptr i64, ptr %b, i64 %index + %step = load i64, ptr %b.gep + %index.next = add nuw i64 %index, %step + %.splatinsert = 
insertelement poison, i64 %step, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %vec.ind.next = add %vec.ind, %.splat + %3 = icmp ne i64 %index.next, %wide.trip.count + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret %accum.next +} + define void @scatter(ptr %a, i32 %len) { ; CHECK-LABEL: @scatter( ; CHECK-NEXT: vector.ph: @@ -146,6 +252,104 @@ for.cond.cleanup: ; preds = %vector.body ret void } +define void @scatter_non_invariant_step(ptr %a, ptr %b, i32 %len) { +; CHECK-LABEL: @scatter_non_invariant_step( +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.stepvector.nxv1i64() +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND]], i32 3 +; CHECK-NEXT: tail call void @llvm.masked.scatter.nxv1i64.nxv1p0( zeroinitializer, [[TMP1]], i32 8, splat (i1 true)) +; CHECK-NEXT: [[B:%.*]] = getelementptr i64, ptr [[B1:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[STEP:%.*]] = load i64, ptr [[B]], align 8 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[STEP]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[STEP]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +vector.ph: + 
%wide.trip.count = zext i32 %len to i64 + %1 = tail call @llvm.stepvector.nxv1i64() + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] + %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind, i32 3 + tail call void @llvm.masked.scatter.nxv1i64.nxv1p0( zeroinitializer, %2, i32 8, splat (i1 true)) + + %b.gep = getelementptr i64, ptr %b, i64 %index + %step = load i64, ptr %b.gep + %index.next = add nuw i64 %index, %step + %.splatinsert = insertelement poison, i64 %step, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %vec.ind.next = add %vec.ind, %.splat + %3 = icmp ne i64 %index.next, %wide.trip.count + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @scatter_non_invariant_step_add_shl(ptr %a, ptr %b, i32 %len) { +; CHECK-LABEL: @scatter_non_invariant_step_add_shl( +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.stepvector.nxv1i64() +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_ADD:%.*]] = add [[VEC_IND]], splat (i64 42) +; CHECK-NEXT: [[VEC_IND_SHL:%.*]] = shl [[VEC_IND_ADD]], splat (i64 2) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND_SHL]], i32 3 +; CHECK-NEXT: tail call void @llvm.masked.scatter.nxv1i64.nxv1p0( zeroinitializer, [[TMP1]], i32 8, splat (i1 true)) +; CHECK-NEXT: [[B:%.*]] = getelementptr i64, ptr [[B1:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[STEP:%.*]] = load i64, 
ptr [[B]], align 8 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[STEP]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[STEP]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +vector.ph: + %wide.trip.count = zext i32 %len to i64 + %1 = tail call @llvm.stepvector.nxv1i64() + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] + + %vec.ind.add = add %vec.ind, splat (i64 42) + %vec.ind.shl = shl %vec.ind.add, splat (i64 2) + + %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind.shl, i32 3 + tail call void @llvm.masked.scatter.nxv1i64.nxv1p0( zeroinitializer, %2, i32 8, splat (i1 true)) + + %b.gep = getelementptr i64, ptr %b, i64 %index + %step = load i64, ptr %b.gep + %index.next = add nuw i64 %index, %step + %.splatinsert = insertelement poison, i64 %step, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %vec.ind.next = add %vec.ind, %.splat + %3 = icmp ne i64 %index.next, %wide.trip.count + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + define @gather_loopless(ptr %p, i64 %stride) { ; CHECK-LABEL: @gather_loopless( ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STRIDE:%.*]], 4 @@ -491,23 +695,23 @@ define @evl_gather(ptr %a, i32 %len) { ; CHECK-LABEL: @evl_gather( ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.stepvector.nxv1i64() +; CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.stepvector.nxv1i64() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[INDEX]] +; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]] ; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true) -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND]], i32 3 -; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.vp.gather.nxv1i64.nxv1p0( [[TMP2]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND]], i32 3 +; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.vp.gather.nxv1i64.nxv1p0( [[TMP1]], splat (i1 true), i32 [[EVL]]) ; CHECK-NEXT: [[ACCUM_NEXT]] = add [[ACCUM]], [[GATHER]] ; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[EVL_ZEXT]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[EVL_ZEXT]] ; CHECK-NEXT: [[EVL_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_ZEXT]], i64 0 ; CHECK-NEXT: [[EVL_SPLAT:%.*]] = shufflevector [[EVL_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[EVL_SPLAT]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 
[[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret [[ACCUM_NEXT]] @@ -551,18 +755,18 @@ define void @evl_scatter(ptr %a, i32 %len) { ; CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.stepvector.nxv1i64() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR1]] ; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND]], i32 3 ; CHECK-NEXT: tail call void @llvm.vp.scatter.nxv1i64.nxv1p0( zeroinitializer, [[TMP1]], splat (i1 true), i32 [[EVL]]) ; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64 -; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[EVL_ZEXT]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add nuw i64 [[VEC_IND_SCALAR1]], [[EVL_ZEXT]] ; CHECK-NEXT: [[EVL_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_ZEXT]], i64 0 ; CHECK-NEXT: [[EVL_SPLAT:%.*]] = shufflevector [[EVL_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[EVL_SPLAT]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR1]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] ; CHECK: for.cond.cleanup: ; 
CHECK-NEXT: ret void From 3e767a4c7c3ff58a9156d7ec041d76cb7707d40a Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 9 Jan 2025 18:19:40 +0800 Subject: [PATCH 2/4] [RISCV] Allow non-loop invariant steps in RISCVGatherScatterLowering The motivation for this is to allow us to match strided accesses that are emitted from the loop vectorizer with EVL tail folding (see #122232). In these loops the step isn't loop invariant and is based off of @llvm.experimental.get.vector.length. We can relax this as long as we make sure to construct the updates after the definition inside the loop, instead of in the preheader. I presume the restriction was previously added so that the step would dominate the insertion point in the preheader. I can't think of why it wouldn't be safe to calculate it in the loop otherwise. --- .../RISCV/RISCVGatherScatterLowering.cpp | 21 +++-- .../rvv/fixed-vectors-strided-load-store.ll | 4 +- .../CodeGen/RISCV/rvv/strided-load-store.ll | 78 ++++++++----------- 3 files changed, 49 insertions(+), 54 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp index a71e6bbb93638..74ff17637af6c 100644 --- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp @@ -211,10 +211,6 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L, assert(Phi->getIncomingValue(IncrementingBlock) == Inc && "Expected one operand of phi to be Inc"); - // Only proceed if the step is loop invariant. - if (!L->isLoopInvariant(Step)) - return false; - // Step should be a splat. 
Step = getSplatValue(Step); if (!Step) @@ -310,18 +306,31 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L, } case Instruction::Mul: { Start = Builder.CreateMul(Start, SplatOp, "start"); - Step = Builder.CreateMul(Step, SplatOp, "step"); Stride = Builder.CreateMul(Stride, SplatOp, "stride"); break; } case Instruction::Shl: { Start = Builder.CreateShl(Start, SplatOp, "start"); - Step = Builder.CreateShl(Step, SplatOp, "step"); Stride = Builder.CreateShl(Stride, SplatOp, "stride"); break; } } + // Adjust the step value after its definition if it's an instruction. + if (auto *StepI = dyn_cast(Step)) + Builder.SetInsertPoint(*StepI->getInsertionPointAfterDef()); + + switch (BO->getOpcode()) { + default: + break; + case Instruction::Mul: + Step = Builder.CreateMul(Step, SplatOp, "step"); + break; + case Instruction::Shl: + Step = Builder.CreateShl(Step, SplatOp, "step"); + break; + } + Inc->setOperand(StepIndex, Step); BasePtr->setIncomingValue(StartBlock, Start); return true; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll index 83a9b23a387d2..84de566e05dff 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll @@ -320,8 +320,8 @@ for.cond.cleanup: ; preds = %vector.body define void @gather_unknown_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, i64 %shift) { ; CHECK-LABEL: @gather_unknown_pow2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[STEP:%.*]] = shl i64 8, [[SHIFT:%.*]] -; CHECK-NEXT: [[STRIDE:%.*]] = shl i64 1, [[SHIFT]] +; CHECK-NEXT: [[STRIDE:%.*]] = shl i64 1, [[SHIFT:%.*]] +; CHECK-NEXT: [[STEP:%.*]] = shl i64 8, [[SHIFT]] ; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[STRIDE]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll 
b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll index 023372dc19d22..08997785c74f9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll @@ -109,21 +109,20 @@ define @gather_non_invariant_step(ptr %a, ptr %b, i32 %len) { ; CHECK-LABEL: @gather_non_invariant_step( ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.stepvector.nxv1i64() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND]], i32 3 -; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[TMP1]], i32 8, splat (i1 true), undef) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP0]], i64 16, splat (i1 true), i32 [[TMP1]]) +; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.vp.select.nxv1i64( splat (i1 true), [[TMP3]], undef, i32 [[TMP1]]) ; CHECK-NEXT: [[ACCUM_NEXT]] = add [[ACCUM]], [[GATHER]] ; CHECK-NEXT: [[B:%.*]] = getelementptr i64, ptr [[B1:%.*]], i64 [[VEC_IND_SCALAR]] ; CHECK-NEXT: [[STEP:%.*]] = load i64, ptr [[B]], align 8 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[STEP]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, 
i64 [[STEP]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[STEP]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] ; CHECK: for.cond.cleanup: @@ -159,23 +158,21 @@ define @gather_non_invariant_step_shl(ptr %a, ptr %b, i32 %le ; CHECK-LABEL: @gather_non_invariant_step_shl( ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.stepvector.nxv1i64() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 168, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND_ADD:%.*]] = add [[VEC_IND]], splat (i64 42) -; CHECK-NEXT: [[VEC_IND_SHL:%.*]] = shl [[VEC_IND_ADD]], splat (i64 2) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND_SHL]], i32 3 -; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.masked.gather.nxv1i64.nxv1p0( [[TMP1]], i32 8, splat (i1 true), undef) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP0]], i64 64, splat (i1 true), i32 [[TMP1]]) +; CHECK-NEXT: [[GATHER:%.*]] = call 
@llvm.vp.select.nxv1i64( splat (i1 true), [[TMP3]], undef, i32 [[TMP1]]) ; CHECK-NEXT: [[ACCUM_NEXT]] = add [[ACCUM]], [[GATHER]] ; CHECK-NEXT: [[B:%.*]] = getelementptr i64, ptr [[B1:%.*]], i64 [[VEC_IND_SCALAR]] ; CHECK-NEXT: [[STEP:%.*]] = load i64, ptr [[B]], align 8 +; CHECK-NEXT: [[STEP1:%.*]] = shl i64 [[STEP]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[STEP]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[STEP]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[STEP1]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] ; CHECK: for.cond.cleanup: @@ -256,19 +253,17 @@ define void @scatter_non_invariant_step(ptr %a, ptr %b, i32 %len) { ; CHECK-LABEL: @scatter_non_invariant_step( ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.stepvector.nxv1i64() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND]], i32 3 -; CHECK-NEXT: tail call void @llvm.masked.scatter.nxv1i64.nxv1p0( zeroinitializer, [[TMP1]], i32 8, splat (i1 true)) +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = call 
i32 @llvm.vscale.i32() +; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i64( zeroinitializer, ptr [[TMP0]], i64 16, splat (i1 true), i32 [[TMP1]]) ; CHECK-NEXT: [[B:%.*]] = getelementptr i64, ptr [[B1:%.*]], i64 [[VEC_IND_SCALAR]] ; CHECK-NEXT: [[STEP:%.*]] = load i64, ptr [[B]], align 8 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[STEP]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[STEP]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[STEP]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] ; CHECK: for.cond.cleanup: @@ -302,21 +297,18 @@ define void @scatter_non_invariant_step_add_shl(ptr %a, ptr %b, i32 %len) { ; CHECK-LABEL: @scatter_non_invariant_step_add_shl( ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.stepvector.nxv1i64() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND_ADD:%.*]] = add [[VEC_IND]], splat (i64 42) -; CHECK-NEXT: [[VEC_IND_SHL:%.*]] = shl [[VEC_IND_ADD]], splat (i64 2) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND_SHL]], i32 3 -; CHECK-NEXT: tail call void @llvm.masked.scatter.nxv1i64.nxv1p0( zeroinitializer, [[TMP1]], i32 8, splat (i1 true)) +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 168, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], 
[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i64( zeroinitializer, ptr [[TMP0]], i64 64, splat (i1 true), i32 [[TMP1]]) ; CHECK-NEXT: [[B:%.*]] = getelementptr i64, ptr [[B1:%.*]], i64 [[VEC_IND_SCALAR]] ; CHECK-NEXT: [[STEP:%.*]] = load i64, ptr [[B]], align 8 +; CHECK-NEXT: [[STEP1:%.*]] = shl i64 [[STEP]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[STEP]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[STEP]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[STEP1]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[TMP2]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] ; CHECK: for.cond.cleanup: @@ -695,22 +687,19 @@ define @evl_gather(ptr %a, i32 %len) { ; CHECK-LABEL: @evl_gather( ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.stepvector.nxv1i64() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]] ; CHECK-NEXT: 
[[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND]], i32 3 -; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.vp.gather.nxv1i64.nxv1p0( [[TMP1]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3 +; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP0]], i64 16, splat (i1 true), i32 [[EVL]]) ; CHECK-NEXT: [[ACCUM_NEXT]] = add [[ACCUM]], [[GATHER]] ; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[EVL_ZEXT]] -; CHECK-NEXT: [[EVL_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_ZEXT]], i64 0 -; CHECK-NEXT: [[EVL_SPLAT:%.*]] = shufflevector [[EVL_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[EVL_SPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[EVL_ZEXT]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] ; CHECK: for.cond.cleanup: @@ -752,20 +741,17 @@ define void @evl_scatter(ptr %a, i32 %len) { ; CHECK-LABEL: @evl_scatter( ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.stepvector.nxv1i64() ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 
[[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR1]] ; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], [[VEC_IND]], i32 3 -; CHECK-NEXT: tail call void @llvm.vp.scatter.nxv1i64.nxv1p0( zeroinitializer, [[TMP1]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]], i32 3 +; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i64( zeroinitializer, ptr [[TMP0]], i64 16, splat (i1 true), i32 [[EVL]]) ; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add nuw i64 [[VEC_IND_SCALAR1]], [[EVL_ZEXT]] -; CHECK-NEXT: [[EVL_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_ZEXT]], i64 0 -; CHECK-NEXT: [[EVL_SPLAT:%.*]] = shufflevector [[EVL_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[EVL_SPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], [[EVL_ZEXT]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR1]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] ; CHECK: for.cond.cleanup: From 2803322315888117f0b8121cb24b596e45f86baa Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 15 Jan 2025 18:31:02 +0800 Subject: [PATCH 3/4] Add todo about deduplication --- llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp index 74ff17637af6c..e86b5e4c763f3 100644 --- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp @@ -294,6 +294,7 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L, 
BasePtr->getIncomingBlock(StartBlock)->getTerminator()); Builder.SetCurrentDebugLocation(DebugLoc()); + // TODO: Share this switch with matchStridedStart? switch (BO->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); From ec358834c99f0222262d35c5fed3fd24e787cd0d Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 16 Jan 2025 19:24:49 +0800 Subject: [PATCH 4/4] Only adjust the insert point if the step was defined in the loop --- .../RISCV/RISCVGatherScatterLowering.cpp | 5 +- .../CodeGen/RISCV/rvv/strided-load-store.ll | 57 +++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp index e86b5e4c763f3..39c0af7985971 100644 --- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp @@ -317,8 +317,9 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L, } } - // Adjust the step value after its definition if it's an instruction. - if (auto *StepI = dyn_cast<Instruction>(Step)) + // If the Step was defined inside the loop, adjust it after its definition + // instead of in the preheader. + if (auto *StepI = dyn_cast<Instruction>(Step); StepI && L->contains(StepI)) Builder.SetInsertPoint(*StepI->getInsertionPointAfterDef()); switch (BO->getOpcode()) { diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll index 08997785c74f9..45f158f929ca8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll @@ -208,6 +208,63 @@ for.cond.cleanup: ; preds = %vector.body ret %accum.next } +; Check that the operand of the binary op (%scale.splat in shl) always dominates +; the existing step value when we're adjusting it.
+define @gather_splat_op_after_step(ptr %a, ptr %b, i32 %len) { +; CHECK-LABEL: @gather_splat_op_after_step( +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[SCALE:%.*]] = load i64, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[STRIDE:%.*]] = shl i64 1, [[SCALE]] +; CHECK-NEXT: [[STEP:%.*]] = shl i64 [[TMP0]], [[SCALE]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STRIDE]], 16 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP2]], i64 [[TMP1]], splat (i1 true), i32 [[TMP3]]) +; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.vp.select.nxv1i64( splat (i1 true), [[TMP4]], undef, i32 [[TMP3]]) +; CHECK-NEXT: [[ACCUM_NEXT]] = add [[ACCUM]], [[GATHER]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP0]] +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], [[STEP]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret [[ACCUM_NEXT]] +; +vector.ph: + %wide.trip.count = zext i32 %len to i64 + %0 = tail call i64 @llvm.vscale.i64() + %1 = tail call @llvm.stepvector.nxv1i64() + %.splatinsert = insertelement poison, i64 %0, i64 0 + %.splat = shufflevector %.splatinsert, poison, 
zeroinitializer + + %scale = load i64, ptr %b + %scale.head = insertelement poison, i64 %scale, i64 0 + %scale.splat = shufflevector %scale.head, poison, zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] + %accum = phi [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ] + %vec.ind.shl = shl %vec.ind, %scale.splat + %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind.shl, i32 3 + %gather = call @llvm.masked.gather.nxv1i64.nxv1p0( %2, i32 8, splat (i1 true), undef) + %accum.next = add %accum, %gather + %index.next = add nuw i64 %index, %0 + %vec.ind.next = add %vec.ind, %.splat + %3 = icmp ne i64 %index.next, %wide.trip.count + br i1 %3, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret %accum.next +} + define void @scatter(ptr %a, i32 %len) { ; CHECK-LABEL: @scatter( ; CHECK-NEXT: vector.ph: