-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[RISCV] Sink splatted fpext operands #125800
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
We sink splatted operands in codegenprepare to help match .vx/.vf patterns. This extends it to also sink any fpext so that we can match widening vfwadd.vf/vfwadd.wf patterns too. Some instructions don't have .wf forms so there's no benefit to sinking the fpext. For simplicity this sinks them anyway and lets early machine-licm hoist them back out.
@llvm/pr-subscribers-backend-risc-v Author: Luke Lau (lukel97) Changes: We sink splatted operands in codegenprepare to help match .vx/.vf patterns. This extends it to also sink any fpext so that we can match widening vfwadd.vf/vfwadd.wf patterns too. Some instructions don't have .wf forms so there's no benefit to sinking the fpext. For simplicity this sinks them anyway and lets early machine-licm hoist them back out. Full diff: https://github.com/llvm/llvm-project/pull/125800.diff 2 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index fa7c7c57be25724..6799130bde130c7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2712,7 +2712,12 @@ bool RISCVTTIImpl::isProfitableToSinkOperands(
return false;
}
- Ops.push_back(&Op->getOperandUse(0));
+ Use *InsertEltUse = &Op->getOperandUse(0);
+ // Sink any fpexts since they might be used in a widening fp pattern.
+ auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
+ if (isa<FPExtInst>(InsertElt->getOperand(1)))
+ Ops.push_back(&InsertElt->getOperandUse(1));
+ Ops.push_back(InsertEltUse);
Ops.push_back(&OpIdx.value());
}
return true;
diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
index 735621aa4390e39..1948675ae9cf0a7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
@@ -5741,3 +5741,152 @@ vector.body:
for.cond.cleanup:
ret void
}
+
+define void @sink_splat_vfwadd_vf(ptr nocapture %a, ptr nocapture %b, float %f) {
+; CHECK-LABEL: sink_splat_vfwadd_vf:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: li a2, 1020
+; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT: .LBB125_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vl1re32.v v8, (a0)
+; CHECK-NEXT: addi a1, a1, 4
+; CHECK-NEXT: addi a2, a2, -4
+; CHECK-NEXT: vfwadd.vf v10, v8, fa0
+; CHECK-NEXT: vs2r.v v10, (a0)
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: j .LBB125_1
+entry:
+ %f.ext = fpext float %f to double
+ %broadcast.splatinsert = insertelement <vscale x 2 x double> poison, double %f.ext, i32 0
+ %broadcast.splat = shufflevector <vscale x 2 x double> %broadcast.splatinsert, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %0 = getelementptr float, ptr %a, i64 %index
+ %wide.load = load <vscale x 2 x float>, ptr %0
+ %ext = fpext <vscale x 2 x float> %wide.load to <vscale x 2 x double>
+ %1 = fadd <vscale x 2 x double> %ext, %broadcast.splat
+ %2 = getelementptr double, ptr %b, i64 %index
+ store <vscale x 2 x double> %1, ptr %0
+ %index.next = add i64 %index, 4
+ %3 = icmp eq i64 %index.next, 1024
+ br i1 %3, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret void
+}
+
+define void @sink_splat_vfwadd_wf(ptr nocapture %a, ptr nocapture %b, float %f) {
+; CHECK-LABEL: sink_splat_vfwadd_wf:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: li a2, 1020
+; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT: .LBB126_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vl2re64.v v8, (a0)
+; CHECK-NEXT: addi a1, a1, 4
+; CHECK-NEXT: addi a2, a2, -4
+; CHECK-NEXT: vfwadd.wf v8, v8, fa0
+; CHECK-NEXT: vs2r.v v8, (a0)
+; CHECK-NEXT: addi a0, a0, 32
+; CHECK-NEXT: j .LBB126_1
+entry:
+ %f.ext = fpext float %f to double
+ %broadcast.splatinsert = insertelement <vscale x 2 x double> poison, double %f.ext, i32 0
+ %broadcast.splat = shufflevector <vscale x 2 x double> %broadcast.splatinsert, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %0 = getelementptr double, ptr %a, i64 %index
+ %wide.load = load <vscale x 2 x double>, ptr %0
+ %1 = fadd <vscale x 2 x double> %wide.load, %broadcast.splat
+ %2 = getelementptr double, ptr %b, i64 %index
+ store <vscale x 2 x double> %1, ptr %0
+ %index.next = add i64 %index, 4
+ %3 = icmp eq i64 %index.next, 1024
+ br i1 %3, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret void
+}
+
+define void @sink_splat_vfwmul_vf(ptr nocapture %a, ptr nocapture %b, float %f) {
+; CHECK-LABEL: sink_splat_vfwmul_vf:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: li a2, 1020
+; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT: .LBB127_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vl1re32.v v8, (a0)
+; CHECK-NEXT: addi a1, a1, 4
+; CHECK-NEXT: addi a2, a2, -4
+; CHECK-NEXT: vfwmul.vf v10, v8, fa0
+; CHECK-NEXT: vs2r.v v10, (a0)
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: j .LBB127_1
+entry:
+ %f.ext = fpext float %f to double
+ %broadcast.splatinsert = insertelement <vscale x 2 x double> poison, double %f.ext, i32 0
+ %broadcast.splat = shufflevector <vscale x 2 x double> %broadcast.splatinsert, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %0 = getelementptr float, ptr %a, i64 %index
+ %wide.load = load <vscale x 2 x float>, ptr %0
+ %ext = fpext <vscale x 2 x float> %wide.load to <vscale x 2 x double>
+ %1 = fmul <vscale x 2 x double> %ext, %broadcast.splat
+ %2 = getelementptr double, ptr %b, i64 %index
+ store <vscale x 2 x double> %1, ptr %0
+ %index.next = add i64 %index, 4
+ %3 = icmp eq i64 %index.next, 1024
+ br i1 %3, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret void
+}
+
+; Even though there's no vfwmul.wf we'll sink the fcvt.d.s. Make sure
+; early machine-licm undoes the sink after isel.
+define void @sink_splat_vfwmul_wf(ptr nocapture %a, ptr nocapture %b, float %f) {
+; CHECK-LABEL: sink_splat_vfwmul_wf:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: li a2, 1020
+; CHECK-NEXT: fcvt.d.s fa5, fa0
+; CHECK-NEXT: vsetvli a3, zero, e64, m2, ta, ma
+; CHECK-NEXT: .LBB128_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vl2re64.v v8, (a0)
+; CHECK-NEXT: addi a1, a1, 4
+; CHECK-NEXT: addi a2, a2, -4
+; CHECK-NEXT: vfmul.vf v8, v8, fa5
+; CHECK-NEXT: vs2r.v v8, (a0)
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: j .LBB128_1
+entry:
+ %f.ext = fpext float %f to double
+ %broadcast.splatinsert = insertelement <vscale x 2 x double> poison, double %f.ext, i32 0
+ %broadcast.splat = shufflevector <vscale x 2 x double> %broadcast.splatinsert, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %0 = getelementptr float, ptr %a, i64 %index
+ %wide.load = load <vscale x 2 x double>, ptr %0
+ %1 = fmul <vscale x 2 x double> %wide.load, %broadcast.splat
+ %2 = getelementptr double, ptr %b, i64 %index
+ store <vscale x 2 x double> %1, ptr %0
+ %index.next = add i64 %index, 4
+ %3 = icmp eq i64 %index.next, 1024
+ br i1 %3, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret void
+}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
We sink splatted operands in codegenprepare to help match .vx/.vf patterns. This extends it to also sink any fpext so that we can match widening vfwadd.vf/vfwadd.wf patterns too. Some instructions don't have .wf forms so there's no benefit to sinking the fpext. For simplicity this sinks them anyway and lets early machine-licm hoist them back out.
We sink splatted operands in codegenprepare to help match .vx/.vf patterns. This extends it to also sink any fpext so that we can match widening vfwadd.vf/vfwadd.wf patterns too.
Some instructions don't have .wf forms so there's no benefit to sinking the fpext. For simplicity this sinks them anyway and lets early machine-licm hoist them back out.