-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[AArch64] Improve vector funnel shift by constant costs. #130044
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
We now have better codegen, and can have better costs to match. The generated code should now produce a shl+usra and can be seen in testcases such as: https://github.com/llvm/llvm-project/blob/7e5821bae80db3f3f0fe0d5f8ce62f79e548eed5/llvm/test/CodeGen/AArch64/fsh.ll#L3941.
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-analysis Author: David Green (davemgreen) ChangesWe now have better codegen, and can have better costs to match. The generated code should now produce a shl+usra and can be seen in testcases such as: llvm-project/llvm/test/CodeGen/AArch64/fsh.ll Line 3941 in 7e5821b
Full diff: https://github.com/llvm/llvm-project/pull/130044.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ba019e1a4ecd5..53c6a029e9287 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -884,12 +884,11 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
const auto LegalisationCost = getTypeLegalizationCost(RetTy);
if (OpInfoZ.isUniform()) {
- // FIXME: The costs could be lower if the codegen is better.
static const CostTblEntry FshlTbl[] = {
- {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
- {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
- {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
- {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
+ {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
+ {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
+ {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
+ {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
// Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
// to avoid having to duplicate the costs.
const auto *Entry =
diff --git a/llvm/test/Analysis/CostModel/AArch64/fshl.ll b/llvm/test/Analysis/CostModel/AArch64/fshl.ll
index 8c6466ab470b4..c59b41157c304 100644
--- a/llvm/test/Analysis/CostModel/AArch64/fshl.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fshl.ll
@@ -129,11 +129,11 @@ declare i19 @llvm.fshl.i19(i19, i19, i19)
define <16 x i8> @fshl_v16i8_3rd_arg_vec_const_all_lanes_same(<16 x i8> %a, <16 x i8> %b) {
; RECIP-LABEL: 'fshl_v16i8_3rd_arg_vec_const_all_lanes_same'
-; RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3))
+; RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3))
; RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %fshl
;
; SIZE-LABEL: 'fshl_v16i8_3rd_arg_vec_const_all_lanes_same'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3))
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3))
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %fshl
;
entry:
@@ -173,11 +173,11 @@ declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
define <8 x i16> @fshl_v8i16_3rd_arg_vec_const_all_lanes_same(<8 x i16> %a, <8 x i16> %b) {
; RECIP-LABEL: 'fshl_v8i16_3rd_arg_vec_const_all_lanes_same'
-; RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3))
+; RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3))
; RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %fshl
;
; SIZE-LABEL: 'fshl_v8i16_3rd_arg_vec_const_all_lanes_same'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3))
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3))
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %fshl
;
entry:
@@ -217,11 +217,11 @@ declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
define <4 x i32> @fshl_v4i32_3rd_arg_vec_const_all_lanes_same(<4 x i32> %a, <4 x i32> %b) {
; RECIP-LABEL: 'fshl_v4i32_3rd_arg_vec_const_all_lanes_same'
-; RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3))
+; RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3))
; RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %fshl
;
; SIZE-LABEL: 'fshl_v4i32_3rd_arg_vec_const_all_lanes_same'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3))
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3))
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %fshl
;
entry:
@@ -261,11 +261,11 @@ declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
define <2 x i64> @fshl_v2i64_3rd_arg_vec_const_all_lanes_same(<2 x i64> %a, <2 x i64> %b) {
; RECIP-LABEL: 'fshl_v2i64_3rd_arg_vec_const_all_lanes_same'
-; RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1))
+; RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1))
; RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %fshl
;
; SIZE-LABEL: 'fshl_v2i64_3rd_arg_vec_const_all_lanes_same'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1))
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1))
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %fshl
;
entry:
diff --git a/llvm/test/Analysis/CostModel/AArch64/fshr.ll b/llvm/test/Analysis/CostModel/AArch64/fshr.ll
index 120b1c4c4c4ef..e4cea1ddf77a0 100644
--- a/llvm/test/Analysis/CostModel/AArch64/fshr.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fshr.ll
@@ -129,11 +129,11 @@ declare i19 @llvm.fshr.i19(i19, i19, i19)
define <16 x i8> @fshr_v16i8_3rd_arg_vec_const_all_lanes_same(<16 x i8> %a, <16 x i8> %b) {
; RECIP-LABEL: 'fshr_v16i8_3rd_arg_vec_const_all_lanes_same'
-; RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3))
+; RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3))
; RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %fshr
;
; SIZE-LABEL: 'fshr_v16i8_3rd_arg_vec_const_all_lanes_same'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3))
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3))
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %fshr
;
entry:
@@ -173,11 +173,11 @@ declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
define <8 x i16> @fshr_v8i16_3rd_arg_vec_const_all_lanes_same(<8 x i16> %a, <8 x i16> %b) {
; RECIP-LABEL: 'fshr_v8i16_3rd_arg_vec_const_all_lanes_same'
-; RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3))
+; RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3))
; RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %fshr
;
; SIZE-LABEL: 'fshr_v8i16_3rd_arg_vec_const_all_lanes_same'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3))
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3))
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %fshr
;
entry:
@@ -217,11 +217,11 @@ declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
define <4 x i32> @fshr_v4i32_3rd_arg_vec_const_all_lanes_same(<4 x i32> %a, <4 x i32> %b) {
; RECIP-LABEL: 'fshr_v4i32_3rd_arg_vec_const_all_lanes_same'
-; RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3))
+; RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3))
; RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %fshr
;
; SIZE-LABEL: 'fshr_v4i32_3rd_arg_vec_const_all_lanes_same'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3))
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3))
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %fshr
;
entry:
@@ -261,11 +261,11 @@ declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
define <2 x i64> @fshr_v2i64_3rd_arg_vec_const_all_lanes_same(<2 x i64> %a, <2 x i64> %b) {
; RECIP-LABEL: 'fshr_v2i64_3rd_arg_vec_const_all_lanes_same'
-; RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1))
+; RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1))
; RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %fshr
;
; SIZE-LABEL: 'fshr_v2i64_3rd_arg_vec_const_all_lanes_same'
-; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1))
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1))
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %fshr
;
entry:
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
We now have better codegen, and can have better costs to match. The generated code should now produce a shl+usra and can be seen in testcases such as:
llvm-project/llvm/test/CodeGen/AArch64/fsh.ll
Line 3941 in 7e5821b