-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[AArch64] Fix BE popcount casts. #129879
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Fix BE popcount casts. #129879
Conversation
A bitcast, being defined as a load and a store, can change the lane order. We need to use a NVCAST instead to keep the lanes out of the VADDV the same in big-endian. The extracting from a v2i64 vector is to keep the types of the nvcast legal, but also allow us to replace a lane mov with a mov 0. Fixes llvm#129843
@llvm/pr-subscribers-backend-aarch64 Author: David Green (davemgreen) Changes: A bitcast, being defined as a load and a store, can change the lane order. We need to use a NVCAST instead to keep the lanes out of the VADDV the same in big-endian. The extracting from a v2i64 vector is to keep the types of the nvcast legal, but also allow us to replace a lane mov with a mov 0. Fixes #129843 Full diff: https://github.com/llvm/llvm-project/pull/129879.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 52ec4753ec4c1..ef5d833c03428 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10788,7 +10788,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
if (VT == MVT::i32)
AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
DAG.getConstant(0, DL, MVT::i64));
- AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
+ else
+ AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, AddV),
+ DAG.getConstant(0, DL, MVT::i64));
if (IsParity)
AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
return AddV;
@@ -10797,7 +10800,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
- AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
+ AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
+ DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
+ DAG.getConstant(0, DL, MVT::i64));
+ AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
if (IsParity)
AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
return AddV;
diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
index 369667ec33f66..d06e42f5405ef 100644
--- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -129,7 +129,6 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
; CHECK-BE-NEXT: rev64 v0.8b, v0.8b
; CHECK-BE-NEXT: cnt v0.8b, v0.8b
; CHECK-BE-NEXT: addv b0, v0.8b
-; CHECK-BE-NEXT: rev64 v0.8b, v0.8b
; CHECK-BE-NEXT: fmov x0, d0
; CHECK-BE-NEXT: ret
%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
@@ -436,9 +435,9 @@ define i128 @cnt128(i128 %x) nounwind readnone {
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: mov.d v0[1], x1
+; CHECK-NEXT: mov x1, xzr
; CHECK-NEXT: cnt.16b v0, v0
; CHECK-NEXT: addv.16b b0, v0
-; CHECK-NEXT: mov.d x1, v0[1]
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
;
@@ -481,13 +480,12 @@ define i128 @cnt128(i128 %x) nounwind readnone {
; CHECK-BE-LABEL: cnt128:
; CHECK-BE: // %bb.0:
; CHECK-BE-NEXT: fmov d0, x0
+; CHECK-BE-NEXT: mov x0, xzr
; CHECK-BE-NEXT: mov v0.d[1], x1
; CHECK-BE-NEXT: rev64 v0.16b, v0.16b
; CHECK-BE-NEXT: cnt v0.16b, v0.16b
; CHECK-BE-NEXT: addv b0, v0.16b
-; CHECK-BE-NEXT: rev64 v0.16b, v0.16b
-; CHECK-BE-NEXT: mov x1, v0.d[1]
-; CHECK-BE-NEXT: fmov x0, d0
+; CHECK-BE-NEXT: fmov x1, d0
; CHECK-BE-NEXT: ret
%cnt = tail call i128 @llvm.ctpop.i128(i128 %x)
ret i128 %cnt
diff --git a/llvm/test/CodeGen/AArch64/parity.ll b/llvm/test/CodeGen/AArch64/parity.ll
index 1e51793fb5f91..91515277cb3f6 100644
--- a/llvm/test/CodeGen/AArch64/parity.ll
+++ b/llvm/test/CodeGen/AArch64/parity.ll
@@ -159,7 +159,7 @@ define i32 @parity_64_trunc(i64 %x) {
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: cnt v0.8b, v0.8b
; CHECK-NEXT: addv b0, v0.8b
-; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index 6cc925f0ae91f..e664e73594923 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -41,8 +41,8 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) {
; BE-NEXT: rev64 v0.16b, v0.16b
; BE-NEXT: cnt v0.16b, v0.16b
; BE-NEXT: addv b0, v0.16b
-; BE-NEXT: rev32 v0.16b, v0.16b
-; BE-NEXT: mov w0, v0.s[3]
+; BE-NEXT: rev64 v0.4s, v0.4s
+; BE-NEXT: mov w0, v0.s[1]
; BE-NEXT: ret
;
; GISEL-LABEL: popcount128:
@@ -138,10 +138,10 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
; BE-NEXT: cnt v1.16b, v1.16b
; BE-NEXT: addv b0, v0.16b
; BE-NEXT: addv b1, v1.16b
-; BE-NEXT: rev32 v0.16b, v0.16b
-; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: mov w8, v0.s[3]
-; BE-NEXT: mov w9, v1.s[3]
+; BE-NEXT: rev64 v0.4s, v0.4s
+; BE-NEXT: rev64 v1.4s, v1.4s
+; BE-NEXT: mov w8, v0.s[1]
+; BE-NEXT: mov w9, v1.s[1]
; BE-NEXT: add w0, w9, w8
; BE-NEXT: ret
;
@@ -227,22 +227,21 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
; CHECK: // %bb.0: // %Entry
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: mov v0.d[1], x1
+; CHECK-NEXT: mov x1, xzr
; CHECK-NEXT: cnt v0.16b, v0.16b
; CHECK-NEXT: addv b0, v0.16b
-; CHECK-NEXT: mov x1, v0.d[1]
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
;
; BE-LABEL: popcount1x128:
; BE: // %bb.0: // %Entry
; BE-NEXT: fmov d0, x0
+; BE-NEXT: mov x0, xzr
; BE-NEXT: mov v0.d[1], x1
; BE-NEXT: rev64 v0.16b, v0.16b
; BE-NEXT: cnt v0.16b, v0.16b
; BE-NEXT: addv b0, v0.16b
-; BE-NEXT: rev64 v0.16b, v0.16b
-; BE-NEXT: mov x1, v0.d[1]
-; BE-NEXT: fmov x0, d0
+; BE-NEXT: fmov x1, d0
; BE-NEXT: ret
;
; GISEL-LABEL: popcount1x128:
|
@usha1830 FYI - I wasn't able to add you as a reviewer yet. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good to me. Thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not familiar with this code but it seems sensible at first glance. Thanks!
This comment was marked as outdated.
This comment was marked as outdated.
A bitcast, being defined as a load and a store, can change the lane order. We need to use a NVCAST instead to keep the lanes out of the VADDV the same in big-endian. The extracting from a v2i64 vector is to keep the types of the nvcast legal, but also allow us to replace a lane mov with a mov 0. Fixes llvm#129843 (cherry picked from commit ab811e7)
A bitcast, being defined as a load and a store, can change the lane order. We need to use a NVCAST instead to keep the lanes out of the VADDV the same in big-endian. The extracting from a v2i64 vector is to keep the types of the nvcast legal, but also allow us to replace a lane mov with a mov 0. Fixes llvm#129843
A bitcast, being defined as a load and a store, can change the lane order. We need to use a NVCAST instead to keep the lanes out of the VADDV the same in big-endian. The extracting from a v2i64 vector is to keep the types of the nvcast legal, but also allow us to replace a lane mov with a mov 0.
Fixes #129843