[AArch64] Fix BE popcount casts. #129879

davemgreen · 2025-03-05T12:22:30Z

A bitcast, being defined as a load and a store, can change the lane order. We need to use an NVCAST instead to keep the lanes out of the VADDV the same in big-endian. Extracting from a v2i64 vector keeps the types of the NVCAST legal, and also allows us to replace a lane mov with a mov 0.

Fixes #129843

A bitcast, being defined as a load and a store, can change the lane order. We need to use an NVCAST instead to keep the lanes out of the VADDV the same in big-endian. Extracting from a v2i64 vector keeps the types of the NVCAST legal, and also allows us to replace a lane mov with a mov 0. Fixes llvm#129843

llvmbot · 2025-03-05T12:23:02Z

@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

A bitcast, being defined as a load and a store, can change the lane order. We need to use an NVCAST instead to keep the lanes out of the VADDV the same in big-endian. Extracting from a v2i64 vector keeps the types of the NVCAST legal, and also allows us to replace a lane mov with a mov 0.

Fixes #129843

Full diff: https://github.com/llvm/llvm-project/pull/129879.diff

4 Files Affected:

(modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+8-2)
(modified) llvm/test/CodeGen/AArch64/arm64-popcnt.ll (+3-5)
(modified) llvm/test/CodeGen/AArch64/parity.ll (+1-1)
(modified) llvm/test/CodeGen/AArch64/popcount.ll (+9-10)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 52ec4753ec4c1..ef5d833c03428 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10788,7 +10788,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
     if (VT == MVT::i32)
       AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
                          DAG.getConstant(0, DL, MVT::i64));
-    AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
+    else
+      AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+                         DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, AddV),
+                         DAG.getConstant(0, DL, MVT::i64));
     if (IsParity)
       AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
     return AddV;
@@ -10797,7 +10800,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
 
     SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
     SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
-    AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
+    AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
+                       DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
+                       DAG.getConstant(0, DL, MVT::i64));
+    AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
     if (IsParity)
       AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
     return AddV;
diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
index 369667ec33f66..d06e42f5405ef 100644
--- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -129,7 +129,6 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
 ; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
 ; CHECK-BE-NEXT:    cnt v0.8b, v0.8b
 ; CHECK-BE-NEXT:    addv b0, v0.8b
-; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
 ; CHECK-BE-NEXT:    fmov x0, d0
 ; CHECK-BE-NEXT:    ret
   %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
@@ -436,9 +435,9 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov d0, x0
 ; CHECK-NEXT:    mov.d v0[1], x1
+; CHECK-NEXT:    mov x1, xzr
 ; CHECK-NEXT:    cnt.16b v0, v0
 ; CHECK-NEXT:    addv.16b b0, v0
-; CHECK-NEXT:    mov.d x1, v0[1]
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
 ;
@@ -481,13 +480,12 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; CHECK-BE-LABEL: cnt128:
 ; CHECK-BE:       // %bb.0:
 ; CHECK-BE-NEXT:    fmov d0, x0
+; CHECK-BE-NEXT:    mov x0, xzr
 ; CHECK-BE-NEXT:    mov v0.d[1], x1
 ; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
 ; CHECK-BE-NEXT:    cnt v0.16b, v0.16b
 ; CHECK-BE-NEXT:    addv b0, v0.16b
-; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
-; CHECK-BE-NEXT:    mov x1, v0.d[1]
-; CHECK-BE-NEXT:    fmov x0, d0
+; CHECK-BE-NEXT:    fmov x1, d0
 ; CHECK-BE-NEXT:    ret
   %cnt = tail call i128 @llvm.ctpop.i128(i128 %x)
   ret i128 %cnt
diff --git a/llvm/test/CodeGen/AArch64/parity.ll b/llvm/test/CodeGen/AArch64/parity.ll
index 1e51793fb5f91..91515277cb3f6 100644
--- a/llvm/test/CodeGen/AArch64/parity.ll
+++ b/llvm/test/CodeGen/AArch64/parity.ll
@@ -159,7 +159,7 @@ define i32 @parity_64_trunc(i64 %x) {
 ; CHECK-NEXT:    fmov d0, x0
 ; CHECK-NEXT:    cnt v0.8b, v0.8b
 ; CHECK-NEXT:    addv b0, v0.8b
-; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index 6cc925f0ae91f..e664e73594923 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -41,8 +41,8 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) {
 ; BE-NEXT:    rev64 v0.16b, v0.16b
 ; BE-NEXT:    cnt v0.16b, v0.16b
 ; BE-NEXT:    addv b0, v0.16b
-; BE-NEXT:    rev32 v0.16b, v0.16b
-; BE-NEXT:    mov w0, v0.s[3]
+; BE-NEXT:    rev64 v0.4s, v0.4s
+; BE-NEXT:    mov w0, v0.s[1]
 ; BE-NEXT:    ret
 ;
 ; GISEL-LABEL: popcount128:
@@ -138,10 +138,10 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
 ; BE-NEXT:    cnt v1.16b, v1.16b
 ; BE-NEXT:    addv b0, v0.16b
 ; BE-NEXT:    addv b1, v1.16b
-; BE-NEXT:    rev32 v0.16b, v0.16b
-; BE-NEXT:    rev32 v1.16b, v1.16b
-; BE-NEXT:    mov w8, v0.s[3]
-; BE-NEXT:    mov w9, v1.s[3]
+; BE-NEXT:    rev64 v0.4s, v0.4s
+; BE-NEXT:    rev64 v1.4s, v1.4s
+; BE-NEXT:    mov w8, v0.s[1]
+; BE-NEXT:    mov w9, v1.s[1]
 ; BE-NEXT:    add w0, w9, w8
 ; BE-NEXT:    ret
 ;
@@ -227,22 +227,21 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
 ; CHECK:       // %bb.0: // %Entry
 ; CHECK-NEXT:    fmov d0, x0
 ; CHECK-NEXT:    mov v0.d[1], x1
+; CHECK-NEXT:    mov x1, xzr
 ; CHECK-NEXT:    cnt v0.16b, v0.16b
 ; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    mov x1, v0.d[1]
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
 ;
 ; BE-LABEL: popcount1x128:
 ; BE:       // %bb.0: // %Entry
 ; BE-NEXT:    fmov d0, x0
+; BE-NEXT:    mov x0, xzr
 ; BE-NEXT:    mov v0.d[1], x1
 ; BE-NEXT:    rev64 v0.16b, v0.16b
 ; BE-NEXT:    cnt v0.16b, v0.16b
 ; BE-NEXT:    addv b0, v0.16b
-; BE-NEXT:    rev64 v0.16b, v0.16b
-; BE-NEXT:    mov x1, v0.d[1]
-; BE-NEXT:    fmov x0, d0
+; BE-NEXT:    fmov x1, d0
 ; BE-NEXT:    ret
 ;
 ; GISEL-LABEL: popcount1x128:

davemgreen · 2025-03-05T12:23:09Z

@usha1830 FYI - I wasn't able to add you as a reviewer yet.

usha1830

Looks good to me. Thanks!

nasherm

LGTM

alexrp

Not familiar with this code but it seems sensible at first glance. Thanks!

A bitcast, being defined as a load and a store, can change the lane order. We need to use a NVCAST instead to keep the lanes out of the VADDV the same in big-endian. The extracting from a v2i64 vector is to keep the types of the nvcast legal, but also allow us to replace a lane mov with a mov 0. Fixes llvm#129843 (cherry picked from commit ab811e7)

A bitcast, being defined as a load and a store, can change the lane order. We need to use a NVCAST instead to keep the lanes out of the VADDV the same in big-endian. The extracting from a v2i64 vector is to keep the types of the nvcast legal, but also allow us to replace a lane mov with a mov 0. Fixes llvm#129843

davemgreen requested review from alexrp, c-rhodes, nasherm and david-arm March 5, 2025 12:22

llvmbot added the backend:AArch64 label Mar 5, 2025

usha1830 approved these changes Mar 5, 2025

View reviewed changes

nasherm approved these changes Mar 5, 2025

View reviewed changes

alexrp approved these changes Mar 5, 2025

View reviewed changes

davemgreen merged commit ab811e7 into llvm:main Mar 5, 2025
13 checks passed

davemgreen deleted the gh-a64-bepopcount branch March 5, 2025 20:08

This comment was marked as outdated.

Sign in to view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[AArch64] Fix BE popcount casts. #129879

[AArch64] Fix BE popcount casts. #129879

davemgreen commented Mar 5, 2025

llvmbot commented Mar 5, 2025

davemgreen commented Mar 5, 2025

usha1830 left a comment

nasherm left a comment

alexrp left a comment

This comment was marked as outdated.

[AArch64] Fix BE popcount casts. #129879

[AArch64] Fix BE popcount casts. #129879

Conversation

davemgreen commented Mar 5, 2025

llvmbot commented Mar 5, 2025

davemgreen commented Mar 5, 2025

usha1830 left a comment

Choose a reason for hiding this comment

nasherm left a comment

Choose a reason for hiding this comment

alexrp left a comment

Choose a reason for hiding this comment

This comment was marked as outdated.