From 24780e80a2f1e9ae1d7979dcd15f2894040f702c Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Mon, 29 Jul 2024 15:22:15 +0200 Subject: [PATCH 1/9] Add AArch64 lowering for @llvm.experimental.vector.compress --- .../SelectionDAG/LegalizeVectorTypes.cpp | 62 +++++++- .../Target/AArch64/AArch64ISelLowering.cpp | 148 ++++++++++++++++++ llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 + 3 files changed, 206 insertions(+), 6 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 5672b611234b8..f8981255f8dd6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2408,11 +2408,61 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo, SDValue &Hi) { // This is not "trivial", as there is a dependency between the two subvectors. // Depending on the number of 1s in the mask, the elements from the Hi vector - // need to be moved to the Lo vector. So we just perform this as one "big" - // operation and then extract the Lo and Hi vectors from that. This gets rid - // of VECTOR_COMPRESS and all other operands can be legalized later. - SDValue Compressed = TLI.expandVECTOR_COMPRESS(N, DAG); - std::tie(Lo, Hi) = DAG.SplitVector(Compressed, SDLoc(N)); + // need to be moved to the Lo vector. Passthru values make this even harder. + // We try to use MASKED_COMPRESS if the target has custom lowering with + // smaller types and passthru is undef, as it is most likely faster than the + // fully expand path. Otherwise, just do the full expansion as one "big" + // operation and then extract the Lo and Hi vectors from that. This gets + // rid of MASKED_COMPRESS and all other operands can be legalized later. + SDLoc DL(N); + EVT VecVT = N->getValueType(0); + + auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT); + bool HasLegalOrCustom = false; + EVT CheckVT = LoVT; + while (CheckVT.getVectorMinNumElements() > 1) { + if (TLI.isOperationLegalOrCustom(ISD::VECTOR_COMPRESS, CheckVT)) { + HasLegalOrCustom = true; + break; + } + CheckVT = CheckVT.getHalfNumVectorElementsVT(*DAG.getContext()); + } + + SDValue Passthru = N->getOperand(2); + if (!HasLegalOrCustom || !Passthru.isUndef()) { + SDValue Compressed = TLI.expandVECTOR_COMPRESS(N, DAG); + std::tie(Lo, Hi) = DAG.SplitVector(Compressed, DL, LoVT, HiVT); + return; + } + + // Try to VECTOR_COMPRESS smaller vectors and combine via a stack store+load. + SDValue LoMask, HiMask; + std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); + std::tie(LoMask, HiMask) = SplitMask(N->getOperand(1)); + + SDValue UndefPassthru = DAG.getUNDEF(LoVT); + Lo = DAG.getNode(ISD::VECTOR_COMPRESS, DL, LoVT, Lo, LoMask, UndefPassthru); + Hi = DAG.getNode(ISD::VECTOR_COMPRESS, DL, HiVT, Hi, HiMask, UndefPassthru); + + SDValue StackPtr = DAG.CreateStackTemporary( + VecVT.getStoreSize(), DAG.getReducedAlign(VecVT, /*UseABI=*/false)); + MachineFunction &MF = DAG.getMachineFunction(); + MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack( + MF, cast(StackPtr.getNode())->getIndex()); + + // We store LoVec and then insert HiVec starting at offset=|1s| in LoMask. 
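+  // (|1s| is the number of set bits in LoMask; the code below computes it by
+  // zero-extending LoMask and taking a VECREDUCE_ADD over the result.)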
+ SDValue WideMask = + DAG.getNode(ISD::ZERO_EXTEND, DL, LoMask.getValueType(), LoMask); + SDValue Offset = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, WideMask); + Offset = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Offset); + + SDValue Chain = DAG.getEntryNode(); + Chain = DAG.getStore(Chain, DL, Lo, StackPtr, PtrInfo); + Chain = DAG.getStore(Chain, DL, Hi, Offset, + MachinePointerInfo::getUnknownStack(MF)); + + SDValue Compressed = DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo); + std::tie(Lo, Hi) = DAG.SplitVector(Compressed, DL); } void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) { @@ -5784,7 +5834,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_COMPRESS(SDNode *N) { TLI.getTypeToTransformTo(*DAG.getContext(), Vec.getValueType()); EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), Mask.getValueType().getVectorElementType(), - WideVecVT.getVectorNumElements()); + WideVecVT.getVectorElementCount()); SDValue WideVec = ModifyToType(Vec, WideVecVT); SDValue WideMask = ModifyToType(Mask, WideMaskVT, /*FillWithZeroes=*/true); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 1e9da9b819bdd..8836674999a0f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1535,6 +1535,23 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } + // We can lower types that have elements to svcompact and + // legal i8/i16 types via a compressing store. + for (auto VT : + {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32, + MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32, + MVT::nxv8i8, MVT::nxv8i16, MVT::nxv16i8}) + setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom); + + // If we have SVE, we can use SVE logic for legal (or smaller than legal) + // NEON vectors in the lowest bits of the SVE register. + if (Subtarget->hasSVE()) + for (auto VT : {MVT::v1i8, MVT::v1i16, MVT::v1i32, MVT::v1i64, MVT::v1f32, + MVT::v1f64, MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, + MVT::v2f32, MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, + MVT::v4f32, MVT::v8i8, MVT::v8i16, MVT::v8i16, MVT::v16i8}) + setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom); + // NEON doesn't support masked loads/stores, but SME and SVE do. for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64, @@ -6615,6 +6632,131 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, return DAG.getMergeValues({Ext, Chain}, DL); } +SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Vec = Op.getOperand(0); + SDValue Mask = Op.getOperand(1); + SDValue Passthru = Op.getOperand(2); + EVT VecVT = Vec.getValueType(); + EVT MaskVT = Mask.getValueType(); + EVT ElmtVT = VecVT.getVectorElementType(); + const bool IsFixedLength = VecVT.isFixedLengthVector(); + const bool HasPassthru = !Passthru.isUndef(); + unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue(); + EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts); + + assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector."); + + if (!Subtarget->hasSVE()) + return SDValue(); + + if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128) + return SDValue(); + + // We can use the SVE register containing the NEON vector in its lowest bits. 
+ if (IsFixedLength) { + EVT ScalableVecVT = + MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts); + EVT ScalableMaskVT = MVT::getScalableVectorVT( + MaskVT.getVectorElementType().getSimpleVT(), MinElmts); + + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT, + DAG.getUNDEF(ScalableVecVT), Vec, + DAG.getConstant(0, DL, MVT::i64)); + Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT, + DAG.getUNDEF(ScalableMaskVT), Mask, + DAG.getConstant(0, DL, MVT::i64)); + Mask = DAG.getNode(ISD::TRUNCATE, DL, + ScalableMaskVT.changeVectorElementType(MVT::i1), Mask); + Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT, + DAG.getUNDEF(ScalableVecVT), Passthru, + DAG.getConstant(0, DL, MVT::i64)); + + VecVT = Vec.getValueType(); + MaskVT = Mask.getValueType(); + } + + // Special case where we can't use svcompact but can do a compressing store + // and then reload the vector. + if (VecVT == MVT::nxv8i8 || VecVT == MVT::nxv16i8 || VecVT == MVT::nxv8i16) { + SDValue StackPtr = DAG.CreateStackTemporary(VecVT); + int FI = cast(StackPtr.getNode())->getIndex(); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); + + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + PtrInfo, MachineMemOperand::Flags::MOStore, + LocationSize::precise(VecVT.getStoreSize()), + DAG.getReducedAlign(VecVT, /*UseABI=*/false)); + + SDValue Chain = DAG.getEntryNode(); + if (HasPassthru) + Chain = DAG.getStore(Chain, DL, Passthru, StackPtr, PtrInfo); + + Chain = DAG.getMaskedStore(Chain, DL, Vec, StackPtr, DAG.getUNDEF(MVT::i64), + Mask, VecVT, MMO, ISD::UNINDEXED, /*IsTruncating=*/false, /*IsCompressing=*/true); + + SDValue Compressed = DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo); + + if (IsFixedLength) + Compressed = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FixedVecVT, + Compressed, DAG.getConstant(0, DL, MVT::i64)); + + return Compressed; + } + + // Only supported for svcompact. + if (MinElmts != 2 && MinElmts != 4) + return SDValue(); + + // Get legal type for svcompact instruction + EVT ContainerVT = getSVEContainerType(VecVT); + EVT CastVT = VecVT.changeVectorElementTypeToInteger(); + + // Convert to i32 or i64 for smaller types, as these are the only supported + // sizes for svcompact. + if (ContainerVT != VecVT) { + Vec = DAG.getBitcast(CastVT, Vec); + Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec); + } + + SDValue Compressed = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(), + DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec); + + // svcompact fills with 0s, so if our passthru is all 0s, do nothing here. + if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) { + SDValue Offset = DAG.getNode( + ISD::ZERO_EXTEND, DL, MaskVT.changeVectorElementType(MVT::i32), Mask); + Offset = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, Offset); + Compressed = + DAG.getNode(ISD::VP_MERGE, DL, VecVT, + DAG.getSplatVector(MaskVT, DL, + DAG.getAllOnesConstant( + DL, MaskVT.getVectorElementType())), + Compressed, Passthru, Offset); + } + + // Extracting from a legal SVE type before truncating produces better code. + if (IsFixedLength) { + Compressed = DAG.getNode( + ISD::EXTRACT_SUBVECTOR, DL, + FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()), + Compressed, DAG.getConstant(0, DL, MVT::i64)); + CastVT = FixedVecVT.changeVectorElementTypeToInteger(); + VecVT = FixedVecVT; + } + + // If we changed the element type before, we need to convert it back. 
+ if (ContainerVT != VecVT) { + Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed); + Compressed = DAG.getBitcast(VecVT, Compressed); + } + + return Compressed; +} + // Generate SUBS and CSEL for integer abs. SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); @@ -6995,6 +7137,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::VSCALE: return LowerVSCALE(Op, DAG); + case ISD::VECTOR_COMPRESS: + return LowerVECTOR_COMPRESS(Op, DAG); case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: @@ -26214,6 +26358,10 @@ void AArch64TargetLowering::ReplaceNodeResults( case ISD::VECREDUCE_UMIN: Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); return; + case ISD::VECTOR_COMPRESS: + if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG)) + Results.push_back(Res); + return; case ISD::ADD: case ISD::FADD: ReplaceAddWithADDP(N, Results, DAG, Subtarget); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 81e15185f985d..517b1ba1fd400 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1073,6 +1073,8 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECTOR_COMPRESS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; From a01b7789dc74fc61094f40a33a2ac505348ec463 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Mon, 29 Jul 2024 15:27:44 +0200 Subject: [PATCH 2/9] Fix typo --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index f8981255f8dd6..b42a54a56cfed 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2409,11 +2409,11 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo, // This is not "trivial", as there is a dependency between the two subvectors. // Depending on the number of 1s in the mask, the elements from the Hi vector // need to be moved to the Lo vector. Passthru values make this even harder. - // We try to use MASKED_COMPRESS if the target has custom lowering with + // We try to use VECTOR_COMPRESS if the target has custom lowering with // smaller types and passthru is undef, as it is most likely faster than the // fully expand path. Otherwise, just do the full expansion as one "big" // operation and then extract the Lo and Hi vectors from that. This gets - // rid of MASKED_COMPRESS and all other operands can be legalized later. + // rid of VECTOR_COMPRESS and all other operands can be legalized later. 
SDLoc DL(N); EVT VecVT = N->getValueType(0); From 7c11026f1adb53c86f0590b1d022e8d7c35cd239 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Mon, 29 Jul 2024 15:30:02 +0200 Subject: [PATCH 3/9] Fix formatting --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8836674999a0f..b76641e9df8bf 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1546,10 +1546,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // If we have SVE, we can use SVE logic for legal (or smaller than legal) // NEON vectors in the lowest bits of the SVE register. if (Subtarget->hasSVE()) - for (auto VT : {MVT::v1i8, MVT::v1i16, MVT::v1i32, MVT::v1i64, MVT::v1f32, - MVT::v1f64, MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, - MVT::v2f32, MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, - MVT::v4f32, MVT::v8i8, MVT::v8i16, MVT::v8i16, MVT::v16i8}) + for (auto VT : + {MVT::v1i8, MVT::v1i16, MVT::v1i32, MVT::v1i64, MVT::v1f32, + MVT::v1f64, MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, + MVT::v2f32, MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, + MVT::v4f32, MVT::v8i8, MVT::v8i16, MVT::v8i16, MVT::v16i8}) setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom); // NEON doesn't support masked loads/stores, but SME and SVE do. @@ -6695,7 +6696,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op, Chain = DAG.getStore(Chain, DL, Passthru, StackPtr, PtrInfo); Chain = DAG.getMaskedStore(Chain, DL, Vec, StackPtr, DAG.getUNDEF(MVT::i64), - Mask, VecVT, MMO, ISD::UNINDEXED, /*IsTruncating=*/false, /*IsCompressing=*/true); + Mask, VecVT, MMO, ISD::UNINDEXED, + /*IsTruncating=*/false, /*IsCompressing=*/true); SDValue Compressed = DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo); From 76a15b21e889af3e8db1ad065f73c0a8731ba90b Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Mon, 29 Jul 2024 15:54:39 +0200 Subject: [PATCH 4/9] Add tests --- .../CodeGen/AArch64/sve-vector-compress.ll | 500 ++++++++++++++++++ 1 file changed, 500 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/sve-vector-compress.ll diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll new file mode 100644 index 0000000000000..ea9a77c11c53a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll @@ -0,0 +1,500 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s + +define @test_compress_nxv2i8( %vec, %mask) { +; CHECK-LABEL: test_compress_nxv2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: compact z0.d, p0, z0.d +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, %mask, undef) + ret %out +} + +define @test_compress_nxv2i16( %vec, %mask) { +; CHECK-LABEL: test_compress_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: compact z0.d, p0, z0.d +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, %mask, undef) + ret %out +} + +define @test_compress_nxv2i32( %vec, %mask) { +; CHECK-LABEL: test_compress_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: compact z0.d, p0, z0.d +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, %mask, undef) + ret %out +} + +define @test_compress_nxv2i64( %vec, %mask) { +; CHECK-LABEL: test_compress_nxv2i64: +; CHECK: // 
%bb.0:
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x i64> @llvm.experimental.vector.compress(<vscale x 2 x i64> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+ ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x float> @test_compress_nxv2f32(<vscale x 2 x float> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x float> @llvm.experimental.vector.compress(<vscale x 2 x float> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+ ret <vscale x 2 x float> %out
+}
+
+define <vscale x 2 x double> @test_compress_nxv2f64(<vscale x 2 x double> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.d, p0, z0.d
+; CHECK-NEXT: ret
+ %out = call <vscale x 2 x double> @llvm.experimental.vector.compress(<vscale x 2 x double> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+ ret <vscale x 2 x double> %out
+}
+
+define <vscale x 4 x i8> @test_compress_nxv4i8(<vscale x 4 x i8> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i8> @llvm.experimental.vector.compress(<vscale x 4 x i8> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+ ret <vscale x 4 x i8> %out
+}
+
+define <vscale x 4 x i16> @test_compress_nxv4i16(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i16> @llvm.experimental.vector.compress(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+ ret <vscale x 4 x i16> %out
+}
+
+define <vscale x 4 x i32> @test_compress_nxv4i32(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+ ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x float> @test_compress_nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: ret
+ %out = call <vscale x 4 x float> @llvm.experimental.vector.compress(<vscale x 4 x float> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+ ret <vscale x 4 x float> %out
+}
+
+define <vscale x 8 x i8> @test_compress_nxv8i8(<vscale x 8 x i8> %vec, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ld1b { z0.h }, p1/z, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i8> @llvm.experimental.vector.compress(<vscale x 8 x i8> %vec, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
+ ret <vscale x 8 x i8> %out
+}
+
+define <vscale x 8 x i16> @test_compress_nxv8i16(<vscale x 8 x i16> %vec, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: st1h { z0.h }, p0, [sp]
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %out = call <vscale x 8 x i16> @llvm.experimental.vector.compress(<vscale x 8 x i16> %vec, <vscale x 8 x i1> %mask, <vscale x 8 x i16> undef)
+ ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 16 x i8> @test_compress_nxv16i8(<vscale x 16 x i8> %vec, <vscale x 16 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]!
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: ld1b { z0.b }, p1/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, %mask, undef) + ret %out +} + +define @test_compress_illegal_element_type( %vec, %mask) { +; CHECK-LABEL: test_compress_illegal_element_type: +; CHECK: // %bb.0: +; CHECK-NEXT: compact z0.s, p0, z0.s +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, %mask, undef) + ret %out +} + +define @test_compress_large( %vec, %mask) { +; CHECK-LABEL: test_compress_large: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: punpklo p1.h, p0.b +; CHECK-NEXT: cnth x9 +; CHECK-NEXT: ptrue p2.s +; CHECK-NEXT: sub x9, x9, #1 +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: st1w { z0.s }, p1, [sp] +; CHECK-NEXT: cntp x8, p2, p1.s +; CHECK-NEXT: and x8, x8, #0xffffffff +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1w { z1.s }, p0, [x9, x8, lsl #2] +; CHECK-NEXT: ld1w { z0.s }, p2/z, [sp] +; CHECK-NEXT: ld1w { z1.s }, p2/z, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, %mask, undef) + ret %out +} + +define @test_compress_very_large( %vec, %mask) { +; CHECK-LABEL: test_compress_very_large: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-8 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p4.b +; CHECK-NEXT: rdvl x10, #2 +; CHECK-NEXT: mov x11, sp +; CHECK-NEXT: sub x10, x10, #1 +; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: cntp x8, p4, p0.b +; CHECK-NEXT: cntp x9, p4, p2.b +; CHECK-NEXT: eor p0.b, p4/z, p0.b, p1.b +; CHECK-NEXT: and x8, x8, #0xffffffff +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: and x9, x9, #0xffffffff +; CHECK-NEXT: csel x8, x8, x10, lo +; CHECK-NEXT: cmp x9, x10 +; CHECK-NEXT: st1b { z1.b }, p1, [x11, x8] +; CHECK-NEXT: addvl x8, sp, #2 +; CHECK-NEXT: csel x9, x9, x10, lo +; CHECK-NEXT: st1b { z2.b }, p2, [sp, #2, mul vl] +; CHECK-NEXT: addvl x10, sp, #4 +; CHECK-NEXT: st1b { z3.b }, p3, [x8, x9] +; CHECK-NEXT: cntp x8, p4, p0.b +; CHECK-NEXT: rdvl x9, #4 +; CHECK-NEXT: ld1b { z0.b }, p4/z, [sp, #1, mul vl] +; CHECK-NEXT: sub x9, x9, #1 +; CHECK-NEXT: st1b { z0.b }, p4, [sp, #5, mul vl] +; CHECK-NEXT: ld1b { z0.b }, p4/z, [sp] +; CHECK-NEXT: and x8, x8, #0xffffffff +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: st1b { z0.b }, p4, [sp, #4, mul vl] +; CHECK-NEXT: ld1b { z0.b }, p4/z, [sp, #2, mul vl] +; CHECK-NEXT: st1b { z0.b }, p4, [x10, x8] +; CHECK-NEXT: add x8, x10, x8 +; CHECK-NEXT: ld1b { z0.b }, p4/z, [sp, #3, mul vl] +; CHECK-NEXT: st1b { z0.b }, p4, [x8, #1, mul vl] +; CHECK-NEXT: ld1b { z0.b }, p4/z, [sp, #4, mul vl] +; CHECK-NEXT: ld1b { z1.b }, p4/z, [sp, #5, mul vl] +; CHECK-NEXT: ld1b { z2.b }, p4/z, [sp, #6, mul vl] +; CHECK-NEXT: ld1b { z3.b }, p4/z, [sp, #7, mul vl] +; CHECK-NEXT: addvl sp, sp, #8 +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, %mask, undef) + ret %out +} + + +; We pass a placeholder value for the const_mask* tests to check that they are converted to a no-op by simply copying +; the second vector input register to the ret register or doing nothing. 
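+; For example, an all-ones mask selects every lane, so the compress below is
+; expected to become a plain register move; an all-zeroes or undef mask should
+; need no code at all.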
+define @test_compress_const_splat1_mask( %ignore, %vec) { +; CHECK-LABEL: test_compress_const_splat1_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, splat (i1 -1), undef) + ret %out +} +define @test_compress_const_splat0_mask( %ignore, %vec) { +; CHECK-LABEL: test_compress_const_splat0_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, splat (i1 0), undef) + ret %out +} +define @test_compress_undef_mask( %ignore, %vec) { +; CHECK-LABEL: test_compress_undef_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, undef, undef) + ret %out +} + +define void @test_combine_compress_store_nxv16i8( %vec, %mask, ptr %ptr) { +; CHECK-LABEL: test_combine_compress_store_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, %mask, undef) + store %out, ptr %ptr + ret void +} + +define void @test_combine_compress_store_nxv4i32( %vec, %mask, ptr %ptr) { +; CHECK-LABEL: test_combine_compress_store_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, %mask, undef) + store %out, ptr %ptr + ret void +} + + +define <4 x i32> @test_compress_v4i32_with_sve(<4 x i32> %vec, <4 x i1> %mask) { +; CHECK-LABEL: test_compress_v4i32_with_sve: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: shl v1.4s, v1.4s, #31 +; CHECK-NEXT: cmlt v1.4s, v1.4s, #0 +; CHECK-NEXT: and z1.s, z1.s, #0x1 +; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; CHECK-NEXT: compact z0.s, p0, z0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> undef) + ret <4 x i32> %out +} + +define <1 x i32> @test_compress_v1i32_with_sve(<1 x i32> %vec, <1 x i1> %mask) { +; CHECK-LABEL: test_compress_v1i32_with_sve: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: shl v1.2s, v1.2s, #31 +; CHECK-NEXT: cmlt v1.2s, v1.2s, #0 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: and z1.d, z1.d, #0x1 +; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 +; CHECK-NEXT: compact z0.d, p0, z0.d +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ret + %out = call <1 x i32> @llvm.experimental.vector.compress(<1 x i32> %vec, <1 x i1> %mask, <1 x i32> undef) + ret <1 x i32> %out +} + +define <8 x i16> @test_compress_v8i16_with_sve(<8 x i16> %vec, <8 x i1> %mask) { +; CHECK-LABEL: test_compress_v8i16_with_sve: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: shl v1.8h, v1.8h, #15 +; CHECK-NEXT: cmlt v1.8h, v1.8h, #0 +; CHECK-NEXT: and z1.h, z1.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p0/z, z1.h, #0 +; CHECK-NEXT: st1h { z0.h }, p1, [sp] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %out = call <8 x i16> @llvm.experimental.vector.compress(<8 x i16> %vec, <8 x i1> %mask, <8 x i16> undef) + ret <8 x i16> %out +} + +define <4 x double> @test_compress_v4f64_with_sve(<4 x double> %vec, <4 x i1> %mask) { +; CHECK-LABEL: test_compress_v4f64_with_sve: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ushll v3.2d, v2.2s, #0 +; CHECK-NEXT: ushll2 v4.2d, v2.4s, #0 +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: shl v3.2d, v3.2d, #63 +; CHECK-NEXT: shl v4.2d, v4.2d, #63 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: eor w8, w8, w9 +; CHECK-NEXT: cmlt v3.2d, v3.2d, #0 +; CHECK-NEXT: cmlt v4.2d, v4.2d, #0 +; CHECK-NEXT: and x8, x8, #0x3 +; CHECK-NEXT: and z3.d, z3.d, #0x1 +; CHECK-NEXT: and z4.d, z4.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p0/z, z3.d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z4.d, #0 +; CHECK-NEXT: st1d { z0.d }, p1, [x10] +; CHECK-NEXT: st1d { z1.d }, p0, [x10, x8, lsl #3] +; CHECK-NEXT: ldp q0, q1, [sp], #32 +; CHECK-NEXT: ret + %out = call <4 x double> @llvm.experimental.vector.compress(<4 x double> %vec, <4 x i1> %mask, <4 x double> undef) + ret <4 x double> %out +} + +define <2 x i16> @test_compress_v2i16_with_sve(<2 x i16> %vec, <2 x i1> %mask) { +; CHECK-LABEL: test_compress_v2i16_with_sve: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: and z1.d, z1.d, #0x1 +; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 +; CHECK-NEXT: compact z0.d, p0, z0.d +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ret + %out = call <2 x i16> @llvm.experimental.vector.compress(<2 x i16> %vec, <2 x i1> %mask, <2 x i16> undef) + ret <2 x i16> %out +} + +define void @test_combine_compress_store_v4i32_with_sve(<4 x i32> %vec, <4 x i1> %mask, ptr %ptr) { +; CHECK-LABEL: test_combine_compress_store_v4i32_with_sve: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: and z1.s, z1.s, #0x1 +; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> undef) + store <4 x i32> %out, ptr %ptr + ret void +} + +define void @test_combine_compress_store_v16i8_with_sve(<16 x i8> %vec, <16 x i1> %mask, ptr %ptr) { +; CHECK-LABEL: test_combine_compress_store_v16i8_with_sve: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: and z1.b, 
z1.b, #0x1 +; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0 +; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ret + %out = call <16 x i8> @llvm.experimental.vector.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> undef) + store <16 x i8> %out, ptr %ptr + ret void +} + +define @test_compress_nxv4i32_with_passthru( %vec, %mask, %passthru) { +; CHECK-LABEL: test_compress_nxv4i32_with_passthru: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.s, p0/z, #1 // =0x1 +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: compact z0.s, p0, z0.s +; CHECK-NEXT: uaddv d2, p1, z2.s +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: index z2.s, #0, #1 +; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: cmphi p1.s, p1/z, z3.s, z2.s +; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, %mask, %passthru) + ret %out +} + +define @test_compress_nxv4i32_with_zero_passthru( %vec, %mask) { +; CHECK-LABEL: test_compress_nxv4i32_with_zero_passthru: +; CHECK: // %bb.0: +; CHECK-NEXT: compact z0.s, p0, z0.s +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, %mask, splat(i32 0)) + ret %out +} + +define @test_compress_nxv4i32_with_const_passthru( %vec, %mask) { +; CHECK-LABEL: test_compress_nxv4i32_with_const_passthru: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: compact z0.s, p0, z0.s +; CHECK-NEXT: uaddv d1, p1, z1.s +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: cmphi p1.s, p1/z, z2.s, z1.s +; CHECK-NEXT: mov z1.s, #5 // =0x5 +; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, %mask, splat(i32 5)) + ret %out +} + +define @test_compress_nxv16i8_with_passthru( %vec, %mask, %passthru) { +; CHECK-LABEL: test_compress_nxv16i8_with_passthru: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: st1b { z1.b }, p1, [sp] +; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: ld1b { z0.b }, p1/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, %mask, %passthru) + ret %out +} + +define @test_compress_nxv16i8_with_const_passthru( %vec, %mask) { +; CHECK-LABEL: test_compress_nxv16i8_with_const_passthru: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov z1.b, #5 // =0x5 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: st1b { z1.b }, p1, [sp] +; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: ld1b { z0.b }, p1/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %out = call @llvm.experimental.vector.compress( %vec, %mask, splat(i8 5)) + ret %out +} From fb3759fc2c5c2ea84d0a3e99e11310ba0398af4e Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Mon, 29 Jul 2024 16:02:38 +0200 Subject: [PATCH 5/9] Add combine for VECTOR_COMPRESS + store --- .../Target/AArch64/AArch64ISelLowering.cpp | 64 +++++++++++++++++++ .../CodeGen/AArch64/sve-vector-compress.ll | 30 ++++----- 2 files changed, 77 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index b76641e9df8bf..6bfeb4d11ec42 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -23074,6 +23074,67 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, return Chain; } +static SDValue combineVECTOR_COMPRESSStore(SelectionDAG &DAG, + StoreSDNode *Store, + const AArch64Subtarget *Subtarget) { + // If the regular store is preceded by an VECTOR_COMPRESS, we can combine them + // into a compressing store for scalable vectors in SVE. + SDValue VecOp = Store->getValue(); + EVT VecVT = VecOp.getValueType(); + if (VecOp.getOpcode() != ISD::VECTOR_COMPRESS || !Subtarget->hasSVE()) + return SDValue(); + + bool IsFixedLength = VecVT.isFixedLengthVector(); + if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128) + return SDValue(); + + SDLoc DL(Store); + SDValue Vec = VecOp.getOperand(0); + SDValue Mask = VecOp.getOperand(1); + SDValue Passthru = VecOp.getOperand(2); + EVT MemVT = Store->getMemoryVT(); + MachineMemOperand *MMO = Store->getMemOperand(); + SDValue Chain = Store->getChain(); + + // We can use the SVE register containing the NEON vector in its lowest bits. + if (IsFixedLength) { + EVT ElmtVT = VecVT.getVectorElementType(); + unsigned NumElmts = VecVT.getVectorNumElements(); + EVT ScalableVecVT = + MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), NumElmts); + EVT ScalableMaskVT = MVT::getScalableVectorVT( + Mask.getValueType().getVectorElementType().getSimpleVT(), NumElmts); + + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT, + DAG.getUNDEF(ScalableVecVT), Vec, + DAG.getConstant(0, DL, MVT::i64)); + Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT, + DAG.getUNDEF(ScalableMaskVT), Mask, + DAG.getConstant(0, DL, MVT::i64)); + Mask = DAG.getNode(ISD::TRUNCATE, DL, + ScalableMaskVT.changeVectorElementType(MVT::i1), Mask); + Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT, + DAG.getUNDEF(ScalableVecVT), Passthru, + DAG.getConstant(0, DL, MVT::i64)); + + MemVT = ScalableVecVT; + MMO->setType(LLT::scalable_vector(NumElmts, ElmtVT.getSizeInBits())); + } + + // If the passthru is all 0s, we don't need an explicit passthru store. 
+ unsigned MinElmts = VecVT.getVectorMinNumElements(); + if (ISD::isConstantSplatVectorAllZeros(Passthru.getNode()) && (MinElmts == 2 || MinElmts == 4)) + return SDValue(); + + if (!Passthru.isUndef()) + Chain = DAG.getStore(Chain, DL, Passthru, Store->getBasePtr(), MMO); + + return DAG.getMaskedStore(Chain, DL, Vec, Store->getBasePtr(), + DAG.getUNDEF(MVT::i64), Mask, MemVT, MMO, + ISD::UNINDEXED, Store->isTruncatingStore(), + /*IsCompressing=*/true); +} + static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, @@ -23118,6 +23179,9 @@ static SDValue performSTORECombine(SDNode *N, if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST)) return Store; + if (SDValue Store = combineVECTOR_COMPRESSStore(DAG, ST, Subtarget)) + return Store; + if (ST->isTruncatingStore()) { EVT StoreVT = ST->getMemoryVT(); if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT)) diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll index ea9a77c11c53a..cdebb0db47ceb 100644 --- a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll +++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll @@ -165,7 +165,7 @@ define @test_compress_large( %vec, @test_compress_very_large( %vec, @test_compress_very_large( %vec, @test_compress_v4i32_with_sve(<4 x i32> %vec, <4 x i1> %mask) { define <1 x i32> @test_compress_v1i32_with_sve(<1 x i32> %vec, <1 x i1> %mask) { ; CHECK-LABEL: test_compress_v1i32_with_sve: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: sbfx w8, w0, #0, #1 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: shl v1.2s, v1.2s, #31 -; CHECK-NEXT: cmlt v1.2s, v1.2s, #0 +; CHECK-NEXT: mov v1.s[0], w8 ; CHECK-NEXT: ushll v1.2d, v1.2s, #0 ; CHECK-NEXT: and z1.d, z1.d, #0x1 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 @@ -421,12 +421,10 @@ define void @test_combine_compress_store_v16i8_with_sve(<16 x i8> %vec, <16 x i1 define @test_compress_nxv4i32_with_passthru( %vec, %mask, %passthru) { ; CHECK-LABEL: test_compress_nxv4i32_with_passthru: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z2.s, p0/z, #1 // =0x1 -; CHECK-NEXT: ptrue p1.s -; CHECK-NEXT: compact z0.s, p0, z0.s -; CHECK-NEXT: uaddv d2, p1, z2.s -; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: cntp x8, p0, p0.s ; CHECK-NEXT: index z2.s, #0, #1 +; CHECK-NEXT: compact z0.s, p0, z0.s +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z3.s, w8 ; CHECK-NEXT: cmphi p1.s, p1/z, z3.s, z2.s ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s @@ -447,12 +445,10 @@ define @test_compress_nxv4i32_with_zero_passthru( @test_compress_nxv4i32_with_const_passthru( %vec, %mask) { ; CHECK-LABEL: test_compress_nxv4i32_with_const_passthru: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 -; CHECK-NEXT: ptrue p1.s -; CHECK-NEXT: compact z0.s, p0, z0.s -; CHECK-NEXT: uaddv d1, p1, z1.s -; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: cntp x8, p0, p0.s ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: compact z0.s, p0, z0.s +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: cmphi p1.s, p1/z, z2.s, z1.s ; CHECK-NEXT: mov z1.s, #5 // =0x5 From e27fcd14cdbdf35ff75b5dc62ec875a6f879e8c5 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Mon, 29 Jul 2024 16:19:50 +0200 Subject: [PATCH 6/9] Fix formatting --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp 
index 6bfeb4d11ec42..b495b0ebcd1e6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -23123,7 +23123,8 @@ static SDValue combineVECTOR_COMPRESSStore(SelectionDAG &DAG, // If the passthru is all 0s, we don't need an explicit passthru store. unsigned MinElmts = VecVT.getVectorMinNumElements(); - if (ISD::isConstantSplatVectorAllZeros(Passthru.getNode()) && (MinElmts == 2 || MinElmts == 4)) + if (ISD::isConstantSplatVectorAllZeros(Passthru.getNode()) && + (MinElmts == 2 || MinElmts == 4)) return SDValue(); if (!Passthru.isUndef()) From f1daac6fa325c1e0f7ecf836022420956e8b8528 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Tue, 6 Aug 2024 09:11:48 +0200 Subject: [PATCH 7/9] Fix wrong compress store --- .../SelectionDAG/LegalizeVectorTypes.cpp | 8 +- .../Target/AArch64/AArch64ISelLowering.cpp | 141 ++-------- .../CodeGen/AArch64/sve-vector-compress.ll | 244 ++---------------- 3 files changed, 39 insertions(+), 354 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index b42a54a56cfed..81066b03776d7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2418,18 +2418,18 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo, EVT VecVT = N->getValueType(0); auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT); - bool HasLegalOrCustom = false; + bool HasCustomLowering = false; EVT CheckVT = LoVT; while (CheckVT.getVectorMinNumElements() > 1) { - if (TLI.isOperationLegalOrCustom(ISD::VECTOR_COMPRESS, CheckVT)) { - HasLegalOrCustom = true; + if (TLI.isOperationCustom(ISD::VECTOR_COMPRESS, CheckVT)) { + HasCustomLowering = true; break; } CheckVT = CheckVT.getHalfNumVectorElementsVT(*DAG.getContext()); } SDValue Passthru = N->getOperand(2); - if (!HasLegalOrCustom || !Passthru.isUndef()) { + if (!HasCustomLowering || !Passthru.isUndef()) { SDValue Compressed = TLI.expandVECTOR_COMPRESS(N, DAG); std::tie(Lo, Hi) = DAG.SplitVector(Compressed, DL, LoVT, HiVT); return; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index b495b0ebcd1e6..55f02dc3f8290 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1535,24 +1535,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } - // We can lower types that have elements to svcompact and - // legal i8/i16 types via a compressing store. - for (auto VT : - {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32, - MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32, - MVT::nxv8i8, MVT::nxv8i16, MVT::nxv16i8}) - setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom); - - // If we have SVE, we can use SVE logic for legal (or smaller than legal) - // NEON vectors in the lowest bits of the SVE register. - if (Subtarget->hasSVE()) - for (auto VT : - {MVT::v1i8, MVT::v1i16, MVT::v1i32, MVT::v1i64, MVT::v1f32, - MVT::v1f64, MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, - MVT::v2f32, MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, - MVT::v4f32, MVT::v8i8, MVT::v8i16, MVT::v8i16, MVT::v16i8}) - setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom); - // NEON doesn't support masked loads/stores, but SME and SVE do. 
for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64, @@ -1792,6 +1774,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, MVT::v2f32, MVT::v4f32, MVT::v2f64}) setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); + // We can lower types that have elements to compact. + for (auto VT : + {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32, + MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32}) + setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom); + + // If we have SVE, we can use SVE logic for legal (or smaller than legal) + // NEON vectors in the lowest bits of the SVE register. + for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32, + MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32}) + setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom); + // Histcnt is SVE2 only if (Subtarget->hasSVE2()) setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::Other, @@ -6649,12 +6643,16 @@ SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op, assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector."); - if (!Subtarget->hasSVE()) + if (!Subtarget->isSVEAvailable()) return SDValue(); if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128) return SDValue(); + // Only supported for compact. + if (MinElmts != 2 && MinElmts != 4) + return SDValue(); + // We can use the SVE register containing the NEON vector in its lowest bits. if (IsFixedLength) { EVT ScalableVecVT = @@ -6678,46 +6676,12 @@ SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op, MaskVT = Mask.getValueType(); } - // Special case where we can't use svcompact but can do a compressing store - // and then reload the vector. - if (VecVT == MVT::nxv8i8 || VecVT == MVT::nxv16i8 || VecVT == MVT::nxv8i16) { - SDValue StackPtr = DAG.CreateStackTemporary(VecVT); - int FI = cast(StackPtr.getNode())->getIndex(); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); - - MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - PtrInfo, MachineMemOperand::Flags::MOStore, - LocationSize::precise(VecVT.getStoreSize()), - DAG.getReducedAlign(VecVT, /*UseABI=*/false)); - - SDValue Chain = DAG.getEntryNode(); - if (HasPassthru) - Chain = DAG.getStore(Chain, DL, Passthru, StackPtr, PtrInfo); - - Chain = DAG.getMaskedStore(Chain, DL, Vec, StackPtr, DAG.getUNDEF(MVT::i64), - Mask, VecVT, MMO, ISD::UNINDEXED, - /*IsTruncating=*/false, /*IsCompressing=*/true); - - SDValue Compressed = DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo); - - if (IsFixedLength) - Compressed = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FixedVecVT, - Compressed, DAG.getConstant(0, DL, MVT::i64)); - - return Compressed; - } - - // Only supported for svcompact. - if (MinElmts != 2 && MinElmts != 4) - return SDValue(); - - // Get legal type for svcompact instruction + // Get legal type for compact instruction EVT ContainerVT = getSVEContainerType(VecVT); EVT CastVT = VecVT.changeVectorElementTypeToInteger(); // Convert to i32 or i64 for smaller types, as these are the only supported - // sizes for svcompact. + // sizes for compact. 
if (ContainerVT != VecVT) { Vec = DAG.getBitcast(CastVT, Vec); Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec); @@ -6727,7 +6691,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op, ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(), DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec); - // svcompact fills with 0s, so if our passthru is all 0s, do nothing here. + // compact fills with 0s, so if our passthru is all 0s, do nothing here. if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) { SDValue Offset = DAG.getNode( ISD::ZERO_EXTEND, DL, MaskVT.changeVectorElementType(MVT::i32), Mask); @@ -23074,68 +23038,6 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, return Chain; } -static SDValue combineVECTOR_COMPRESSStore(SelectionDAG &DAG, - StoreSDNode *Store, - const AArch64Subtarget *Subtarget) { - // If the regular store is preceded by an VECTOR_COMPRESS, we can combine them - // into a compressing store for scalable vectors in SVE. - SDValue VecOp = Store->getValue(); - EVT VecVT = VecOp.getValueType(); - if (VecOp.getOpcode() != ISD::VECTOR_COMPRESS || !Subtarget->hasSVE()) - return SDValue(); - - bool IsFixedLength = VecVT.isFixedLengthVector(); - if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128) - return SDValue(); - - SDLoc DL(Store); - SDValue Vec = VecOp.getOperand(0); - SDValue Mask = VecOp.getOperand(1); - SDValue Passthru = VecOp.getOperand(2); - EVT MemVT = Store->getMemoryVT(); - MachineMemOperand *MMO = Store->getMemOperand(); - SDValue Chain = Store->getChain(); - - // We can use the SVE register containing the NEON vector in its lowest bits. - if (IsFixedLength) { - EVT ElmtVT = VecVT.getVectorElementType(); - unsigned NumElmts = VecVT.getVectorNumElements(); - EVT ScalableVecVT = - MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), NumElmts); - EVT ScalableMaskVT = MVT::getScalableVectorVT( - Mask.getValueType().getVectorElementType().getSimpleVT(), NumElmts); - - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT, - DAG.getUNDEF(ScalableVecVT), Vec, - DAG.getConstant(0, DL, MVT::i64)); - Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT, - DAG.getUNDEF(ScalableMaskVT), Mask, - DAG.getConstant(0, DL, MVT::i64)); - Mask = DAG.getNode(ISD::TRUNCATE, DL, - ScalableMaskVT.changeVectorElementType(MVT::i1), Mask); - Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT, - DAG.getUNDEF(ScalableVecVT), Passthru, - DAG.getConstant(0, DL, MVT::i64)); - - MemVT = ScalableVecVT; - MMO->setType(LLT::scalable_vector(NumElmts, ElmtVT.getSizeInBits())); - } - - // If the passthru is all 0s, we don't need an explicit passthru store. 
- unsigned MinElmts = VecVT.getVectorMinNumElements(); - if (ISD::isConstantSplatVectorAllZeros(Passthru.getNode()) && - (MinElmts == 2 || MinElmts == 4)) - return SDValue(); - - if (!Passthru.isUndef()) - Chain = DAG.getStore(Chain, DL, Passthru, Store->getBasePtr(), MMO); - - return DAG.getMaskedStore(Chain, DL, Vec, Store->getBasePtr(), - DAG.getUNDEF(MVT::i64), Mask, MemVT, MMO, - ISD::UNINDEXED, Store->isTruncatingStore(), - /*IsCompressing=*/true); -} - static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, @@ -23180,9 +23082,6 @@ static SDValue performSTORECombine(SDNode *N, if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST)) return Store; - if (SDValue Store = combineVECTOR_COMPRESSStore(DAG, ST, Subtarget)) - return Store; - if (ST->isTruncatingStore()) { EVT StoreVT = ST->getMemoryVT(); if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT)) diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll index cdebb0db47ceb..8007158daee8a 100644 --- a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll +++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll @@ -91,57 +91,6 @@ define @test_compress_nxv4f32( %vec, %out } -define @test_compress_nxv8i8( %vec, %mask) { -; CHECK-LABEL: test_compress_nxv8i8: -; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: st1b { z0.h }, p0, [sp, #1, mul vl] -; CHECK-NEXT: ld1b { z0.h }, p1/z, [sp, #1, mul vl] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret - %out = call @llvm.experimental.vector.compress( %vec, %mask, undef) - ret %out -} - -define @test_compress_nxv8i16( %vec, %mask) { -; CHECK-LABEL: test_compress_nxv8i16: -; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: st1h { z0.h }, p0, [sp] -; CHECK-NEXT: ld1h { z0.h }, p1/z, [sp] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret - %out = call @llvm.experimental.vector.compress( %vec, %mask, undef) - ret %out -} - -define @test_compress_nxv16i8( %vec, %mask) { -; CHECK-LABEL: test_compress_nxv16i8: -; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: st1b { z0.b }, p0, [sp] -; CHECK-NEXT: ld1b { z0.b }, p1/z, [sp] -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret - %out = call @llvm.experimental.vector.compress( %vec, %mask, undef) - ret %out -} - define @test_compress_illegal_element_type( %vec, %mask) { ; CHECK-LABEL: test_compress_illegal_element_type: ; CHECK: // %bb.0: @@ -158,20 +107,22 @@ define @test_compress_large( %vec, @test_compress_large( %vec, %out } -define @test_compress_very_large( %vec, %mask) { -; CHECK-LABEL: test_compress_very_large: -; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-8 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p4.b -; CHECK-NEXT: rdvl x10, #2 -; CHECK-NEXT: mov x11, sp -; CHECK-NEXT: sub x10, x10, #1 -; CHECK-NEXT: st1b { z0.b }, p0, [sp] -; CHECK-NEXT: cntp x8, p4, p0.b -; CHECK-NEXT: cntp x9, p4, p2.b -; CHECK-NEXT: eor p0.b, p4/z, p0.b, p1.b -; CHECK-NEXT: mov w8, w8 -; CHECK-NEXT: cmp x8, x10 -; CHECK-NEXT: mov w9, w9 -; CHECK-NEXT: csel x8, x8, x10, lo -; CHECK-NEXT: cmp x9, x10 -; CHECK-NEXT: st1b { z1.b }, p1, [x11, x8] -; CHECK-NEXT: addvl x8, sp, #2 -; CHECK-NEXT: csel x9, x9, x10, lo -; CHECK-NEXT: st1b { z2.b }, p2, [sp, #2, mul vl] -; CHECK-NEXT: addvl x10, sp, #4 -; CHECK-NEXT: st1b { z3.b }, p3, [x8, x9] -; CHECK-NEXT: cntp x8, p4, p0.b -; CHECK-NEXT: rdvl x9, #4 -; CHECK-NEXT: ld1b { z0.b }, p4/z, [sp, #1, mul vl] -; CHECK-NEXT: sub x9, x9, #1 -; CHECK-NEXT: st1b { z0.b }, p4, [sp, #5, mul vl] -; CHECK-NEXT: ld1b { z0.b }, p4/z, [sp] -; CHECK-NEXT: mov w8, w8 -; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: st1b { z0.b }, p4, [sp, #4, mul vl] -; CHECK-NEXT: ld1b { z0.b }, p4/z, [sp, #2, mul vl] -; CHECK-NEXT: st1b { z0.b }, p4, [x10, x8] -; CHECK-NEXT: add x8, x10, x8 -; CHECK-NEXT: ld1b { z0.b }, p4/z, [sp, #3, mul vl] -; CHECK-NEXT: st1b { z0.b }, p4, [x8, #1, mul vl] -; CHECK-NEXT: ld1b { z0.b }, p4/z, [sp, #4, mul vl] -; CHECK-NEXT: ld1b { z1.b }, p4/z, [sp, #5, mul vl] -; CHECK-NEXT: ld1b { z2.b }, p4/z, [sp, #6, mul vl] -; CHECK-NEXT: ld1b { z3.b }, p4/z, [sp, #7, mul vl] -; CHECK-NEXT: addvl sp, sp, #8 -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret - %out = call @llvm.experimental.vector.compress( %vec, %mask, undef) - ret %out -} - - ; We pass a placeholder value for the const_mask* tests to check that they are converted to a no-op by simply copying ; the second vector input register to the ret register or doing nothing. 
 define @test_compress_const_splat1_mask( %ignore, %vec) {
@@ -261,27 +155,6 @@ define @test_compress_undef_mask( %ignore, ret %out }
-define void @test_combine_compress_store_nxv16i8( %vec, %mask, ptr %ptr) {
-; CHECK-LABEL: test_combine_compress_store_nxv16i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
-; CHECK-NEXT:    ret
-  %out = call @llvm.experimental.vector.compress( %vec, %mask, undef)
-  store %out, ptr %ptr
-  ret void
-}
-
-define void @test_combine_compress_store_nxv4i32( %vec, %mask, ptr %ptr) {
-; CHECK-LABEL: test_combine_compress_store_nxv4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
-; CHECK-NEXT:    ret
-  %out = call @llvm.experimental.vector.compress( %vec, %mask, undef)
-  store %out, ptr %ptr
-  ret void
-}
-
-
 define <4 x i32> @test_compress_v4i32_with_sve(<4 x i32> %vec, <4 x i1> %mask) {
 ; CHECK-LABEL: test_compress_v4i32_with_sve:
 ; CHECK:       // %bb.0:
@@ -317,30 +190,6 @@ define <1 x i32> @test_compress_v1i32_with_sve(<1 x i32> %vec, <1 x i1> %mask) {
   ret <1 x i32> %out
 }
-define <8 x i16> @test_compress_v8i16_with_sve(<8 x i16> %vec, <8 x i1> %mask) {
-; CHECK-LABEL: test_compress_v8i16_with_sve:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    shl v1.8h, v1.8h, #15
-; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
-; CHECK-NEXT:    and z1.h, z1.h, #0x1
-; CHECK-NEXT:    cmpne p1.h, p0/z, z1.h, #0
-; CHECK-NEXT:    st1h { z0.h }, p1, [sp]
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %out = call <8 x i16> @llvm.experimental.vector.compress(<8 x i16> %vec, <8 x i1> %mask, <8 x i16> undef)
-  ret <8 x i16> %out
-}
-
 define <4 x double> @test_compress_v4f64_with_sve(<4 x double> %vec, <4 x i1> %mask) {
 ; CHECK-LABEL: test_compress_v4f64_with_sve:
 ; CHECK:       // %bb.0:
@@ -348,9 +197,8 @@ define <4 x double> @test_compress_v4f64_with_sve(<4 x double> %vec, <4 x i1> %m
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    ushll v3.2d, v2.2s, #0
 ; CHECK-NEXT:    ushll2 v4.2d, v2.4s, #0
 ; CHECK-NEXT:    fmov x8, d2
@@ -358,15 +206,19 @@ define <4 x double> @test_compress_v4f64_with_sve(<4 x double> %vec, <4 x i1> %m
 ; CHECK-NEXT:    shl v4.2d, v4.2d, #63
 ; CHECK-NEXT:    lsr x9, x8, #32
 ; CHECK-NEXT:    eor w8, w8, w9
+; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    cmlt v3.2d, v3.2d, #0
 ; CHECK-NEXT:    cmlt v4.2d, v4.2d, #0
 ; CHECK-NEXT:    and x8, x8, #0x3
+; CHECK-NEXT:    lsl x8, x8, #3
 ; CHECK-NEXT:    and z3.d, z3.d, #0x1
 ; CHECK-NEXT:    and z4.d, z4.d, #0x1
 ; CHECK-NEXT:    cmpne p1.d, p0/z, z3.d, #0
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z4.d, #0
-; CHECK-NEXT:    st1d { z0.d }, p1, [x10]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x10, x8, lsl #3]
+; CHECK-NEXT:    compact z0.d, p1, z0.d
+; CHECK-NEXT:    compact z1.d, p0, z1.d
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    str q1, [x9, x8]
 ; CHECK-NEXT:    ldp q0, q1, [sp], #32
 ; CHECK-NEXT:    ret
   %out = call <4 x double> @llvm.experimental.vector.compress(<4 x double> %vec, <4 x i1> %mask, <4 x double> undef)
@@ -388,35 +240,6 @@ define <2 x i16> @test_compress_v2i16_with_sve(<2 x i16> %vec, <2 x i1> %mask) {
   ret <2 x i16> %out
 }
-define void @test_combine_compress_store_v4i32_with_sve(<4 x i32> %vec, <4 x i1> %mask, ptr %ptr) {
-; CHECK-LABEL: test_combine_compress_store_v4i32_with_sve:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    and z1.s, z1.s, #0x1
-; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
-; CHECK-NEXT:    ret
-  %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> undef)
-  store <4 x i32> %out, ptr %ptr
-  ret void
-}
-
-define void @test_combine_compress_store_v16i8_with_sve(<16 x i8> %vec, <16 x i1> %mask, ptr %ptr) {
-; CHECK-LABEL: test_combine_compress_store_v16i8_with_sve:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
-; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    and z1.b, z1.b, #0x1
-; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
-; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
-; CHECK-NEXT:    ret
-  %out = call <16 x i8> @llvm.experimental.vector.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> undef)
-  store <16 x i8> %out, ptr %ptr
-  ret void
-}
 define @test_compress_nxv4i32_with_passthru( %vec, %mask, %passthru) {
 ; CHECK-LABEL: test_compress_nxv4i32_with_passthru:
 ; CHECK:       // %bb.0:
@@ -457,40 +280,3 @@ define @test_compress_nxv4i32_with_const_passthru( @llvm.experimental.vector.compress( %vec, %mask, splat(i32 5))
   ret %out
 }
-
-define @test_compress_nxv16i8_with_passthru( %vec, %mask, %passthru) {
-; CHECK-LABEL: test_compress_nxv16i8_with_passthru:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1b { z1.b }, p1, [sp]
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %out = call @llvm.experimental.vector.compress( %vec, %mask, %passthru)
-  ret %out
-}
-
-define @test_compress_nxv16i8_with_const_passthru( %vec, %mask) {
-; CHECK-LABEL: test_compress_nxv16i8_with_const_passthru:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    mov z1.b, #5 // =0x5
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1b { z1.b }, p1, [sp]
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [sp]
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %out = call @llvm.experimental.vector.compress( %vec, %mask, splat(i8 5))
-  ret %out
-}

From 78f8ad7767117009a14173a868a797760113f5be Mon Sep 17 00:00:00 2001
From: Lawrence Benson
Date: Wed, 7 Aug 2024 12:13:47 +0200
Subject: [PATCH 8/9] Use WHILELO for passthru merge

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp  | 16 +++++++++-------
 llvm/test/CodeGen/AArch64/sve-vector-compress.ll | 14 ++++----------
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ea1fabbca3f12..9479f8607e3a5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6695,14 +6695,16 @@ SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
   // compact fills with 0s, so if our passthru is all 0s, do nothing here.
   if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
     SDValue Offset = DAG.getNode(
-        ISD::ZERO_EXTEND, DL, MaskVT.changeVectorElementType(MVT::i32), Mask);
-    Offset = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, Offset);
+        ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
+        DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask);
+
+    SDValue IndexMask = DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
+        DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
+        DAG.getConstant(0, DL, MVT::i64), Offset);
+
     Compressed =
-        DAG.getNode(ISD::VP_MERGE, DL, VecVT,
-                    DAG.getSplatVector(MaskVT, DL,
-                                       DAG.getAllOnesConstant(
-                                           DL, MaskVT.getVectorElementType())),
-                    Compressed, Passthru, Offset);
+        DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
   }

   // Extracting from a legal SVE type before truncating produces better code.
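[Note: the following sketch is not part of the patch. It is an ACLE-intrinsics illustration, with an assumed u32 element type and made-up function/variable names, of what the cntp + whilelo + select sequence above computes for a single legal vector; the unsigned svwhilelt_* intrinsic is the ACLE spelling that lowers to WHILELO.]

#include <arm_sve.h>

// Sketch only: compress vec under mask, then merge with passthru by keeping
// the first num_active lanes of the compacted result.
svuint32_t compress_with_passthru_sketch(svuint32_t vec, svbool_t mask,
                                         svuint32_t passthru) {
  svuint32_t compressed = svcompact_u32(mask, vec);  // COMPACT packs active lanes low, zeros the rest
  uint64_t num_active = svcntp_b32(mask, mask);      // CNTP counts the active mask lanes
  svbool_t lead = svwhilelt_b32_u64(0, num_active);  // predicate covering the first num_active lanes
  return svsel_u32(lead, compressed, passthru);      // SEL keeps passthru in the remaining lanes
}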
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
index 8007158daee8a..84c15e4fbc33c 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
@@ -245,12 +245,9 @@ define @test_compress_nxv4i32_with_passthru( @llvm.experimental.vector.compress( %vec, %mask, %passthru)
   ret %out
@@ -269,13 +266,10 @@ define @test_compress_nxv4i32_with_const_passthru( @llvm.experimental.vector.compress( %vec, %mask, splat(i32 5))
   ret %out

From c9af6f2c10c5579580189b46f8d094ab1f4d5210 Mon Sep 17 00:00:00 2001
From: Lawrence Benson
Date: Mon, 12 Aug 2024 15:56:29 +0200
Subject: [PATCH 9/9] Add check for legal compress

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index eea0c178bf810..ad19e7fd876b2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -2425,7 +2425,10 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo,
   bool HasCustomLowering = false;
   EVT CheckVT = LoVT;
   while (CheckVT.getVectorMinNumElements() > 1) {
-    if (TLI.isOperationCustom(ISD::VECTOR_COMPRESS, CheckVT)) {
+    // TLI.isOperationLegalOrCustom requires a legal type, but we could have a
+    // custom lowering for illegal types. So we do the checks separately.
+    if (TLI.isOperationLegal(ISD::VECTOR_COMPRESS, CheckVT) ||
+        TLI.isOperationCustom(ISD::VECTOR_COMPRESS, CheckVT)) {
       HasCustomLowering = true;
       break;
     }