From a27a811233d6248a95d830c4ea6b6370c1305d7b Mon Sep 17 00:00:00 2001 From: James Chesterman Date: Fri, 28 Feb 2025 17:10:50 +0000 Subject: [PATCH 01/12] [SelectionDAG] Improve type legalisation for PARTIAL_REDUCE_MLA Implement proper splitting functions for PARTIAL_REDUCE_MLA ISD nodes. This makes the udot_8to64 and sdot_8to64 tests generate dot product instructions for when the new ISD nodes are used. --- llvm/include/llvm/CodeGen/TargetLowering.h | 6 +++++ .../SelectionDAG/LegalizeVectorTypes.cpp | 26 ++++++++++++++++--- .../AArch64/sve-partial-reduce-dot-product.ll | 4 +++ 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index abe261728a3e6..7b0e15f951681 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1668,6 +1668,12 @@ class TargetLoweringBase { return Action == Legal || Action == Custom; } + /// Return true if a PARTIAL_REDUCE_U/SMLA node with the specified types is + /// legal for this target. + bool isPartialReduceMLALegal(EVT AccVT, EVT InputVT) const { + return getPartialReduceMLAAction(AccVT, InputVT) == Legal; + } + /// If the action for this operation is to promote, this method returns the /// ValueType to promote to. 
MVT getTypeToPromoteTo(unsigned Op, MVT VT) const { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index a01e1cff74564..d0ae436a8758f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3220,8 +3220,26 @@ void DAGTypeLegalizer::SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo, void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc DL(N); - SDValue Expanded = TLI.expandPartialReduceMLA(N, DAG); - std::tie(Lo, Hi) = DAG.SplitVector(Expanded, DL); + SDValue Acc = N->getOperand(0); + SDValue Input1 = N->getOperand(1); + + // If the node has not gone through the DAG combine, then do not attempt to + // legalise, just expand. + if (!TLI.isPartialReduceMLALegal(Acc.getValueType(), Input1.getValueType())) { + SDValue Expanded = TLI.expandPartialReduceMLA(N, DAG); + std::tie(Lo, Hi) = DAG.SplitVector(Expanded, DL); + return; + } + + SDValue AccLo, AccHi, Input1Lo, Input1Hi, Input2Lo, Input2Hi; + std::tie(AccLo, AccHi) = DAG.SplitVector(Acc, DL); + std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(Input1, DL); + std::tie(Input2Lo, Input2Hi) = DAG.SplitVector(N->getOperand(2), DL); + unsigned Opcode = N->getOpcode(); + EVT ResultVT = AccLo.getValueType(); + + Lo = DAG.getNode(Opcode, DL, ResultVT, AccLo, Input1Lo, Input2Lo); + Hi = DAG.getNode(Opcode, DL, ResultVT, AccHi, Input1Hi, Input2Hi); } void DAGTypeLegalizer::SplitVecRes_VECTOR_DEINTERLEAVE(SDNode *N) { @@ -4501,7 +4519,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM(SDNode *N) { } SDValue DAGTypeLegalizer::SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N) { - return TLI.expandPartialReduceMLA(N, DAG); + SDValue Lo, Hi; + SplitVecRes_PARTIAL_REDUCE_MLA(N, Lo, Hi); + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi); } //===----------------------------------------------------------------------===// 
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll index ed27f40aba774..71936b686be15 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll @@ -259,6 +259,8 @@ define @udot_8to64( %acc, %a to @@ -293,6 +295,8 @@ define @sdot_8to64( %acc, Date: Tue, 15 Apr 2025 13:12:49 +0100 Subject: [PATCH 02/12] Explicitly set PartialReduceMLAActions --- .../Target/AArch64/AArch64ISelLowering.cpp | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 447794cc2b744..810d42635e7b2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1604,6 +1604,26 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MSTORE, VT, Custom); } + if (EnablePartialReduceNodes) { + for (MVT VT : MVT::integer_scalable_vector_valuetypes()) { + for (MVT InnerVT : MVT::integer_scalable_vector_valuetypes()) { + // 1. Set all combinations where a type is illegal to "Legal" + // - These will be legalized to a legal type pair + // - Avoid expanding them too early (or preventing folds) + if (!isTypeLegal(VT) || !isTypeLegal(InnerVT)) { + setPartialReduceMLAAction(VT, InnerVT, Legal); + continue; + } + // 2. Set all legal combinations to "Expand" + // - Not all of these can be lowered (via a Legal or Custom lowering). + setPartialReduceMLAAction(VT, InnerVT, Expand); + } + } + // 3. Mark known legal pairs as 'Legal' (these will expand to USDOT). + setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i16, Legal); + setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i8, Legal); + } + // Firstly, exclude all scalable vector extending loads/truncating stores, // include both integer and floating scalable vector. 
for (MVT VT : MVT::scalable_vector_valuetypes()) { @@ -1856,6 +1876,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Other pairs will default to 'Expand'. setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i16, Legal); setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i8, Legal); + + setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i64, Custom); + setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i32, Custom); + + setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv4i64, Custom); + setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv8i32, Custom); + setPartialReduceMLAAction(MVT::nxv8i16, MVT::nxv16i16, Custom); + setPartialReduceMLAAction(MVT::nxv16i8, MVT::nxv32i8, Custom); } // Handle operations that are only available in non-streaming SVE mode. From 2ff7e91f9374a90f5146a5321c9ae06e15d3cb8d Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Thu, 24 Apr 2025 16:27:21 +0100 Subject: [PATCH 03/12] Re-generate tests --- .../AArch64/sve-partial-reduce-dot-product.ll | 292 +++++++++--------- .../AArch64/sve-partial-reduce-wide-add.ll | 92 ++++-- 2 files changed, 214 insertions(+), 170 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll index 71936b686be15..e36c56b7487ee 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll @@ -214,53 +214,51 @@ define @udot_8to64( %acc, %a to @@ -292,47 +290,45 @@ define @sdot_8to64( %acc, @usdot_8to64( %acc, @sudot_8to64( %acc, @udot_no_bin_op_8to64( %acc, %a to @@ -773,27 +769,27 @@ define @sdot_no_bin_op_8to64( %acc, %a to diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll index 11fb60ead4fb2..602aa9df33b08 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll @@ -203,17 +203,41 @@ 
entry: } define @signed_wide_add_nxv8i32( %acc, %input){ -; CHECK-LABEL: signed_wide_add_nxv8i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sunpkhi z4.d, z2.s -; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: sunpkhi z5.d, z3.s -; CHECK-NEXT: sunpklo z3.d, z3.s -; CHECK-NEXT: add z0.d, z0.d, z2.d -; CHECK-NEXT: add z1.d, z1.d, z4.d -; CHECK-NEXT: add z0.d, z3.d, z0.d -; CHECK-NEXT: add z1.d, z5.d, z1.d -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: signed_wide_add_nxv8i32: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: sunpkhi z4.d, z2.s +; CHECK-SVE2-NEXT: sunpklo z2.d, z2.s +; CHECK-SVE2-NEXT: sunpkhi z5.d, z3.s +; CHECK-SVE2-NEXT: sunpklo z3.d, z3.s +; CHECK-SVE2-NEXT: add z0.d, z0.d, z2.d +; CHECK-SVE2-NEXT: add z1.d, z1.d, z4.d +; CHECK-SVE2-NEXT: add z0.d, z3.d, z0.d +; CHECK-SVE2-NEXT: add z1.d, z5.d, z1.d +; CHECK-SVE2-NEXT: ret +; +; CHECK-SVE-LABEL: signed_wide_add_nxv8i32: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: sunpkhi z4.d, z2.s +; CHECK-SVE-NEXT: sunpklo z2.d, z2.s +; CHECK-SVE-NEXT: sunpkhi z5.d, z3.s +; CHECK-SVE-NEXT: sunpklo z3.d, z3.s +; CHECK-SVE-NEXT: add z0.d, z0.d, z2.d +; CHECK-SVE-NEXT: add z1.d, z1.d, z4.d +; CHECK-SVE-NEXT: add z0.d, z3.d, z0.d +; CHECK-SVE-NEXT: add z1.d, z5.d, z1.d +; CHECK-SVE-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: signed_wide_add_nxv8i32: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z4.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d +; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d +; CHECK-NEWLOWERING-NEXT: ret entry: %input.wide = sext %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64( %acc, %input.wide) @@ -221,17 +245,41 @@ entry: } define @unsigned_wide_add_nxv8i32( %acc, %input){ 
-; CHECK-LABEL: unsigned_wide_add_nxv8i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uunpkhi z4.d, z2.s -; CHECK-NEXT: uunpklo z2.d, z2.s -; CHECK-NEXT: uunpkhi z5.d, z3.s -; CHECK-NEXT: uunpklo z3.d, z3.s -; CHECK-NEXT: add z0.d, z0.d, z2.d -; CHECK-NEXT: add z1.d, z1.d, z4.d -; CHECK-NEXT: add z0.d, z3.d, z0.d -; CHECK-NEXT: add z1.d, z5.d, z1.d -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i32: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: uunpkhi z4.d, z2.s +; CHECK-SVE2-NEXT: uunpklo z2.d, z2.s +; CHECK-SVE2-NEXT: uunpkhi z5.d, z3.s +; CHECK-SVE2-NEXT: uunpklo z3.d, z3.s +; CHECK-SVE2-NEXT: add z0.d, z0.d, z2.d +; CHECK-SVE2-NEXT: add z1.d, z1.d, z4.d +; CHECK-SVE2-NEXT: add z0.d, z3.d, z0.d +; CHECK-SVE2-NEXT: add z1.d, z5.d, z1.d +; CHECK-SVE2-NEXT: ret +; +; CHECK-SVE-LABEL: unsigned_wide_add_nxv8i32: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: uunpkhi z4.d, z2.s +; CHECK-SVE-NEXT: uunpklo z2.d, z2.s +; CHECK-SVE-NEXT: uunpkhi z5.d, z3.s +; CHECK-SVE-NEXT: uunpklo z3.d, z3.s +; CHECK-SVE-NEXT: add z0.d, z0.d, z2.d +; CHECK-SVE-NEXT: add z1.d, z1.d, z4.d +; CHECK-SVE-NEXT: add z0.d, z3.d, z0.d +; CHECK-SVE-NEXT: add z1.d, z5.d, z1.d +; CHECK-SVE-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: unsigned_wide_add_nxv8i32: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d +; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d +; CHECK-NEWLOWERING-NEXT: ret entry: %input.wide = zext %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64( %acc, %input.wide) From 76d2744f6a5f58ef00c6d0d00e4993d1db9d6ecc Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Thu, 24 Apr 2025 16:30:20 +0100 
Subject: [PATCH 04/12] Remove erroneous setPartialReduceMLAAction calls --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 810d42635e7b2..bbaa690e55980 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1876,14 +1876,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Other pairs will default to 'Expand'. setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i16, Legal); setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i8, Legal); - - setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i64, Custom); - setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i32, Custom); - - setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv4i64, Custom); - setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv8i32, Custom); - setPartialReduceMLAAction(MVT::nxv8i16, MVT::nxv16i16, Custom); - setPartialReduceMLAAction(MVT::nxv16i8, MVT::nxv32i8, Custom); } // Handle operations that are only available in non-streaming SVE mode. 
From b36ca2bb7d7090eab042ccc2cd229a2299b7a27b Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Thu, 24 Apr 2025 16:36:26 +0100 Subject: [PATCH 05/12] Remove dead/duplicate code --- .../Target/AArch64/AArch64ISelLowering.cpp | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index bbaa690e55980..447794cc2b744 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1604,26 +1604,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MSTORE, VT, Custom); } - if (EnablePartialReduceNodes) { - for (MVT VT : MVT::integer_scalable_vector_valuetypes()) { - for (MVT InnerVT : MVT::integer_scalable_vector_valuetypes()) { - // 1. Set all combinations where a type is illegal to "Legal" - // - These will be legalized to a legal type pair - // - Avoid expanding them too early (or preventing folds) - if (!isTypeLegal(VT) || !isTypeLegal(InnerVT)) { - setPartialReduceMLAAction(VT, InnerVT, Legal); - continue; - } - // 2. Set all legal combinations to "Expand" - // - Not all of these can be lowered (via a Legal or Custom lowering). - setPartialReduceMLAAction(VT, InnerVT, Expand); - } - } - // 3. Mark known legal pairs as 'Legal' (these will expand to USDOT). - setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i16, Legal); - setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i8, Legal); - } - // Firstly, exclude all scalable vector extending loads/truncating stores, // include both integer and floating scalable vector. 
for (MVT VT : MVT::scalable_vector_valuetypes()) { From 6c782ea7b2f90aa6f2fa70536dfcfe13e7297ef9 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Tue, 29 Apr 2025 14:43:06 +0100 Subject: [PATCH 06/12] Address comments and alter how PARTIAL_REDUCE_MLA operands are split --- .../SelectionDAG/LegalizeVectorTypes.cpp | 24 +- .../neon-partial-reduce-dot-product.ll | 344 +++++---- .../CodeGen/AArch64/partial-reduction-add.ll | 16 +- .../AArch64/sve-partial-reduce-dot-product.ll | 668 +++++++----------- .../AArch64/sve-partial-reduce-wide-add.ll | 116 +-- 5 files changed, 487 insertions(+), 681 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index d0ae436a8758f..7d690ea2205d8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3223,14 +3223,6 @@ void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, SDValue Acc = N->getOperand(0); SDValue Input1 = N->getOperand(1); - // If the node has not gone through the DAG combine, then do not attempt to - // legalise, just expand. 
- if (!TLI.isPartialReduceMLALegal(Acc.getValueType(), Input1.getValueType())) { - SDValue Expanded = TLI.expandPartialReduceMLA(N, DAG); - std::tie(Lo, Hi) = DAG.SplitVector(Expanded, DL); - return; - } - SDValue AccLo, AccHi, Input1Lo, Input1Hi, Input2Lo, Input2Hi; std::tie(AccLo, AccHi) = DAG.SplitVector(Acc, DL); std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(Input1, DL); @@ -4519,9 +4511,19 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM(SDNode *N) { } SDValue DAGTypeLegalizer::SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N) { - SDValue Lo, Hi; - SplitVecRes_PARTIAL_REDUCE_MLA(N, Lo, Hi); - return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi); + + SDLoc DL(N); + SDValue Acc = N->getOperand(0); + SDValue Input1 = N->getOperand(1); + + SDValue Input1Lo, Input1Hi, Input2Lo, Input2Hi; + std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(Input1, DL); + std::tie(Input2Lo, Input2Hi) = DAG.SplitVector(N->getOperand(2), DL); + unsigned Opcode = N->getOpcode(); + EVT ResultVT = Acc.getValueType(); + + auto Lo = DAG.getNode(Opcode, DL, ResultVT, Acc, Input1Lo, Input2Lo); + return DAG.getNode(Opcode, DL, ResultVT, Lo, Input1Hi, Input2Hi); } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll index 9e305056abce2..ab9813aa796e3 100644 --- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll @@ -14,11 +14,10 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { ; CHECK-NODOT: // %bb.0: ; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b ; CHECK-NODOT-NEXT: umull2 v1.8h, v2.16b, v1.16b -; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0 ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v3.4h -; CHECK-NODOT-NEXT: uaddw2 v2.4s, v2.4s, v3.8h +; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v3.8h +; CHECK-NODOT-NEXT: uaddw v0.4s, 
v0.4s, v1.4h ; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h -; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s ; CHECK-NODOT-NEXT: ret %u.wide = zext <16 x i8> %u to <16 x i32> %s.wide = zext <16 x i8> %s to <16 x i32> @@ -50,18 +49,17 @@ define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){ ; CHECK-NODOT-NEXT: mov x8, xzr ; CHECK-NODOT-NEXT: .LBB1_1: // %vector.body ; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NODOT-NEXT: ldr q0, [x0, x8] -; CHECK-NODOT-NEXT: ldr q2, [x1, x8] +; CHECK-NODOT-NEXT: ldr q2, [x0, x8] +; CHECK-NODOT-NEXT: ldr q3, [x1, x8] +; CHECK-NODOT-NEXT: mov v0.16b, v1.16b ; CHECK-NODOT-NEXT: add x8, x8, #16 +; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b +; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b ; CHECK-NODOT-NEXT: cmp x8, #16 -; CHECK-NODOT-NEXT: umull v3.8h, v0.8b, v2.8b -; CHECK-NODOT-NEXT: umull2 v2.8h, v0.16b, v2.16b -; CHECK-NODOT-NEXT: mov v0.16b, v1.16b -; CHECK-NODOT-NEXT: ushll v1.4s, v2.4h, #0 -; CHECK-NODOT-NEXT: uaddw v4.4s, v0.4s, v3.4h -; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v3.8h -; CHECK-NODOT-NEXT: uaddw2 v2.4s, v4.4s, v2.8h -; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v4.4h +; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v4.8h +; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v2.4h +; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v2.8h ; CHECK-NODOT-NEXT: b.ne .LBB1_1 ; CHECK-NODOT-NEXT: // %bb.2: // %end ; CHECK-NODOT-NEXT: ret @@ -98,14 +96,14 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) { ; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0 -; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 +; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0 +; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, 
v2.16b, #8 -; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s -; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h -; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s +; CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8 +; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s ; CHECK-NODOT-NEXT: ret %u.wide = zext <8 x i8> %u to <8 x i32> %s.wide = zext <8 x i8> %s to <8 x i32> @@ -124,11 +122,10 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { ; CHECK-NODOT: // %bb.0: ; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b ; CHECK-NODOT-NEXT: smull2 v1.8h, v2.16b, v1.16b -; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0 ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v3.4h -; CHECK-NODOT-NEXT: saddw2 v2.4s, v2.4s, v3.8h +; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v3.8h +; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h -; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s ; CHECK-NODOT-NEXT: ret %u.wide = sext <16 x i8> %u to <16 x i32> %s.wide = sext <16 x i8> %s to <16 x i32> @@ -148,14 +145,14 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) { ; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0 -; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0 -; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h -; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 +; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0 +; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8 -; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s -; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h -; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s +; CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8 +; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h +; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s ; 
CHECK-NODOT-NEXT: ret %u.wide = sext <8 x i8> %u to <8 x i32> %s.wide = sext <8 x i8> %s to <8 x i32> @@ -168,14 +165,13 @@ define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { ; CHECK-NOI8MM-LABEL: usdot: ; CHECK-NOI8MM: // %bb.0: ; CHECK-NOI8MM-NEXT: ushll v3.8h, v1.8b, #0 -; CHECK-NOI8MM-NEXT: ushll2 v1.8h, v1.16b, #0 ; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0 +; CHECK-NOI8MM-NEXT: ushll2 v1.8h, v1.16b, #0 ; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0 ; CHECK-NOI8MM-NEXT: smlal v0.4s, v4.4h, v3.4h -; CHECK-NOI8MM-NEXT: smull v5.4s, v2.4h, v1.4h +; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v4.8h, v3.8h +; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h ; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v1.8h -; CHECK-NOI8MM-NEXT: smlal2 v5.4s, v4.8h, v3.8h -; CHECK-NOI8MM-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NOI8MM-NEXT: ret ; ; CHECK-I8MM-LABEL: usdot: @@ -196,20 +192,19 @@ define <4 x i32> @usdot_in_loop(ptr %p1, ptr %p2){ ; CHECK-NOI8MM-NEXT: mov x8, xzr ; CHECK-NOI8MM-NEXT: .LBB6_1: // %vector.body ; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NOI8MM-NEXT: ldr q0, [x0, x8] -; CHECK-NOI8MM-NEXT: ldr q2, [x1, x8] +; CHECK-NOI8MM-NEXT: ldr q2, [x0, x8] +; CHECK-NOI8MM-NEXT: ldr q3, [x1, x8] +; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b ; CHECK-NOI8MM-NEXT: add x8, x8, #16 +; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0 +; CHECK-NOI8MM-NEXT: ushll v5.8h, v3.8b, #0 +; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NOI8MM-NEXT: ushll2 v3.8h, v3.16b, #0 ; CHECK-NOI8MM-NEXT: cmp x8, #16 -; CHECK-NOI8MM-NEXT: sshll v3.8h, v0.8b, #0 -; CHECK-NOI8MM-NEXT: sshll2 v4.8h, v0.16b, #0 -; CHECK-NOI8MM-NEXT: ushll v5.8h, v2.8b, #0 -; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0 -; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b -; CHECK-NOI8MM-NEXT: smlal v1.4s, v3.4h, v5.4h -; CHECK-NOI8MM-NEXT: smull v6.4s, v4.4h, v2.4h -; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v2.8h -; CHECK-NOI8MM-NEXT: smlal2 v6.4s, v3.8h, v5.8h -; CHECK-NOI8MM-NEXT: add v1.4s, v6.4s, v1.4s +; 
CHECK-NOI8MM-NEXT: smlal v1.4s, v4.4h, v5.4h +; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v5.8h +; CHECK-NOI8MM-NEXT: smlal v1.4s, v2.4h, v3.4h +; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v2.8h, v3.8h ; CHECK-NOI8MM-NEXT: b.ne .LBB6_1 ; CHECK-NOI8MM-NEXT: // %bb.2: // %end ; CHECK-NOI8MM-NEXT: ret @@ -258,15 +253,15 @@ define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ ; CHECK-NOI8MM-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NOI8MM-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NOI8MM-NEXT: smull v3.4s, v2.4h, v1.4h -; CHECK-NOI8MM-NEXT: smull2 v4.4s, v2.8h, v1.8h -; CHECK-NOI8MM-NEXT: ext v5.16b, v1.16b, v1.16b, #8 -; CHECK-NOI8MM-NEXT: ext v6.16b, v2.16b, v2.16b, #8 ; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h +; CHECK-NOI8MM-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-NOI8MM-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; CHECK-NOI8MM-NEXT: smull2 v1.4s, v2.8h, v1.8h ; CHECK-NOI8MM-NEXT: ext v3.16b, v3.16b, v3.16b, #8 -; CHECK-NOI8MM-NEXT: ext v1.16b, v4.16b, v4.16b, #8 -; CHECK-NOI8MM-NEXT: smlal v3.4s, v6.4h, v5.4h -; CHECK-NOI8MM-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NOI8MM-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECK-NOI8MM-NEXT: add v0.2s, v3.2s, v0.2s +; CHECK-NOI8MM-NEXT: smlal v0.4s, v5.4h, v4.4h +; CHECK-NOI8MM-NEXT: add v0.2s, v1.2s, v0.2s ; CHECK-NOI8MM-NEXT: ret ; ; CHECK-I8MM-LABEL: usdot_narrow: @@ -284,14 +279,13 @@ define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{ ; CHECK-NOI8MM-LABEL: sudot: ; CHECK-NOI8MM: // %bb.0: ; CHECK-NOI8MM-NEXT: sshll v3.8h, v1.8b, #0 -; CHECK-NOI8MM-NEXT: sshll2 v1.8h, v1.16b, #0 ; CHECK-NOI8MM-NEXT: ushll v4.8h, v2.8b, #0 +; CHECK-NOI8MM-NEXT: sshll2 v1.8h, v1.16b, #0 ; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-NOI8MM-NEXT: smlal v0.4s, v4.4h, v3.4h -; CHECK-NOI8MM-NEXT: smull v5.4s, v2.4h, v1.4h +; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v4.8h, v3.8h +; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h ; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v1.8h -; CHECK-NOI8MM-NEXT: smlal2 v5.4s, 
v4.8h, v3.8h -; CHECK-NOI8MM-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NOI8MM-NEXT: ret ; ; CHECK-I8MM-LABEL: sudot: @@ -312,20 +306,19 @@ define <4 x i32> @sudot_in_loop(ptr %p1, ptr %p2){ ; CHECK-NOI8MM-NEXT: mov x8, xzr ; CHECK-NOI8MM-NEXT: .LBB9_1: // %vector.body ; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NOI8MM-NEXT: ldr q0, [x0, x8] -; CHECK-NOI8MM-NEXT: ldr q2, [x1, x8] +; CHECK-NOI8MM-NEXT: ldr q2, [x0, x8] +; CHECK-NOI8MM-NEXT: ldr q3, [x1, x8] +; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b ; CHECK-NOI8MM-NEXT: add x8, x8, #16 +; CHECK-NOI8MM-NEXT: ushll v4.8h, v2.8b, #0 +; CHECK-NOI8MM-NEXT: sshll v5.8h, v3.8b, #0 +; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NOI8MM-NEXT: sshll2 v3.8h, v3.16b, #0 ; CHECK-NOI8MM-NEXT: cmp x8, #16 -; CHECK-NOI8MM-NEXT: ushll v3.8h, v0.8b, #0 -; CHECK-NOI8MM-NEXT: ushll2 v4.8h, v0.16b, #0 -; CHECK-NOI8MM-NEXT: sshll v5.8h, v2.8b, #0 -; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0 -; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b -; CHECK-NOI8MM-NEXT: smlal v1.4s, v3.4h, v5.4h -; CHECK-NOI8MM-NEXT: smull v6.4s, v4.4h, v2.4h -; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v2.8h -; CHECK-NOI8MM-NEXT: smlal2 v6.4s, v3.8h, v5.8h -; CHECK-NOI8MM-NEXT: add v1.4s, v6.4s, v1.4s +; CHECK-NOI8MM-NEXT: smlal v1.4s, v4.4h, v5.4h +; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v5.8h +; CHECK-NOI8MM-NEXT: smlal v1.4s, v2.4h, v3.4h +; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v2.8h, v3.8h ; CHECK-NOI8MM-NEXT: b.ne .LBB9_1 ; CHECK-NOI8MM-NEXT: // %bb.2: // %end ; CHECK-NOI8MM-NEXT: ret @@ -374,15 +367,15 @@ define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ ; CHECK-NOI8MM-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-NOI8MM-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NOI8MM-NEXT: smull v3.4s, v2.4h, v1.4h -; CHECK-NOI8MM-NEXT: smull2 v4.4s, v2.8h, v1.8h -; CHECK-NOI8MM-NEXT: ext v5.16b, v1.16b, v1.16b, #8 -; CHECK-NOI8MM-NEXT: ext v6.16b, v2.16b, v2.16b, #8 ; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h +; 
CHECK-NOI8MM-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-NOI8MM-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; CHECK-NOI8MM-NEXT: smull2 v1.4s, v2.8h, v1.8h ; CHECK-NOI8MM-NEXT: ext v3.16b, v3.16b, v3.16b, #8 -; CHECK-NOI8MM-NEXT: ext v1.16b, v4.16b, v4.16b, #8 -; CHECK-NOI8MM-NEXT: smlal v3.4s, v6.4h, v5.4h -; CHECK-NOI8MM-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NOI8MM-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECK-NOI8MM-NEXT: add v0.2s, v3.2s, v0.2s +; CHECK-NOI8MM-NEXT: smlal v0.4s, v5.4h, v4.4h +; CHECK-NOI8MM-NEXT: add v0.2s, v1.2s, v0.2s ; CHECK-NOI8MM-NEXT: ret ; ; CHECK-I8MM-LABEL: sudot_narrow: @@ -413,14 +406,14 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) { ; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0 ; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0 ; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v3.4s +; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v5.2s ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s -; CHECK-NODOT-NEXT: uaddl2 v3.2d, v4.4s, v5.4s -; CHECK-NODOT-NEXT: uaddl v4.2d, v4.2s, v5.2s +; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v5.4s +; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v3.4s +; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v2.2s +; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s -; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s -; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d -; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d +; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s ; CHECK-NODOT-NEXT: ret entry: %a.wide = zext <16 x i8> %a to <16 x i64> @@ -448,14 +441,14 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){ ; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0 ; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0 ; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v3.4s +; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v5.2s ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s -; CHECK-NODOT-NEXT: saddl2 v3.2d, v4.4s, v5.4s -; CHECK-NODOT-NEXT: 
saddl v4.2d, v4.2s, v5.2s +; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v5.4s +; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v3.4s +; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v2.2s +; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s -; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s -; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d -; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d +; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s ; CHECK-NODOT-NEXT: ret entry: %a.wide = sext <16 x i8> %a to <16 x i64> @@ -470,27 +463,25 @@ define <4 x i64> @usdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){ ; CHECK-NOI8MM-LABEL: usdot_8to64: ; CHECK-NOI8MM: // %bb.0: // %entry ; CHECK-NOI8MM-NEXT: ushll v4.8h, v2.8b, #0 -; CHECK-NOI8MM-NEXT: sshll v5.8h, v3.8b, #0 ; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NOI8MM-NEXT: sshll v5.8h, v3.8b, #0 ; CHECK-NOI8MM-NEXT: sshll2 v3.8h, v3.16b, #0 ; CHECK-NOI8MM-NEXT: ushll v6.4s, v4.4h, #0 -; CHECK-NOI8MM-NEXT: sshll v7.4s, v5.4h, #0 +; CHECK-NOI8MM-NEXT: ushll v7.4s, v2.4h, #0 +; CHECK-NOI8MM-NEXT: sshll v16.4s, v5.4h, #0 +; CHECK-NOI8MM-NEXT: sshll v17.4s, v3.4h, #0 ; CHECK-NOI8MM-NEXT: ushll2 v4.4s, v4.8h, #0 +; CHECK-NOI8MM-NEXT: ushll2 v2.4s, v2.8h, #0 ; CHECK-NOI8MM-NEXT: sshll2 v5.4s, v5.8h, #0 -; CHECK-NOI8MM-NEXT: ushll2 v16.4s, v2.8h, #0 -; CHECK-NOI8MM-NEXT: sshll2 v17.4s, v3.8h, #0 -; CHECK-NOI8MM-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NOI8MM-NEXT: sshll v3.4s, v3.4h, #0 -; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v6.4s, v7.4s -; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v7.2s -; CHECK-NOI8MM-NEXT: smull v18.2d, v4.2s, v5.2s -; CHECK-NOI8MM-NEXT: smull2 v4.2d, v4.4s, v5.4s -; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v16.4s, v17.4s -; CHECK-NOI8MM-NEXT: smlal v0.2d, v16.2s, v17.2s -; CHECK-NOI8MM-NEXT: smlal2 v4.2d, v2.4s, v3.4s -; CHECK-NOI8MM-NEXT: smlal v18.2d, v2.2s, v3.2s -; CHECK-NOI8MM-NEXT: add v1.2d, v4.2d, v1.2d -; CHECK-NOI8MM-NEXT: add v0.2d, v18.2d, v0.2d +; CHECK-NOI8MM-NEXT: sshll2 v3.4s, v3.8h, #0 +; 
CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v16.2s +; CHECK-NOI8MM-NEXT: smlal v1.2d, v7.2s, v17.2s +; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v16.4s +; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v7.4s, v17.4s +; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s +; CHECK-NOI8MM-NEXT: smlal v1.2d, v2.2s, v3.2s +; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s +; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v2.4s, v3.4s ; CHECK-NOI8MM-NEXT: ret ; ; CHECK-I8MM-LABEL: usdot_8to64: @@ -513,27 +504,25 @@ define <4 x i64> @sudot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) { ; CHECK-NOI8MM-LABEL: sudot_8to64: ; CHECK-NOI8MM: // %bb.0: // %entry ; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0 -; CHECK-NOI8MM-NEXT: ushll v5.8h, v3.8b, #0 ; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NOI8MM-NEXT: ushll v5.8h, v3.8b, #0 ; CHECK-NOI8MM-NEXT: ushll2 v3.8h, v3.16b, #0 ; CHECK-NOI8MM-NEXT: sshll v6.4s, v4.4h, #0 -; CHECK-NOI8MM-NEXT: ushll v7.4s, v5.4h, #0 +; CHECK-NOI8MM-NEXT: sshll v7.4s, v2.4h, #0 +; CHECK-NOI8MM-NEXT: ushll v16.4s, v5.4h, #0 +; CHECK-NOI8MM-NEXT: ushll v17.4s, v3.4h, #0 ; CHECK-NOI8MM-NEXT: sshll2 v4.4s, v4.8h, #0 +; CHECK-NOI8MM-NEXT: sshll2 v2.4s, v2.8h, #0 ; CHECK-NOI8MM-NEXT: ushll2 v5.4s, v5.8h, #0 -; CHECK-NOI8MM-NEXT: sshll2 v16.4s, v2.8h, #0 -; CHECK-NOI8MM-NEXT: ushll2 v17.4s, v3.8h, #0 -; CHECK-NOI8MM-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-NOI8MM-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v6.4s, v7.4s -; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v7.2s -; CHECK-NOI8MM-NEXT: smull v18.2d, v4.2s, v5.2s -; CHECK-NOI8MM-NEXT: smull2 v4.2d, v4.4s, v5.4s -; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v16.4s, v17.4s -; CHECK-NOI8MM-NEXT: smlal v0.2d, v16.2s, v17.2s -; CHECK-NOI8MM-NEXT: smlal2 v4.2d, v2.4s, v3.4s -; CHECK-NOI8MM-NEXT: smlal v18.2d, v2.2s, v3.2s -; CHECK-NOI8MM-NEXT: add v1.2d, v4.2d, v1.2d -; CHECK-NOI8MM-NEXT: add v0.2d, v18.2d, v0.2d +; CHECK-NOI8MM-NEXT: ushll2 v3.4s, v3.8h, #0 +; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v16.2s +; CHECK-NOI8MM-NEXT: smlal 
v1.2d, v7.2s, v17.2s +; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v16.4s +; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v7.4s, v17.4s +; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s +; CHECK-NOI8MM-NEXT: smlal v1.2d, v2.2s, v3.2s +; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s +; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v2.4s, v3.4s ; CHECK-NOI8MM-NEXT: ret ; ; CHECK-I8MM-LABEL: sudot_8to64: @@ -563,11 +552,10 @@ define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){ ; CHECK-NODOT: // %bb.0: ; CHECK-NODOT-NEXT: ushll v2.8h, v1.8b, #0 ; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0 -; CHECK-NODOT-NEXT: ushll v3.4s, v1.4h, #0 ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v2.4h -; CHECK-NODOT-NEXT: uaddw2 v2.4s, v3.4s, v2.8h +; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v2.8h +; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h -; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s ; CHECK-NODOT-NEXT: ret %a.wide = zext <16 x i8> %a to <16 x i32> %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide) @@ -597,17 +585,16 @@ define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){ ; CHECK-NODOT-NEXT: mov x8, xzr ; CHECK-NODOT-NEXT: .LBB16_1: // %vector.body ; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NODOT-NEXT: ldr q0, [x0, x8] +; CHECK-NODOT-NEXT: ldr q2, [x0, x8] +; CHECK-NODOT-NEXT: mov v0.16b, v1.16b ; CHECK-NODOT-NEXT: add x8, x8, #16 ; CHECK-NODOT-NEXT: cmp x8, #16 -; CHECK-NODOT-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-NODOT-NEXT: ushll2 v3.8h, v0.16b, #0 -; CHECK-NODOT-NEXT: mov v0.16b, v1.16b -; CHECK-NODOT-NEXT: ushll v1.4s, v3.4h, #0 -; CHECK-NODOT-NEXT: uaddw v4.4s, v0.4s, v2.4h +; CHECK-NODOT-NEXT: ushll v3.8h, v2.8b, #0 +; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v3.4h +; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v3.8h +; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v2.4h ; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v2.8h -; 
CHECK-NODOT-NEXT: uaddw2 v2.4s, v4.4s, v3.8h -; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s ; CHECK-NODOT-NEXT: b.ne .LBB16_1 ; CHECK-NODOT-NEXT: // %bb.2: // %end ; CHECK-NODOT-NEXT: ret @@ -641,11 +628,10 @@ define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){ ; CHECK-NODOT: // %bb.0: ; CHECK-NODOT-NEXT: sshll v2.8h, v1.8b, #0 ; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0 -; CHECK-NODOT-NEXT: sshll v3.4s, v1.4h, #0 ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v2.4h -; CHECK-NODOT-NEXT: saddw2 v2.4s, v3.4s, v2.8h +; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v2.8h +; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h -; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s ; CHECK-NODOT-NEXT: ret %a.wide = sext <16 x i8> %a to <16 x i32> %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide) @@ -664,14 +650,14 @@ define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){ ; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0 -; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 +; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0 +; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8 -; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s -; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h -; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s +; CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8 +; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s ; CHECK-NODOT-NEXT: ret %a.wide = zext <8 x i8> %a to <8 x i32> %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> 
%a.wide) @@ -690,14 +676,14 @@ define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){ ; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0 -; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0 -; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h -; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 +; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0 +; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8 -; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s -; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h -; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s +; CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8 +; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h +; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s ; CHECK-NODOT-NEXT: ret %a.wide = sext <8 x i8> %a to <8 x i32> %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide) @@ -722,14 +708,14 @@ define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ ; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0 ; CHECK-NODOT-NEXT: ushll2 v3.4s, v3.8h, #0 ; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v4.4s +; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v5.2s ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s -; CHECK-NODOT-NEXT: uaddl2 v4.2d, v3.4s, v5.4s -; CHECK-NODOT-NEXT: uaddl v3.2d, v3.2s, v5.2s +; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v5.4s +; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s +; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v2.2s +; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s -; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s -; CHECK-NODOT-NEXT: add v1.2d, v4.2d, v1.2d -; CHECK-NODOT-NEXT: add v0.2d, v3.2d, v0.2d +; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v3.4s ; 
CHECK-NODOT-NEXT: ret %a.wide = zext <16 x i8> %a to <16 x i64> %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide) @@ -754,14 +740,14 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ ; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0 ; CHECK-NODOT-NEXT: sshll2 v3.4s, v3.8h, #0 ; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s +; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v5.2s ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-NODOT-NEXT: saddl2 v4.2d, v3.4s, v5.4s -; CHECK-NODOT-NEXT: saddl v3.2d, v3.2s, v5.2s +; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v5.4s +; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s +; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v2.2s +; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s -; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s -; CHECK-NODOT-NEXT: add v1.2d, v4.2d, v1.2d -; CHECK-NODOT-NEXT: add v0.2d, v3.2d, v0.2d +; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v3.4s ; CHECK-NODOT-NEXT: ret %a.wide = sext <16 x i8> %a to <16 x i64> %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide) @@ -808,11 +794,10 @@ define <2 x i64> @udot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: ushll v4.4s, v2.4h, #0 ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: umull v5.2d, v1.2s, v2.2s ; CHECK-NEXT: umlal v0.2d, v3.2s, v4.2s +; CHECK-NEXT: umlal2 v0.2d, v3.4s, v4.4s +; CHECK-NEXT: umlal v0.2d, v1.2s, v2.2s ; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.4s -; CHECK-NEXT: umlal2 v5.2d, v3.4s, v4.4s -; CHECK-NEXT: add v0.2d, v5.2d, v0.2d ; CHECK-NEXT: ret entry: %a.wide = zext <8 x i16> %a to <8 x i64> @@ -830,11 +815,10 @@ define <2 x i64> @sdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b ; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: 
sshll v4.4s, v2.4h, #0 ; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: smull v5.2d, v1.2s, v2.2s ; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s +; CHECK-NEXT: smlal2 v0.2d, v3.4s, v4.4s +; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s ; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s -; CHECK-NEXT: smlal2 v5.2d, v3.4s, v4.4s -; CHECK-NEXT: add v0.2d, v5.2d, v0.2d ; CHECK-NEXT: ret entry: %a.wide = sext <8 x i16> %a to <8 x i64> @@ -852,11 +836,10 @@ define <2 x i64> @usdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> % ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: sshll v4.4s, v2.4h, #0 ; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: smull v5.2d, v1.2s, v2.2s ; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s +; CHECK-NEXT: smlal2 v0.2d, v3.4s, v4.4s +; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s ; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s -; CHECK-NEXT: smlal2 v5.2d, v3.4s, v4.4s -; CHECK-NEXT: add v0.2d, v5.2d, v0.2d ; CHECK-NEXT: ret entry: %a.wide = zext <8 x i16> %a to <8 x i64> @@ -874,11 +857,10 @@ define <2 x i64> @sudot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> % ; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: ushll v4.4s, v2.4h, #0 ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: smull v5.2d, v1.2s, v2.2s ; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s +; CHECK-NEXT: smlal2 v0.2d, v3.4s, v4.4s +; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s ; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s -; CHECK-NEXT: smlal2 v5.2d, v3.4s, v4.4s -; CHECK-NEXT: add v0.2d, v5.2d, v0.2d ; CHECK-NEXT: ret entry: %a.wide = sext <8 x i16> %a to <8 x i64> @@ -897,26 +879,24 @@ define <4 x i32> @usdot_multiple_zext_users(ptr %p1, ptr %p2, ptr %p3) { ; CHECK-NOI8MM-NEXT: .LBB28_1: // %vector.body ; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NOI8MM-NEXT: ldr q2, [x0, x8] -; CHECK-NOI8MM-NEXT: ldr q3, [x2, x8] -; CHECK-NOI8MM-NEXT: ldr q4, [x1, x8] +; CHECK-NOI8MM-NEXT: ldr q3, [x1, x8] +; CHECK-NOI8MM-NEXT: ldr q4, [x2, x8] ; CHECK-NOI8MM-NEXT: add x8, x8, #16 ; 
CHECK-NOI8MM-NEXT: sshll v5.8h, v2.8b, #0 +; CHECK-NOI8MM-NEXT: ushll v6.8h, v4.8b, #0 +; CHECK-NOI8MM-NEXT: sshll v7.8h, v3.8b, #0 ; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0 -; CHECK-NOI8MM-NEXT: ushll2 v6.8h, v3.16b, #0 -; CHECK-NOI8MM-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NOI8MM-NEXT: sshll v7.8h, v4.8b, #0 -; CHECK-NOI8MM-NEXT: sshll2 v4.8h, v4.16b, #0 +; CHECK-NOI8MM-NEXT: ushll2 v4.8h, v4.16b, #0 +; CHECK-NOI8MM-NEXT: sshll2 v3.8h, v3.16b, #0 ; CHECK-NOI8MM-NEXT: cmp x8, #1024 -; CHECK-NOI8MM-NEXT: smull v16.4s, v2.4h, v6.4h -; CHECK-NOI8MM-NEXT: smlal v0.4s, v5.4h, v3.4h -; CHECK-NOI8MM-NEXT: smull v17.4s, v4.4h, v6.4h -; CHECK-NOI8MM-NEXT: smlal v1.4s, v7.4h, v3.4h -; CHECK-NOI8MM-NEXT: smlal2 v16.4s, v5.8h, v3.8h -; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v6.8h -; CHECK-NOI8MM-NEXT: smlal2 v17.4s, v7.8h, v3.8h -; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v6.8h -; CHECK-NOI8MM-NEXT: add v0.4s, v16.4s, v0.4s -; CHECK-NOI8MM-NEXT: add v1.4s, v17.4s, v1.4s +; CHECK-NOI8MM-NEXT: smlal v0.4s, v5.4h, v6.4h +; CHECK-NOI8MM-NEXT: smlal v1.4s, v7.4h, v6.4h +; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v5.8h, v6.8h +; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v7.8h, v6.8h +; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v4.4h +; CHECK-NOI8MM-NEXT: smlal v1.4s, v3.4h, v4.4h +; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v4.8h +; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v3.8h, v4.8h ; CHECK-NOI8MM-NEXT: b.ne .LBB28_1 ; CHECK-NOI8MM-NEXT: // %bb.2: // %end ; CHECK-NOI8MM-NEXT: add v0.4s, v1.4s, v0.4s diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll index ae681ee54e687..c3828c3d695c4 100644 --- a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll +++ b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll @@ -18,7 +18,7 @@ define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32 ; CHECK-LABEL: partial_reduce_add_fixed_half: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: add v0.4s, 
v2.4s, v0.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0) @@ -39,7 +39,7 @@ define @partial_reduce_add_half( %accumulat ; CHECK-LABEL: partial_reduce_add_half: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: add z0.s, z0.s, z1.s -; CHECK-NEXT: add z0.s, z2.s, z0.s +; CHECK-NEXT: add z0.s, z0.s, z2.s ; CHECK-NEXT: ret entry: %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( %accumulator, %0) @@ -49,10 +49,10 @@ entry: define @partial_reduce_add_quart( %accumulator, %0) #0 { ; CHECK-LABEL: partial_reduce_add_quart: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: add z2.s, z2.s, z3.s -; CHECK-NEXT: add z0.s, z4.s, z0.s -; CHECK-NEXT: add z0.s, z2.s, z0.s +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: add z0.s, z0.s, z4.s ; CHECK-NEXT: ret entry: %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %accumulator, %0) @@ -63,9 +63,9 @@ define @partial_reduce_add_half_8( %accumul ; CHECK-LABEL: partial_reduce_add_half_8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: add z0.s, z0.s, z2.s -; CHECK-NEXT: add z1.s, z1.s, z3.s -; CHECK-NEXT: add z0.s, z4.s, z0.s -; CHECK-NEXT: add z1.s, z5.s, z1.s +; CHECK-NEXT: add z1.s, z1.s, z4.s +; CHECK-NEXT: add z0.s, z0.s, z3.s +; CHECK-NEXT: add z1.s, z1.s, z5.s ; CHECK-NEXT: ret entry: %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32( %accumulator, %0) diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll index e36c56b7487ee..1b754fc3d320e 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll @@ -85,46 +85,42 @@ define @usdot( %acc, 
%a, ; CHECK-NOI8MM: // %bb.0: // %entry ; CHECK-NOI8MM-NEXT: uunpklo z3.h, z1.b ; CHECK-NOI8MM-NEXT: sunpklo z4.h, z2.b +; CHECK-NOI8MM-NEXT: ptrue p0.s ; CHECK-NOI8MM-NEXT: uunpkhi z1.h, z1.b ; CHECK-NOI8MM-NEXT: sunpkhi z2.h, z2.b -; CHECK-NOI8MM-NEXT: ptrue p0.s ; CHECK-NOI8MM-NEXT: uunpklo z5.s, z3.h -; CHECK-NOI8MM-NEXT: uunpkhi z3.s, z3.h ; CHECK-NOI8MM-NEXT: sunpklo z6.s, z4.h +; CHECK-NOI8MM-NEXT: uunpkhi z3.s, z3.h ; CHECK-NOI8MM-NEXT: sunpkhi z4.s, z4.h -; CHECK-NOI8MM-NEXT: uunpklo z7.s, z1.h +; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-NOI8MM-NEXT: uunpklo z5.s, z1.h +; CHECK-NOI8MM-NEXT: sunpklo z6.s, z2.h ; CHECK-NOI8MM-NEXT: uunpkhi z1.s, z1.h -; CHECK-NOI8MM-NEXT: sunpklo z24.s, z2.h ; CHECK-NOI8MM-NEXT: sunpkhi z2.s, z2.h +; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z3.s, z4.s ; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z5.s, z6.s -; CHECK-NOI8MM-NEXT: mul z3.s, z3.s, z4.s ; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z1.s, z2.s -; CHECK-NOI8MM-NEXT: movprfx z1, z3 -; CHECK-NOI8MM-NEXT: mla z1.s, p0/m, z7.s, z24.s -; CHECK-NOI8MM-NEXT: add z0.s, z1.s, z0.s ; CHECK-NOI8MM-NEXT: ret ; ; CHECK-NEWLOWERING-LABEL: usdot: ; CHECK-NEWLOWERING: // %bb.0: // %entry ; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z1.b ; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.s ; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b ; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b -; CHECK-NEWLOWERING-NEXT: ptrue p0.s ; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h -; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h ; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h ; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h -; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z1.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z2.h ; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h ; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; 
CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z3.s, z4.s ; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s -; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s ; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s -; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 -; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s -; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s ; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext %a to @@ -144,46 +140,42 @@ define @sudot( %acc, %a, ; CHECK-NOI8MM: // %bb.0: // %entry ; CHECK-NOI8MM-NEXT: sunpklo z3.h, z1.b ; CHECK-NOI8MM-NEXT: uunpklo z4.h, z2.b +; CHECK-NOI8MM-NEXT: ptrue p0.s ; CHECK-NOI8MM-NEXT: sunpkhi z1.h, z1.b ; CHECK-NOI8MM-NEXT: uunpkhi z2.h, z2.b -; CHECK-NOI8MM-NEXT: ptrue p0.s ; CHECK-NOI8MM-NEXT: sunpklo z5.s, z3.h -; CHECK-NOI8MM-NEXT: sunpkhi z3.s, z3.h ; CHECK-NOI8MM-NEXT: uunpklo z6.s, z4.h +; CHECK-NOI8MM-NEXT: sunpkhi z3.s, z3.h ; CHECK-NOI8MM-NEXT: uunpkhi z4.s, z4.h -; CHECK-NOI8MM-NEXT: sunpklo z7.s, z1.h +; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-NOI8MM-NEXT: sunpklo z5.s, z1.h +; CHECK-NOI8MM-NEXT: uunpklo z6.s, z2.h ; CHECK-NOI8MM-NEXT: sunpkhi z1.s, z1.h -; CHECK-NOI8MM-NEXT: uunpklo z24.s, z2.h ; CHECK-NOI8MM-NEXT: uunpkhi z2.s, z2.h +; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z3.s, z4.s ; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z5.s, z6.s -; CHECK-NOI8MM-NEXT: mul z3.s, z3.s, z4.s ; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z1.s, z2.s -; CHECK-NOI8MM-NEXT: movprfx z1, z3 -; CHECK-NOI8MM-NEXT: mla z1.s, p0/m, z7.s, z24.s -; CHECK-NOI8MM-NEXT: add z0.s, z1.s, z0.s ; CHECK-NOI8MM-NEXT: ret ; ; CHECK-NEWLOWERING-LABEL: sudot: ; CHECK-NEWLOWERING: // %bb.0: // %entry ; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z1.b ; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.s ; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b -; CHECK-NEWLOWERING-NEXT: ptrue p0.s ; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h -; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h ; 
CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h ; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h -; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z1.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z2.h ; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h -; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z3.s, z4.s ; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s -; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s ; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s -; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 -; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s -; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s ; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext %a to @@ -206,18 +198,10 @@ define @udot_8to64( %acc, @udot_8to64( %acc, %a to @@ -282,18 +258,10 @@ define @sdot_8to64( %acc, @sdot_8to64( %acc, %a to @@ -358,75 +318,51 @@ define @usdot_8to64( %acc, @usdot_8to64( %acc, %a to @@ -491,75 +419,51 @@ define @sudot_8to64( %acc, @sudot_8to64( %acc, %a to @@ -620,16 +516,16 @@ define @udot_no_bin_op( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %a.ext) @@ -645,16 +541,16 @@ define @sdot_no_bin_op( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %a.ext) @@ -670,16 +566,16 @@ define @udot_no_bin_op_wide( %acc, %a to @@ -696,16 +592,16 @@ define @sdot_no_bin_op_wide( %acc, %a to @@ -729,26 +625,26 @@ define @udot_no_bin_op_8to64( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %a.ext) @@ -771,26 +667,26 @@ define @sdot_no_bin_op_8to64( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %a.ext) @@ -870,46 +766,42 @@ define 
@not_usdot( %acc, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: uunpklo z3.s, z1.h ; CHECK-NEXT: sunpklo z4.s, z2.h +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpkhi z1.s, z1.h ; CHECK-NEXT: sunpkhi z2.s, z2.h -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z5.d, z3.s -; CHECK-NEXT: uunpkhi z3.d, z3.s ; CHECK-NEXT: sunpklo z6.d, z4.s +; CHECK-NEXT: uunpkhi z3.d, z3.s ; CHECK-NEXT: sunpkhi z4.d, z4.s -; CHECK-NEXT: uunpklo z7.d, z1.s +; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEXT: uunpklo z5.d, z1.s +; CHECK-NEXT: sunpklo z6.d, z2.s ; CHECK-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEXT: sunpklo z24.d, z2.s ; CHECK-NEXT: sunpkhi z2.d, z2.s +; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d ; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEXT: mul z3.d, z3.d, z4.d ; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEXT: movprfx z1, z3 -; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d -; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: ret ; ; CHECK-NEWLOWERING-LABEL: not_usdot: ; CHECK-NEWLOWERING: // %bb.0: // %entry ; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h ; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d ; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h ; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h -; CHECK-NEWLOWERING-NEXT: ptrue p0.d ; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s ; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s ; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s -; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z2.s ; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s ; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d ; 
CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 -; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d -; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext %a to @@ -924,46 +816,42 @@ define @not_sudot( %acc, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sunpklo z3.s, z1.h ; CHECK-NEXT: uunpklo z4.s, z2.h +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sunpkhi z1.s, z1.h ; CHECK-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sunpklo z5.d, z3.s -; CHECK-NEXT: sunpkhi z3.d, z3.s ; CHECK-NEXT: uunpklo z6.d, z4.s +; CHECK-NEXT: sunpkhi z3.d, z3.s ; CHECK-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEXT: sunpklo z7.d, z1.s +; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEXT: sunpklo z5.d, z1.s +; CHECK-NEXT: uunpklo z6.d, z2.s ; CHECK-NEXT: sunpkhi z1.d, z1.s -; CHECK-NEXT: uunpklo z24.d, z2.s ; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d ; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEXT: mul z3.d, z3.d, z4.d ; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEXT: movprfx z1, z3 -; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d -; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: ret ; ; CHECK-NEWLOWERING-LABEL: not_sudot: ; CHECK-NEWLOWERING: // %bb.0: // %entry ; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h ; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d ; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEWLOWERING-NEXT: ptrue p0.d ; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s ; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s ; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z2.s ; CHECK-NEWLOWERING-NEXT: 
sunpkhi z1.d, z1.s -; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z2.s ; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d ; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 -; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d -; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext %a to @@ -978,48 +866,44 @@ define @udot_different_types( %acc, %a to @@ -1039,20 +923,18 @@ define @sdot_different_types( %acc, @sdot_different_types( %acc, %a to @@ -1097,20 +977,18 @@ define @usdot_different_types( %acc, @usdot_different_types( %acc, %a to @@ -1150,48 +1026,44 @@ define @sudot_different_types( %acc, %a to diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll index 602aa9df33b08..5148d3da6c737 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll @@ -15,7 +15,7 @@ define @signed_wide_add_nxv4i32( %acc, @signed_wide_add_nxv4i32( %acc, %input to @@ -43,7 +43,7 @@ define @unsigned_wide_add_nxv4i32( %acc, @unsigned_wide_add_nxv4i32( %acc, %input to @@ -71,7 +71,7 @@ define @signed_wide_add_nxv8i16( %acc, @signed_wide_add_nxv8i16( %acc, %input to @@ -99,7 +99,7 @@ define @unsigned_wide_add_nxv8i16( %acc, @unsigned_wide_add_nxv8i16( %acc, %input to @@ -127,7 +127,7 @@ define @signed_wide_add_nxv16i8( %acc, @signed_wide_add_nxv16i8( %acc, %input to @@ -155,7 +155,7 @@ define @unsigned_wide_add_nxv16i8( %acc, @unsigned_wide_add_nxv16i8( %acc, %input to @@ -203,41 +203,17 @@ entry: } define @signed_wide_add_nxv8i32( %acc, %input){ -; CHECK-SVE2-LABEL: signed_wide_add_nxv8i32: -; CHECK-SVE2: // %bb.0: // %entry -; CHECK-SVE2-NEXT: sunpkhi z4.d, z2.s -; CHECK-SVE2-NEXT: sunpklo z2.d, z2.s -; 
CHECK-SVE2-NEXT: sunpkhi z5.d, z3.s -; CHECK-SVE2-NEXT: sunpklo z3.d, z3.s -; CHECK-SVE2-NEXT: add z0.d, z0.d, z2.d -; CHECK-SVE2-NEXT: add z1.d, z1.d, z4.d -; CHECK-SVE2-NEXT: add z0.d, z3.d, z0.d -; CHECK-SVE2-NEXT: add z1.d, z5.d, z1.d -; CHECK-SVE2-NEXT: ret -; -; CHECK-SVE-LABEL: signed_wide_add_nxv8i32: -; CHECK-SVE: // %bb.0: // %entry -; CHECK-SVE-NEXT: sunpkhi z4.d, z2.s -; CHECK-SVE-NEXT: sunpklo z2.d, z2.s -; CHECK-SVE-NEXT: sunpkhi z5.d, z3.s -; CHECK-SVE-NEXT: sunpklo z3.d, z3.s -; CHECK-SVE-NEXT: add z0.d, z0.d, z2.d -; CHECK-SVE-NEXT: add z1.d, z1.d, z4.d -; CHECK-SVE-NEXT: add z0.d, z3.d, z0.d -; CHECK-SVE-NEXT: add z1.d, z5.d, z1.d -; CHECK-SVE-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: signed_wide_add_nxv8i32: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: sunpklo z4.d, z3.s -; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z2.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s -; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d -; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d -; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d -; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-LABEL: signed_wide_add_nxv8i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sunpklo z4.d, z3.s +; CHECK-NEXT: sunpklo z5.d, z2.s +; CHECK-NEXT: sunpkhi z3.d, z3.s +; CHECK-NEXT: sunpkhi z2.d, z2.s +; CHECK-NEXT: add z0.d, z0.d, z5.d +; CHECK-NEXT: add z1.d, z1.d, z4.d +; CHECK-NEXT: add z0.d, z0.d, z2.d +; CHECK-NEXT: add z1.d, z1.d, z3.d +; CHECK-NEXT: ret entry: %input.wide = sext %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64( %acc, %input.wide) @@ -245,41 +221,17 @@ entry: } define @unsigned_wide_add_nxv8i32( %acc, %input){ -; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i32: -; CHECK-SVE2: // %bb.0: // %entry -; CHECK-SVE2-NEXT: uunpkhi z4.d, z2.s -; CHECK-SVE2-NEXT: uunpklo z2.d, z2.s -; CHECK-SVE2-NEXT: uunpkhi z5.d, z3.s -; 
CHECK-SVE2-NEXT: uunpklo z3.d, z3.s -; CHECK-SVE2-NEXT: add z0.d, z0.d, z2.d -; CHECK-SVE2-NEXT: add z1.d, z1.d, z4.d -; CHECK-SVE2-NEXT: add z0.d, z3.d, z0.d -; CHECK-SVE2-NEXT: add z1.d, z5.d, z1.d -; CHECK-SVE2-NEXT: ret -; -; CHECK-SVE-LABEL: unsigned_wide_add_nxv8i32: -; CHECK-SVE: // %bb.0: // %entry -; CHECK-SVE-NEXT: uunpkhi z4.d, z2.s -; CHECK-SVE-NEXT: uunpklo z2.d, z2.s -; CHECK-SVE-NEXT: uunpkhi z5.d, z3.s -; CHECK-SVE-NEXT: uunpklo z3.d, z3.s -; CHECK-SVE-NEXT: add z0.d, z0.d, z2.d -; CHECK-SVE-NEXT: add z1.d, z1.d, z4.d -; CHECK-SVE-NEXT: add z0.d, z3.d, z0.d -; CHECK-SVE-NEXT: add z1.d, z5.d, z1.d -; CHECK-SVE-NEXT: ret -; -; CHECK-NEWLOWERING-LABEL: unsigned_wide_add_nxv8i32: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z3.s -; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z2.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d -; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d -; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d -; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-LABEL: unsigned_wide_add_nxv8i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uunpklo z4.d, z3.s +; CHECK-NEXT: uunpklo z5.d, z2.s +; CHECK-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: add z0.d, z0.d, z5.d +; CHECK-NEXT: add z1.d, z1.d, z4.d +; CHECK-NEXT: add z0.d, z0.d, z2.d +; CHECK-NEXT: add z1.d, z1.d, z3.d +; CHECK-NEXT: ret entry: %input.wide = zext %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64( %acc, %input.wide) From 60b4ec6a1bbbab975d5473c70fd70cc022a7b6c2 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Wed, 30 Apr 2025 14:51:39 +0100 Subject: [PATCH 07/12] Split only the operands if the accumulator doesn't need it. 
--- .../SelectionDAG/LegalizeVectorTypes.cpp | 24 +- .../neon-partial-reduce-dot-product.ll | 100 ++--- .../CodeGen/AArch64/partial-reduction-add.ll | 4 +- .../AArch64/sve-partial-reduce-dot-product.ll | 400 +++++++++--------- .../AArch64/sve-partial-reduce-wide-add.ll | 20 +- 5 files changed, 282 insertions(+), 266 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 7d690ea2205d8..884a428ea9a60 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3222,12 +3222,24 @@ void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, SDLoc DL(N); SDValue Acc = N->getOperand(0); SDValue Input1 = N->getOperand(1); + SDValue Input2 = N->getOperand(2); - SDValue AccLo, AccHi, Input1Lo, Input1Hi, Input2Lo, Input2Hi; + SDValue AccLo, AccHi; std::tie(AccLo, AccHi) = DAG.SplitVector(Acc, DL); - std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(Input1, DL); - std::tie(Input2Lo, Input2Hi) = DAG.SplitVector(N->getOperand(2), DL); unsigned Opcode = N->getOpcode(); + + // If the input types still need splitting, just accumulate into the + // low part of the accumulator. 
+ if (getTypeAction(Input1.getValueType()) == TargetLowering::TypeSplitVector) { + Lo = DAG.getNode(Opcode, DL, AccLo.getValueType(), AccLo, Input1, Input2); + Hi = AccHi; + return; + } + + SDValue Input1Lo, Input1Hi; + SDValue Input2Lo, Input2Hi; + std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(Input1, DL); + std::tie(Input2Lo, Input2Hi) = DAG.SplitVector(Input2, DL); EVT ResultVT = AccLo.getValueType(); Lo = DAG.getNode(Opcode, DL, ResultVT, AccLo, Input1Lo, Input2Lo); @@ -4512,9 +4524,13 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM(SDNode *N) { SDValue DAGTypeLegalizer::SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N) { - SDLoc DL(N); SDValue Acc = N->getOperand(0); + assert(getTypeAction(Acc.getValueType()) != TargetLowering::TypeSplitVector && + "Accumulator should already be a legal type, and shouldn't need " + "further splitting"); + SDValue Input1 = N->getOperand(1); + SDLoc DL(N); SDValue Input1Lo, Input1Hi, Input2Lo, Input2Hi; std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(Input1, DL); diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll index ab9813aa796e3..06e9bc901ab36 100644 --- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll @@ -402,18 +402,18 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) { ; CHECK-NODOT: // %bb.0: // %entry ; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b ; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b -; CHECK-NODOT-NEXT: ushll v3.4s, v4.4h, #0 -; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0 +; CHECK-NODOT-NEXT: ushll v5.4s, v4.4h, #0 ; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0 +; CHECK-NODOT-NEXT: ushll v3.4s, v2.4h, #0 ; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v5.2s -; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s -; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v5.4s -; CHECK-NODOT-NEXT: uaddw2 v0.2d, 
v0.2d, v3.4s -; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v2.2s +; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v5.2s +; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v5.4s ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s -; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s ; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s +; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s +; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v3.4s +; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s +; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v2.4s ; CHECK-NODOT-NEXT: ret entry: %a.wide = zext <16 x i8> %a to <16 x i64> @@ -437,18 +437,18 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){ ; CHECK-NODOT: // %bb.0: // %entry ; CHECK-NODOT-NEXT: smull v4.8h, v2.8b, v3.8b ; CHECK-NODOT-NEXT: smull2 v2.8h, v2.16b, v3.16b -; CHECK-NODOT-NEXT: sshll v3.4s, v4.4h, #0 -; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0 +; CHECK-NODOT-NEXT: sshll v5.4s, v4.4h, #0 ; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0 +; CHECK-NODOT-NEXT: sshll v3.4s, v2.4h, #0 ; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v5.2s -; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s -; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v5.4s -; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v3.4s -; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v2.2s +; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v5.2s +; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v5.4s ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s ; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s +; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s +; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v3.4s +; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s +; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v2.4s ; CHECK-NODOT-NEXT: ret entry: %a.wide = sext <16 x i8> %a to <16 x i64> @@ -463,25 +463,25 @@ define <4 x i64> @usdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){ ; CHECK-NOI8MM-LABEL: usdot_8to64: ; CHECK-NOI8MM: // %bb.0: // %entry ; CHECK-NOI8MM-NEXT: ushll v4.8h, v2.8b, #0 -; 
CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-NOI8MM-NEXT: sshll v5.8h, v3.8b, #0 +; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-NOI8MM-NEXT: sshll2 v3.8h, v3.16b, #0 ; CHECK-NOI8MM-NEXT: ushll v6.4s, v4.4h, #0 -; CHECK-NOI8MM-NEXT: ushll v7.4s, v2.4h, #0 -; CHECK-NOI8MM-NEXT: sshll v16.4s, v5.4h, #0 -; CHECK-NOI8MM-NEXT: sshll v17.4s, v3.4h, #0 +; CHECK-NOI8MM-NEXT: sshll v7.4s, v5.4h, #0 ; CHECK-NOI8MM-NEXT: ushll2 v4.4s, v4.8h, #0 -; CHECK-NOI8MM-NEXT: ushll2 v2.4s, v2.8h, #0 ; CHECK-NOI8MM-NEXT: sshll2 v5.4s, v5.8h, #0 +; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v7.2s +; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v7.4s +; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s +; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s +; CHECK-NOI8MM-NEXT: ushll v4.4s, v2.4h, #0 +; CHECK-NOI8MM-NEXT: sshll v5.4s, v3.4h, #0 +; CHECK-NOI8MM-NEXT: ushll2 v2.4s, v2.8h, #0 ; CHECK-NOI8MM-NEXT: sshll2 v3.4s, v3.8h, #0 -; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v16.2s -; CHECK-NOI8MM-NEXT: smlal v1.2d, v7.2s, v17.2s -; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v16.4s -; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v7.4s, v17.4s ; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s -; CHECK-NOI8MM-NEXT: smlal v1.2d, v2.2s, v3.2s ; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s -; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v2.4s, v3.4s +; CHECK-NOI8MM-NEXT: smlal v0.2d, v2.2s, v3.2s +; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v2.4s, v3.4s ; CHECK-NOI8MM-NEXT: ret ; ; CHECK-I8MM-LABEL: usdot_8to64: @@ -504,25 +504,25 @@ define <4 x i64> @sudot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) { ; CHECK-NOI8MM-LABEL: sudot_8to64: ; CHECK-NOI8MM: // %bb.0: // %entry ; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0 -; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0 ; CHECK-NOI8MM-NEXT: ushll v5.8h, v3.8b, #0 +; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0 ; CHECK-NOI8MM-NEXT: ushll2 v3.8h, v3.16b, #0 ; CHECK-NOI8MM-NEXT: sshll v6.4s, v4.4h, #0 -; CHECK-NOI8MM-NEXT: sshll v7.4s, v2.4h, #0 -; CHECK-NOI8MM-NEXT: ushll v16.4s, v5.4h, #0 -; 
CHECK-NOI8MM-NEXT: ushll v17.4s, v3.4h, #0 +; CHECK-NOI8MM-NEXT: ushll v7.4s, v5.4h, #0 ; CHECK-NOI8MM-NEXT: sshll2 v4.4s, v4.8h, #0 -; CHECK-NOI8MM-NEXT: sshll2 v2.4s, v2.8h, #0 ; CHECK-NOI8MM-NEXT: ushll2 v5.4s, v5.8h, #0 +; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v7.2s +; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v7.4s +; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s +; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s +; CHECK-NOI8MM-NEXT: sshll v4.4s, v2.4h, #0 +; CHECK-NOI8MM-NEXT: ushll v5.4s, v3.4h, #0 +; CHECK-NOI8MM-NEXT: sshll2 v2.4s, v2.8h, #0 ; CHECK-NOI8MM-NEXT: ushll2 v3.4s, v3.8h, #0 -; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v16.2s -; CHECK-NOI8MM-NEXT: smlal v1.2d, v7.2s, v17.2s -; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v16.4s -; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v7.4s, v17.4s ; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s -; CHECK-NOI8MM-NEXT: smlal v1.2d, v2.2s, v3.2s ; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s -; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v2.4s, v3.4s +; CHECK-NOI8MM-NEXT: smlal v0.2d, v2.2s, v3.2s +; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v2.4s, v3.4s ; CHECK-NOI8MM-NEXT: ret ; ; CHECK-I8MM-LABEL: sudot_8to64: @@ -705,17 +705,17 @@ define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ ; CHECK-NODOT-NEXT: ushll v3.8h, v2.8b, #0 ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-NODOT-NEXT: ushll v4.4s, v3.4h, #0 -; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0 ; CHECK-NODOT-NEXT: ushll2 v3.4s, v3.8h, #0 -; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v5.2s ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s -; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v5.4s ; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s -; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v2.2s +; CHECK-NODOT-NEXT: ushll v4.4s, v2.4h, #0 +; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0 ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s -; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s ; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v3.4s +; CHECK-NODOT-NEXT: uaddw 
v0.2d, v0.2d, v4.2s +; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s +; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s +; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v2.4s ; CHECK-NODOT-NEXT: ret %a.wide = zext <16 x i8> %a to <16 x i64> %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide) @@ -737,17 +737,17 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ ; CHECK-NODOT-NEXT: sshll v3.8h, v2.8b, #0 ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0 ; CHECK-NODOT-NEXT: sshll v4.4s, v3.4h, #0 -; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0 ; CHECK-NODOT-NEXT: sshll2 v3.4s, v3.8h, #0 -; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v5.2s ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v5.4s ; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s -; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v2.2s +; CHECK-NODOT-NEXT: sshll v4.4s, v2.4h, #0 +; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0 ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s -; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s ; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v3.4s +; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s +; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s +; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s +; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v2.4s ; CHECK-NODOT-NEXT: ret %a.wide = sext <16 x i8> %a to <16 x i64> %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide) diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll index c3828c3d695c4..3810374b18fbe 100644 --- a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll +++ b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll @@ -62,10 +62,10 @@ entry: define @partial_reduce_add_half_8( %accumulator, %0) #0 { ; CHECK-LABEL: partial_reduce_add_half_8: ; CHECK: // %bb.0: // %entry 
+; CHECK-NEXT: add z3.s, z3.s, z4.s ; CHECK-NEXT: add z0.s, z0.s, z2.s -; CHECK-NEXT: add z1.s, z1.s, z4.s ; CHECK-NEXT: add z0.s, z0.s, z3.s -; CHECK-NEXT: add z1.s, z1.s, z5.s +; CHECK-NEXT: add z0.s, z0.s, z5.s ; CHECK-NEXT: ret entry: %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32( %accumulator, %0) diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll index 1b754fc3d320e..db3852a7c02b6 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll @@ -198,42 +198,42 @@ define @udot_8to64( %acc, @sdot_8to64( %acc, @usdot_8to64( %acc, @sudot_8to64( %acc, @udot_no_bin_op_8to64( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %a.ext) @@ -667,26 +667,26 @@ define @sdot_no_bin_op_8to64( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %a.ext) diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll index 5148d3da6c737..1fe8628357783 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll @@ -206,13 +206,13 @@ define @signed_wide_add_nxv8i32( %acc, %input to @@ -224,13 +224,13 @@ define @unsigned_wide_add_nxv8i32( %acc, %input to From 6693aa036cba885cf7bc399abe37a5f9225b7029 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Wed, 30 Apr 2025 14:51:53 +0100 Subject: [PATCH 08/12] Remove dead code --- llvm/include/llvm/CodeGen/TargetLowering.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 7b0e15f951681..abe261728a3e6 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ 
b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1668,12 +1668,6 @@ class TargetLoweringBase { return Action == Legal || Action == Custom; } - /// Return true if a PARTIAL_REDUCE_U/SMLA node with the specified types is - /// legal for this target. - bool isPartialReduceMLALegal(EVT AccVT, EVT InputVT) const { - return getPartialReduceMLAAction(AccVT, InputVT) == Legal; - } - /// If the action for this operation is to promote, this method returns the /// ValueType to promote to. MVT getTypeToPromoteTo(unsigned Op, MVT VT) const { From 2aabffb351aed75cafe17222d68fc2007cf6eba0 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Wed, 30 Apr 2025 15:09:49 +0100 Subject: [PATCH 09/12] New test precommit --- .../AArch64/sve-partial-reduce-dot-product.ll | 456 ++++++++++-------- 1 file changed, 256 insertions(+), 200 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll index db3852a7c02b6..02bbb0a70dba1 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll @@ -198,42 +198,42 @@ define @udot_8to64( %acc, @sdot_8to64( %acc, @usdot_8to64( %acc, @sudot_8to64( %acc, @udot_no_bin_op_8to64( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %a.ext) @@ -667,26 +667,26 @@ define @sdot_no_bin_op_8to64( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %a.ext) @@ -1138,3 +1138,59 @@ entry: %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i16.nxv8i16( %acc, %mult) ret %partial.reduce } + + +define @partial_reduce_only_split_acc( %acc, %a, %b) { +; CHECK-LABEL: partial_reduce_only_split_acc: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and z2.h, z2.h, #0xff +; CHECK-NEXT: and z3.h, z3.h, #0xff +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z4.s, z2.h +; CHECK-NEXT: 
uunpklo z2.s, z2.h +; CHECK-NEXT: uunpkhi z5.s, z3.h +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z6.d, z4.s +; CHECK-NEXT: uunpklo z7.d, z2.s +; CHECK-NEXT: uunpklo z24.d, z5.s +; CHECK-NEXT: uunpklo z25.d, z3.s +; CHECK-NEXT: uunpkhi z4.d, z4.s +; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: uunpkhi z5.d, z5.s +; CHECK-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEXT: mla z1.d, p0/m, z6.d, z24.d +; CHECK-NEXT: mla z0.d, p0/m, z7.d, z25.d +; CHECK-NEXT: mla z1.d, p0/m, z4.d, z5.d +; CHECK-NEXT: mla z0.d, p0/m, z2.d, z3.d +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: partial_reduce_only_split_acc: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: and z3.h, z3.h, #0xff +; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z5.s +; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z7.d, z6.d +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z25.d, z24.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z2.d, z3.d +; CHECK-NEWLOWERING-NEXT: ret +entry: + %a.wide = zext %a to + %b.wide = zext %b to + %mult = mul nuw nsw %a.wide, %b.wide + %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64( + %acc, %mult) + ret %partial.reduce +} From b45dc07b4f8103f28e32bc16ce505e6b3f784631 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Wed, 30 Apr 2025 15:11:18 +0100 Subject: [PATCH 10/12] Fix typo and update test --- 
.../SelectionDAG/LegalizeVectorTypes.cpp | 2 +- .../AArch64/sve-partial-reduce-dot-product.ll | 18 +----------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 884a428ea9a60..e4db1663f8daa 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3230,7 +3230,7 @@ void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, // If the input types don't need splitting, just accumulate into the // low part of the accumulator. - if (getTypeAction(Input1.getValueType()) == TargetLowering::TypeSplitVector) { + if (getTypeAction(Input1.getValueType()) != TargetLowering::TypeSplitVector) { Lo = DAG.getNode(Opcode, DL, AccLo.getValueType(), AccLo, Input1, Input2); Hi = AccHi; return; diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll index 02bbb0a70dba1..039cac01008b8 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll @@ -1168,23 +1168,7 @@ define @partial_reduce_only_split_acc( %acc ; CHECK-NEWLOWERING: // %bb.0: // %entry ; CHECK-NEWLOWERING-NEXT: and z3.h, z3.h, #0xff ; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff -; CHECK-NEWLOWERING-NEXT: ptrue p0.d -; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z3.h -; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z2.h -; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h -; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s -; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z5.s -; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z3.s -; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z2.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEWLOWERING-NEXT: 
uunpkhi z2.d, z2.s -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z7.d, z6.d -; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z25.d, z24.d -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z4.d -; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z2.d, z3.d +; CHECK-NEWLOWERING-NEXT: udot z0.d, z2.h, z3.h ; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext %a to From c1c94ceca5c73e23bb3987d806e33890481fcf2c Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Wed, 30 Apr 2025 16:07:39 +0100 Subject: [PATCH 11/12] Address nits --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index e4db1663f8daa..d0b69b88748a9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -4523,22 +4523,19 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM(SDNode *N) { } SDValue DAGTypeLegalizer::SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N) { - SDValue Acc = N->getOperand(0); assert(getTypeAction(Acc.getValueType()) != TargetLowering::TypeSplitVector && "Accumulator should already be a legal type, and shouldn't need " "further splitting"); - SDValue Input1 = N->getOperand(1); SDLoc DL(N); - SDValue Input1Lo, Input1Hi, Input2Lo, Input2Hi; - std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(Input1, DL); + std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(N->getOperand(1), DL); std::tie(Input2Lo, Input2Hi) = DAG.SplitVector(N->getOperand(2), DL); unsigned Opcode = N->getOpcode(); EVT ResultVT = Acc.getValueType(); - auto Lo = DAG.getNode(Opcode, DL, ResultVT, Acc, Input1Lo, Input2Lo); + SDValue Lo = DAG.getNode(Opcode, DL, ResultVT, Acc, Input1Lo, Input2Lo); return DAG.getNode(Opcode, DL, ResultVT, Lo, Input1Hi, Input2Hi); } From 7c2ac73d3934b389942637d9332e5249361c1e56 Mon Sep 17 00:00:00 2001 From: Nick Guy Date: Thu, 1 May 2025 12:50:38 +0100 Subject: [PATCH 
12/12] Update tests --- .../neon-partial-reduce-dot-product.ll | 100 +++++++++--------- .../CodeGen/AArch64/partial-reduction-add.ll | 4 +- .../AArch64/sve-partial-reduce-wide-add.ll | 20 ++-- 3 files changed, 62 insertions(+), 62 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll index 06e9bc901ab36..ab9813aa796e3 100644 --- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll @@ -402,18 +402,18 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) { ; CHECK-NODOT: // %bb.0: // %entry ; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b ; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b -; CHECK-NODOT-NEXT: ushll v5.4s, v4.4h, #0 +; CHECK-NODOT-NEXT: ushll v3.4s, v4.4h, #0 +; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0 ; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0 -; CHECK-NODOT-NEXT: ushll v3.4s, v2.4h, #0 ; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v5.2s -; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v5.4s -; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s -; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s +; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v5.2s ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s +; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v5.4s ; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v3.4s -; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s -; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v2.4s +; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v2.2s +; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s +; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s +; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s ; CHECK-NODOT-NEXT: ret entry: %a.wide = zext <16 x i8> %a to <16 x i64> @@ -437,18 +437,18 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){ ; CHECK-NODOT: // %bb.0: // %entry ; CHECK-NODOT-NEXT: smull v4.8h, v2.8b, v3.8b ; CHECK-NODOT-NEXT: smull2 v2.8h, 
v2.16b, v3.16b -; CHECK-NODOT-NEXT: sshll v5.4s, v4.4h, #0 +; CHECK-NODOT-NEXT: sshll v3.4s, v4.4h, #0 +; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0 ; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0 -; CHECK-NODOT-NEXT: sshll v3.4s, v2.4h, #0 ; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v5.2s -; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v5.4s -; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s +; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v5.2s ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s +; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v5.4s ; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v3.4s -; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s -; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v2.4s +; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v2.2s +; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s +; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s +; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s ; CHECK-NODOT-NEXT: ret entry: %a.wide = sext <16 x i8> %a to <16 x i64> @@ -463,25 +463,25 @@ define <4 x i64> @usdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){ ; CHECK-NOI8MM-LABEL: usdot_8to64: ; CHECK-NOI8MM: // %bb.0: // %entry ; CHECK-NOI8MM-NEXT: ushll v4.8h, v2.8b, #0 -; CHECK-NOI8MM-NEXT: sshll v5.8h, v3.8b, #0 ; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NOI8MM-NEXT: sshll v5.8h, v3.8b, #0 ; CHECK-NOI8MM-NEXT: sshll2 v3.8h, v3.16b, #0 ; CHECK-NOI8MM-NEXT: ushll v6.4s, v4.4h, #0 -; CHECK-NOI8MM-NEXT: sshll v7.4s, v5.4h, #0 +; CHECK-NOI8MM-NEXT: ushll v7.4s, v2.4h, #0 +; CHECK-NOI8MM-NEXT: sshll v16.4s, v5.4h, #0 +; CHECK-NOI8MM-NEXT: sshll v17.4s, v3.4h, #0 ; CHECK-NOI8MM-NEXT: ushll2 v4.4s, v4.8h, #0 -; CHECK-NOI8MM-NEXT: sshll2 v5.4s, v5.8h, #0 -; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v7.2s -; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v7.4s -; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s -; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s -; CHECK-NOI8MM-NEXT: ushll v4.4s, v2.4h, #0 -; CHECK-NOI8MM-NEXT: sshll v5.4s, 
v3.4h, #0 ; CHECK-NOI8MM-NEXT: ushll2 v2.4s, v2.8h, #0 +; CHECK-NOI8MM-NEXT: sshll2 v5.4s, v5.8h, #0 ; CHECK-NOI8MM-NEXT: sshll2 v3.4s, v3.8h, #0 +; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v16.2s +; CHECK-NOI8MM-NEXT: smlal v1.2d, v7.2s, v17.2s +; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v16.4s +; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v7.4s, v17.4s ; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s +; CHECK-NOI8MM-NEXT: smlal v1.2d, v2.2s, v3.2s ; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s -; CHECK-NOI8MM-NEXT: smlal v0.2d, v2.2s, v3.2s -; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v2.4s, v3.4s +; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v2.4s, v3.4s ; CHECK-NOI8MM-NEXT: ret ; ; CHECK-I8MM-LABEL: usdot_8to64: @@ -504,25 +504,25 @@ define <4 x i64> @sudot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) { ; CHECK-NOI8MM-LABEL: sudot_8to64: ; CHECK-NOI8MM: // %bb.0: // %entry ; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0 -; CHECK-NOI8MM-NEXT: ushll v5.8h, v3.8b, #0 ; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NOI8MM-NEXT: ushll v5.8h, v3.8b, #0 ; CHECK-NOI8MM-NEXT: ushll2 v3.8h, v3.16b, #0 ; CHECK-NOI8MM-NEXT: sshll v6.4s, v4.4h, #0 -; CHECK-NOI8MM-NEXT: ushll v7.4s, v5.4h, #0 +; CHECK-NOI8MM-NEXT: sshll v7.4s, v2.4h, #0 +; CHECK-NOI8MM-NEXT: ushll v16.4s, v5.4h, #0 +; CHECK-NOI8MM-NEXT: ushll v17.4s, v3.4h, #0 ; CHECK-NOI8MM-NEXT: sshll2 v4.4s, v4.8h, #0 -; CHECK-NOI8MM-NEXT: ushll2 v5.4s, v5.8h, #0 -; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v7.2s -; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v7.4s -; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s -; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s -; CHECK-NOI8MM-NEXT: sshll v4.4s, v2.4h, #0 -; CHECK-NOI8MM-NEXT: ushll v5.4s, v3.4h, #0 ; CHECK-NOI8MM-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-NOI8MM-NEXT: ushll2 v5.4s, v5.8h, #0 ; CHECK-NOI8MM-NEXT: ushll2 v3.4s, v3.8h, #0 +; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v16.2s +; CHECK-NOI8MM-NEXT: smlal v1.2d, v7.2s, v17.2s +; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v16.4s +; CHECK-NOI8MM-NEXT: 
smlal2 v1.2d, v7.4s, v17.4s ; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s +; CHECK-NOI8MM-NEXT: smlal v1.2d, v2.2s, v3.2s ; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s -; CHECK-NOI8MM-NEXT: smlal v0.2d, v2.2s, v3.2s -; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v2.4s, v3.4s +; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v2.4s, v3.4s ; CHECK-NOI8MM-NEXT: ret ; ; CHECK-I8MM-LABEL: sudot_8to64: @@ -705,17 +705,17 @@ define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ ; CHECK-NODOT-NEXT: ushll v3.8h, v2.8b, #0 ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-NODOT-NEXT: ushll v4.4s, v3.4h, #0 +; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0 ; CHECK-NODOT-NEXT: ushll2 v3.4s, v3.8h, #0 +; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0 +; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v5.2s ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s +; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v5.4s ; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s -; CHECK-NODOT-NEXT: ushll v4.4s, v2.4h, #0 -; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0 +; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v2.2s ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s +; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s ; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v3.4s -; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s -; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s -; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s -; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v2.4s ; CHECK-NODOT-NEXT: ret %a.wide = zext <16 x i8> %a to <16 x i64> %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide) @@ -737,17 +737,17 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ ; CHECK-NODOT-NEXT: sshll v3.8h, v2.8b, #0 ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0 ; CHECK-NODOT-NEXT: sshll v4.4s, v3.4h, #0 +; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0 ; CHECK-NODOT-NEXT: sshll2 v3.4s, v3.8h, #0 +; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v5.2s ; 
CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s +; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v5.4s ; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s -; CHECK-NODOT-NEXT: sshll v4.4s, v2.4h, #0 -; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v2.2s ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s +; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s ; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v3.4s -; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s -; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s -; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v2.4s ; CHECK-NODOT-NEXT: ret %a.wide = sext <16 x i8> %a to <16 x i64> %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide) diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll index 3810374b18fbe..c3828c3d695c4 100644 --- a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll +++ b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll @@ -62,10 +62,10 @@ entry: define @partial_reduce_add_half_8( %accumulator, %0) #0 { ; CHECK-LABEL: partial_reduce_add_half_8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: add z3.s, z3.s, z4.s ; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: add z1.s, z1.s, z4.s ; CHECK-NEXT: add z0.s, z0.s, z3.s -; CHECK-NEXT: add z0.s, z0.s, z5.s +; CHECK-NEXT: add z1.s, z1.s, z5.s ; CHECK-NEXT: ret entry: %partial.reduce = call @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32( %accumulator, %0) diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll index 1fe8628357783..5148d3da6c737 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll @@ -206,13 +206,13 @@ define @signed_wide_add_nxv8i32( %acc, %input to @@ -224,13 +224,13 @@ define 
@unsigned_wide_add_nxv8i32( %acc, %input to