diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 9d7ade8eb523b..02537fe835083 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19209,6 +19209,37 @@ will be on any later loop iteration.
 This intrinsic will only return 0 if the input count is also 0. A non-zero input
 count will produce a non-zero result.
 
+'``llvm.experimental.vector.partial.reduce.add.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b)
+      declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b)
+      declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %a, <vscale x 8 x i32> %b)
+      declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %a, <vscale x 16 x i32> %b)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.partial.reduce.add.*``' intrinsics reduce the
+concatenation of the two vector operands down to the number of elements dictated
+by the result type. The result type is a vector type that matches the type of the
+first operand.
+
+Arguments:
+""""""""""
+
+Both arguments must be vectors of matching element types. The first argument type must
+match the result type, while the second argument type must have a vector length that is
+a positive integer multiple of that of the first argument/result type. The arguments
+must be either both fixed or both scalable vectors.
+
+
 '``llvm.experimental.vector.histogram.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 107442623ab7b..5b3e3d2387463 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2635,6 +2635,12 @@ def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType<0>,
                                                      [llvm_anyvector_ty],
                                                      [IntrNoMem]>;
 
+//===-------------- Intrinsics to perform partial reduction ---------------===//
+
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>],
+                                                                       [llvm_anyvector_ty, llvm_anyvector_ty],
+                                                                       [IntrNoMem]>;
+
 //===----------------- Pointer Authentication Intrinsics ------------------===//
 //
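Usage sketch (illustrative only, not part of the patch; the function name is hypothetical): with the definitions above, a wide <16 x i32> value can be folded into a <4 x i32> running accumulator, which a later llvm.vector.reduce.add call can collapse to a scalar.

  define <4 x i32> @example_partial_sum(<4 x i32> %acc, <16 x i32> %wide) {
  entry:
    ; Adds the 16 input elements into the 4 accumulator lanes; the result has
    ; the same type as the first (accumulator) operand.
    %partial = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %wide)
    ret <4 x i32> %partial
  }

  declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32>, <16 x i32>)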
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ba76456b5836a..76bac44f61e9c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -104,6 +104,7 @@
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <cstddef>
+#include <deque>
 #include <iterator>
 #include <limits>
 #include <optional>
@@ -7914,6 +7915,37 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     setValue(&I, Trunc);
     return;
   }
+  case Intrinsic::experimental_vector_partial_reduce_add: {
+    SDValue OpNode = getValue(I.getOperand(1));
+    EVT ReducedTy = EVT::getEVT(I.getType());
+    EVT FullTy = OpNode.getValueType();
+
+    unsigned Stride = ReducedTy.getVectorMinNumElements();
+    unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;
+
+    // Collect the accumulator followed by each Stride-wide subvector of the input.
+    std::deque<SDValue> Subvectors;
+    Subvectors.push_back(getValue(I.getOperand(0)));
+    for (unsigned i = 0; i < ScaleFactor; i++) {
+      auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, sdl);
+      Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ReducedTy,
+                                       {OpNode, SourceIndex}));
+    }
+
+    // Flatten the subvector tree by adding pairs until a single value remains.
+    while (Subvectors.size() > 1) {
+      Subvectors.push_back(DAG.getNode(ISD::ADD, sdl, ReducedTy,
+                                       {Subvectors[0], Subvectors[1]}));
+      Subvectors.pop_front();
+      Subvectors.pop_front();
+    }
+
+    assert(Subvectors.size() == 1 &&
+           "There should only be one subvector after tree flattening");
+
+    setValue(&I, Subvectors[0]);
+    return;
+  }
   case Intrinsic::experimental_cttz_elts: {
     auto DL = getCurSDLoc();
     SDValue Op = getValue(I.getOperand(0));
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 684e54444621b..ddefeb5a03e97 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6131,6 +6131,20 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     }
     break;
   }
+  case Intrinsic::experimental_vector_partial_reduce_add: {
+    VectorType *AccTy = cast<VectorType>(Call.getArgOperand(0)->getType());
+    VectorType *VecTy = cast<VectorType>(Call.getArgOperand(1)->getType());
+
+    unsigned VecWidth = VecTy->getElementCount().getKnownMinValue();
+    unsigned AccWidth = AccTy->getElementCount().getKnownMinValue();
+
+    Check((VecWidth % AccWidth) == 0,
+          "Invalid vector widths for partial "
+          "reduction. The width of the input vector "
+          "must be a positive integer multiple of "
+          "the width of the accumulator vector.");
+    break;
+  }
   case Intrinsic::experimental_noalias_scope_decl: {
     NoAliasScopeDecls.push_back(cast<IntrinsicInst>(&Call));
     break;
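For illustration of the check above (not part of the patch; the values are hypothetical), the verifier rejects a call whose input vector length is not a positive integer multiple of the accumulator length, for example:

  ; Invalid: the input has 6 elements, which is not a multiple of the
  ; accumulator's 4 lanes, so the verifier reports "Invalid vector widths
  ; for partial reduction. ..."
  %bad = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v6i32(<4 x i32> %acc, <6 x i32> %in)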
diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
new file mode 100644
index 0000000000000..ae681ee54e687
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -force-vector-interleave=1 -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define <4 x i32> @partial_reduce_add_fixed(<4 x i32> %accumulator, <4 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_fixed:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v4i32(<4 x i32> %accumulator, <4 x i32> %0)
+  ret <4 x i32> %partial.reduce
+}
+
+define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_fixed_half:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0)
+  ret <4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add_half(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_half:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    add z0.s, z2.s, z0.s
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add_quart(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_quart:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    add z2.s, z2.s, z3.s
+; CHECK-NEXT:    add z0.s, z4.s, z0.s
+; CHECK-NEXT:    add z0.s, z2.s, z0.s
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 8 x i32> @partial_reduce_add_half_8(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_half_8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-NEXT:    add z1.s, z1.s, z3.s
+; CHECK-NEXT:    add z0.s, z4.s, z0.s
+; CHECK-NEXT:    add z1.s, z5.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0)
+  ret <vscale x 8 x i32> %partial.reduce
+}
+
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32>, <vscale x 8 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
+declare <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32>, <vscale x 16 x i32>)
+
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32>)
+
+attributes #0 = { "target-features"="+sve2" }