
Commit ad69a64 by Chad Rosier (1 parent: 7ab2009)

[AArch64] Improve ISel using across lane addition reduction.

In vectorized add reduction code, the final "reduce" step is sub-optimal.
This change will combine:

  ext v1.16b, v0.16b, v0.16b, #8
  add v0.4s, v1.4s, v0.4s
  dup v1.4s, v0.s[1]
  add v0.4s, v1.4s, v0.4s

into:

  addv s0, v0.4s

PR21371
http://reviews.llvm.org/D12325
Patch by Jun Bum Lim <[email protected]>!

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@246790 91177308-0d34-0410-b5e6-96231b3b80d8
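
For context, the kind of source the LoopVectorizer lowers into this shuffle-based reduce tail is an ordinary sum loop. A minimal sketch (hypothetical file and function names, not part of the commit); compiling such a loop at -O3 for AArch64 can produce the ext/add/dup/add sequence shown above:

#include <cstdint>

// sum.cpp -- a plain add-reduction loop; the vectorizer accumulates in a
// <4 x i32> vector and then emits a log2-shuffle "reduce" tail.
int32_t sum(const int32_t *a, int n) {
  int32_t s = 0;
  for (int i = 0; i < n; ++i)
    s += a[i]; // vectorized adds, then a final cross-lane reduce
  return s;
}

With this patch, that final cross-lane reduce becomes a single addv instruction.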

File tree:
  lib/Target/AArch64/AArch64ISelLowering.cpp
  test/CodeGen/AArch64/aarch64-addv.ll

2 files changed, 152 insertions(+), 0 deletions(-)

lib/Target/AArch64/AArch64ISelLowering.cpp (+99)
@@ -495,6 +495,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::INTRINSIC_VOID);
   setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 
   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
@@ -8584,6 +8585,102 @@ static SDValue performPostLD1Combine(SDNode *N,
   return SDValue();
 }
 
+/// Target-specific DAG combine for the across vector reduction.
+/// This function specifically handles the final clean-up step of a vector
+/// reduction produced by the LoopVectorizer. It is the log2-shuffle pattern,
+/// consisting of log2(NumVectorElements) steps and, in each step, 2^(s)
+/// elements are reduced, where s is an induction variable from 0
+/// to log2(NumVectorElements).
+/// For example,
+///   %1 = vector_shuffle %0, <2,3,u,u>
+///   %2 = add %0, %1
+///   %3 = vector_shuffle %2, <1,u,u,u>
+///   %4 = add %2, %3
+///   %5 = extract_vector_elt %4, 0
+/// becomes:
+///   %0 = uaddv %0
+///   %1 = extract_vector_elt %0, 0
+///
+/// FIXME: Currently this function is implemented and tested specifically
+/// for the add reduction. We could also support other types of across lane
+/// reduction available in AArch64, including SMAXV, SMINV, UMAXV, UMINV,
+/// SADDLV, UADDLV, FMAXNMV, FMAXV, FMINNMV, FMINV.
+static SDValue
+performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG,
+                                  const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // Check if the input vector is fed by the operator we want to handle.
+  // We specifically check only ADD for now.
+  if (N0->getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // The vector extract index must be a constant zero because we only expect
+  // the final result of the reduction to be placed in lane 0.
+  if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue())
+    return SDValue();
+
+  EVT EltTy = N0.getValueType().getVectorElementType();
+  if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+    return SDValue();
+
+  int NumVecElts = N0.getValueType().getVectorNumElements();
+  if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
+    return SDValue();
+
+  int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
+  SDValue PreOp = N0;
+  // Iterate over each step of the across vector reduction.
+  for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) {
+    // We specifically check ADD for now.
+    if (PreOp.getOpcode() != ISD::ADD)
+      return SDValue();
+    SDValue CurOp = PreOp.getOperand(0);
+    SDValue Shuffle = PreOp.getOperand(1);
+    if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
+      // Try to swap the 1st and 2nd operands, as add is commutative.
+      CurOp = PreOp.getOperand(1);
+      Shuffle = PreOp.getOperand(0);
+      if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
+        return SDValue();
+    }
+    // Check if it forms one step of the across vector reduction.
+    // E.g.,
+    //   %cur = add %1, %0
+    //   %shuffle = vector_shuffle %cur, <2, 3, u, u>
+    //   %pre = add %cur, %shuffle
+    if (Shuffle.getOperand(0) != CurOp)
+      return SDValue();
+
+    int NumMaskElts = 1 << CurStep;
+    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask();
+    // Check mask values in each step.
+    // We expect the shuffle mask in each step to follow a specific pattern
+    // denoted here by the <M, U> form, where M is a sequence of integers
+    // starting from NumMaskElts, increasing by 1, and the number of integers
+    // in M should be NumMaskElts. U is a sequence of UNDEFs and the number
+    // of UNDEFs in U should be NumVecElts - NumMaskElts.
+    // E.g., for <8 x i16>, the mask values in each step should be:
+    //   step 0 : <1,u,u,u,u,u,u,u>
+    //   step 1 : <2,3,u,u,u,u,u,u>
+    //   step 2 : <4,5,6,7,u,u,u,u>
+    for (int i = 0; i < NumVecElts; ++i)
+      if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) ||
+          (i >= NumMaskElts && !(Mask[i] < 0)))
+        return SDValue();
+
+    PreOp = CurOp;
+  }
+  SDLoc DL(N);
+  return DAG.getNode(
+      ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
+      DAG.getNode(AArch64ISD::UADDV, DL, PreOp.getSimpleValueType(), PreOp),
+      DAG.getConstant(0, DL, MVT::i64));
+}
+
 /// Target-specific DAG combine function for NEON load/store intrinsics
 /// to merge base address updates.
 static SDValue performNEONPostLDSTCombine(SDNode *N,
@@ -9178,6 +9275,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performNVCASTCombine(N);
   case ISD::INSERT_VECTOR_ELT:
     return performPostLD1Combine(N, DCI, true);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return performAcrossLaneReductionCombine(N, DAG, Subtarget);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
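
As a standalone restatement of the mask check in the combine's loop, the following sketch (illustrative only; isReductionStepMask is a hypothetical name, not part of the patch) spells out the <M, U> pattern:

#include <vector>

// For step s, the first NumMaskElts = 2^s lanes must select elements
// NumMaskElts, NumMaskElts+1, ..., and every remaining lane must be undef
// (represented, as in LLVM shuffle masks, by a negative index).
bool isReductionStepMask(const std::vector<int> &Mask, int CurStep,
                         int NumVecElts) {
  int NumMaskElts = 1 << CurStep;
  for (int i = 0; i < NumVecElts; ++i) {
    if (i < NumMaskElts && Mask[i] != NumMaskElts + i)
      return false; // reduced lanes must be a consecutive run
    if (i >= NumMaskElts && Mask[i] >= 0)
      return false; // trailing lanes must be undef
  }
  return true;
}

For a <4 x i32> reduction this accepts <1,u,u,u> at step 0 and <2,3,u,u> at step 1, matching the doc comment's example.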

test/CodeGen/AArch64/aarch64-addv.ll (+53)
@@ -0,0 +1,53 @@
+; RUN: llc -march=aarch64 < %s | FileCheck %s
+
+define i8 @f_v16i8(<16 x i8>* %arr) {
+; CHECK-LABEL: f_v16i8
+; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
+  %bin.rdx = load <16 x i8>, <16 x i8>* %arr
+  %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx0 = add <16 x i8> %bin.rdx, %rdx.shuf0
+  %rdx.shuf = shufflevector <16 x i8> %bin.rdx0, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx11 = add <16 x i8> %bin.rdx0, %rdx.shuf
+  %rdx.shuf12 = shufflevector <16 x i8> %bin.rdx11, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <16 x i8> %bin.rdx11, %rdx.shuf12
+  %rdx.shuf13 = shufflevector <16 x i8> %bin.rdx13, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx14 = add <16 x i8> %bin.rdx13, %rdx.shuf13
+  %r = extractelement <16 x i8> %bin.rdx14, i32 0
+  ret i8 %r
+}
+
+define i16 @f_v8i16(<8 x i16>* %arr) {
+; CHECK-LABEL: f_v8i16
+; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
+  %bin.rdx = load <8 x i16>, <8 x i16>* %arr
+  %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx11 = add <8 x i16> %bin.rdx, %rdx.shuf
+  %rdx.shuf12 = shufflevector <8 x i16> %bin.rdx11, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <8 x i16> %bin.rdx11, %rdx.shuf12
+  %rdx.shuf13 = shufflevector <8 x i16> %bin.rdx13, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx14 = add <8 x i16> %bin.rdx13, %rdx.shuf13
+  %r = extractelement <8 x i16> %bin.rdx14, i32 0
+  ret i16 %r
+}
+
+define i32 @f_v4i32(<4 x i32>* %arr) {
+; CHECK-LABEL: f_v4i32
+; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
+  %bin.rdx = load <4 x i32>, <4 x i32>* %arr
+  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx11 = add <4 x i32> %bin.rdx, %rdx.shuf
+  %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <4 x i32> %bin.rdx11, %rdx.shuf12
+  %r = extractelement <4 x i32> %bin.rdx13, i32 0
+  ret i32 %r
+}
+
+define i64 @f_v2i64(<2 x i64>* %arr) {
+; CHECK-LABEL: f_v2i64
+; CHECK-NOT: addv
+  %bin.rdx = load <2 x i64>, <2 x i64>* %arr
+  %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %bin.rdx0 = add <2 x i64> %bin.rdx, %rdx.shuf0
+  %r = extractelement <2 x i64> %bin.rdx0, i32 0
+  ret i64 %r
+}
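
Why folding the whole chain into one across-lanes add is sound: each shuffle+add step in these tests adds the next power-of-two block of lanes onto the low lanes, so after log2(N) steps lane 0 holds the sum of all N lanes. A scalar model of the <4 x i32> schedule (my own illustration, not part of the commit):

#include <cassert>

int main() {
  int v[4] = {1, 2, 3, 4};
  // Step with mask <2,3,u,u>: add the upper half onto the lower half;
  // then step with mask <1,u,u,u>: add lane 1 onto lane 0.
  for (int half = 2; half >= 1; half /= 2)
    for (int i = 0; i < half; ++i)
      v[i] += v[i + half];
  assert(v[0] == 1 + 2 + 3 + 4); // lane 0 == total, exactly what addv yields
  return 0;
}

The f_v2i64 test checks that no addv is formed: the combine only accepts i8, i16, and i32 elements (ADDV has no 64-bit element form).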
