Skip to content

Commit bc74a1e

Browse files
authored
[IA] Generalize the support for power-of-two (de)interleave intrinsics (#123863)
Previously, AArch64 used pattern matching to support llvm.vector.(de)interleave of 2 and 4; RISC-V only supported (de)interleave of 2. This patch consolidates the logic of these two targets by factoring the common factor calculation out into the InterleavedAccess pass.
1 parent 1077280 commit bc74a1e

11 files changed

+820
-372
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3156,27 +3156,23 @@ class TargetLoweringBase {
31563156
/// Return true on success. Currently only supports
31573157
/// llvm.vector.deinterleave2
31583158
///
3159-
/// \p DI is the deinterleave intrinsic.
3160-
/// \p LI is the accompanying load instruction
3161-
/// \p DeadInsts is a reference to a vector that keeps track of dead
3162-
/// instruction during transformations.
3163-
virtual bool lowerDeinterleaveIntrinsicToLoad(
3164-
IntrinsicInst *DI, LoadInst *LI,
3165-
SmallVectorImpl<Instruction *> &DeadInsts) const {
3159+
/// \p LI is the accompanying load instruction.
3160+
/// \p DeinterleaveValues contains the deinterleaved values.
3161+
virtual bool
3162+
lowerDeinterleaveIntrinsicToLoad(LoadInst *LI,
3163+
ArrayRef<Value *> DeinterleaveValues) const {
31663164
return false;
31673165
}
31683166

31693167
/// Lower an interleave intrinsic to a target specific store intrinsic.
31703168
/// Return true on success. Currently only supports
31713169
/// llvm.vector.interleave2
31723170
///
3173-
/// \p II is the interleave intrinsic.
31743171
/// \p SI is the accompanying store instruction
3175-
/// \p DeadInsts is a reference to a vector that keeps track of dead
3176-
/// instruction during transformations.
3177-
virtual bool lowerInterleaveIntrinsicToStore(
3178-
IntrinsicInst *II, StoreInst *SI,
3179-
SmallVectorImpl<Instruction *> &DeadInsts) const {
3172+
/// \p InterleaveValues contains the interleaved values.
3173+
virtual bool
3174+
lowerInterleaveIntrinsicToStore(StoreInst *SI,
3175+
ArrayRef<Value *> InterleaveValues) const {
31803176
return false;
31813177
}
31823178

llvm/lib/CodeGen/InterleavedAccessPass.cpp

Lines changed: 169 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
#include "llvm/IR/Instruction.h"
6161
#include "llvm/IR/Instructions.h"
6262
#include "llvm/IR/IntrinsicInst.h"
63+
#include "llvm/IR/PatternMatch.h"
6364
#include "llvm/InitializePasses.h"
6465
#include "llvm/Pass.h"
6566
#include "llvm/Support/Casting.h"
@@ -478,23 +479,179 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
478479
return true;
479480
}
480481

482+
// For a (de)interleave tree like this:
483+
//
484+
// A C B D
485+
// |___| |___|
486+
// |_____|
487+
// |
488+
// A B C D
489+
//
490+
// We will get ABCD at the end while the leaf operands/results
491+
// are ACBD, which are also what we initially collected in
492+
// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI
493+
// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need
494+
// to reorder them by interleaving these values.
495+
static void interleaveLeafValues(MutableArrayRef<Value *> SubLeaves) {
496+
unsigned NumLeaves = SubLeaves.size();
497+
if (NumLeaves == 2)
498+
return;
499+
500+
assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1);
501+
502+
const unsigned HalfLeaves = NumLeaves / 2;
503+
// Visit the sub-trees.
504+
interleaveLeafValues(SubLeaves.take_front(HalfLeaves));
505+
interleaveLeafValues(SubLeaves.drop_front(HalfLeaves));
506+
507+
SmallVector<Value *, 8> Buffer;
508+
// a0 a1 a2 a3 b0 b1 b2 b3
509+
// -> a0 b0 a1 b1 a2 b2 a3 b3
510+
for (unsigned i = 0U; i < NumLeaves; ++i)
511+
Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]);
512+
513+
llvm::copy(Buffer, SubLeaves.begin());
514+
}
515+
516+
static bool
517+
getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
518+
SmallVectorImpl<Instruction *> &DeadInsts) {
519+
assert(II->getIntrinsicID() == Intrinsic::vector_interleave2);
520+
521+
// Visit with BFS
522+
SmallVector<IntrinsicInst *, 8> Queue;
523+
Queue.push_back(II);
524+
while (!Queue.empty()) {
525+
IntrinsicInst *Current = Queue.front();
526+
Queue.erase(Queue.begin());
527+
528+
// All the intermediate intrinsics will be deleted.
529+
DeadInsts.push_back(Current);
530+
531+
for (unsigned I = 0; I < 2; ++I) {
532+
Value *Op = Current->getOperand(I);
533+
if (auto *OpII = dyn_cast<IntrinsicInst>(Op))
534+
if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) {
535+
Queue.push_back(OpII);
536+
continue;
537+
}
538+
539+
// If this is not a perfectly balanced tree, the leaf
540+
// operand types would be different.
541+
if (!Operands.empty() && Op->getType() != Operands.back()->getType())
542+
return false;
543+
544+
Operands.push_back(Op);
545+
}
546+
}
547+
548+
const unsigned Factor = Operands.size();
549+
// Currently we only recognize power-of-two factors.
550+
// FIXME: should we assert here instead?
551+
if (Factor <= 1 || !isPowerOf2_32(Factor))
552+
return false;
553+
554+
interleaveLeafValues(Operands);
555+
return true;
556+
}
557+
558+
static bool
559+
getVectorDeinterleaveFactor(IntrinsicInst *II,
560+
SmallVectorImpl<Value *> &Results,
561+
SmallVectorImpl<Instruction *> &DeadInsts) {
562+
assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2);
563+
using namespace PatternMatch;
564+
if (!II->hasNUses(2))
565+
return false;
566+
567+
// Visit with BFS
568+
SmallVector<IntrinsicInst *, 8> Queue;
569+
Queue.push_back(II);
570+
while (!Queue.empty()) {
571+
IntrinsicInst *Current = Queue.front();
572+
Queue.erase(Queue.begin());
573+
assert(Current->hasNUses(2));
574+
575+
// All the intermediate intrinsics will be deleted from the bottom-up.
576+
DeadInsts.insert(DeadInsts.begin(), Current);
577+
578+
ExtractValueInst *LHS = nullptr, *RHS = nullptr;
579+
for (User *Usr : Current->users()) {
580+
if (!isa<ExtractValueInst>(Usr))
581+
return 0;
582+
583+
auto *EV = cast<ExtractValueInst>(Usr);
584+
// Intermediate ExtractValue instructions will also be deleted.
585+
DeadInsts.insert(DeadInsts.begin(), EV);
586+
ArrayRef<unsigned> Indices = EV->getIndices();
587+
if (Indices.size() != 1)
588+
return false;
589+
590+
if (Indices[0] == 0 && !LHS)
591+
LHS = EV;
592+
else if (Indices[0] == 1 && !RHS)
593+
RHS = EV;
594+
else
595+
return false;
596+
}
597+
598+
// We have legal indices. At this point we're either going
599+
// to continue the traversal or push the leaf values into Results.
600+
for (ExtractValueInst *EV : {LHS, RHS}) {
601+
// Continue the traversal. We're playing safe here and matching only the
602+
// expression consisting of a perfectly balanced binary tree in which all
603+
// intermediate values are only used once.
604+
if (EV->hasOneUse() &&
605+
match(EV->user_back(),
606+
m_Intrinsic<Intrinsic::vector_deinterleave2>()) &&
607+
EV->user_back()->hasNUses(2)) {
608+
auto *EVUsr = cast<IntrinsicInst>(EV->user_back());
609+
Queue.push_back(EVUsr);
610+
continue;
611+
}
612+
613+
// If this is not a perfectly balanced tree, the leaf
614+
// result types would be different.
615+
if (!Results.empty() && EV->getType() != Results.back()->getType())
616+
return false;
617+
618+
// Save the leaf value.
619+
Results.push_back(EV);
620+
}
621+
}
622+
623+
const unsigned Factor = Results.size();
624+
// Currently we only recognize power-of-two factors.
625+
// FIXME: should we assert here instead?
626+
if (Factor <= 1 || !isPowerOf2_32(Factor))
627+
return 0;
628+
629+
interleaveLeafValues(Results);
630+
return true;
631+
}
632+
481633
bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
482634
IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
483635
LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0));
484636

485637
if (!LI || !LI->hasOneUse() || !LI->isSimple())
486638
return false;
487639

488-
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
640+
SmallVector<Value *, 8> DeinterleaveValues;
641+
SmallVector<Instruction *, 8> DeinterleaveDeadInsts;
642+
if (!getVectorDeinterleaveFactor(DI, DeinterleaveValues,
643+
DeinterleaveDeadInsts))
644+
return false;
645+
646+
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI
647+
<< " with factor = " << DeinterleaveValues.size() << "\n");
489648

490649
// Try and match this with target specific intrinsics.
491-
SmallVector<Instruction *, 4> DeinterleaveDeadInsts;
492-
if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI, DeinterleaveDeadInsts))
650+
if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues))
493651
return false;
494652

495653
DeadInsts.insert(DeinterleaveDeadInsts.begin(), DeinterleaveDeadInsts.end());
496654
// We now have a target-specific load, so delete the old one.
497-
DeadInsts.insert(DI);
498655
DeadInsts.insert(LI);
499656
return true;
500657
}
@@ -509,16 +666,20 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
509666
if (!SI || !SI->isSimple())
510667
return false;
511668

512-
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
669+
SmallVector<Value *, 8> InterleaveValues;
670+
SmallVector<Instruction *, 8> InterleaveDeadInsts;
671+
if (!getVectorInterleaveFactor(II, InterleaveValues, InterleaveDeadInsts))
672+
return false;
673+
674+
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II
675+
<< " with factor = " << InterleaveValues.size() << "\n");
513676

514-
SmallVector<Instruction *, 4> InterleaveDeadInsts;
515677
// Try and match this with target specific intrinsics.
516-
if (!TLI->lowerInterleaveIntrinsicToStore(II, SI, InterleaveDeadInsts))
678+
if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues))
517679
return false;
518680

519681
// We now have a target-specific store, so delete the old one.
520682
DeadInsts.insert(SI);
521-
DeadInsts.insert(II);
522683
DeadInsts.insert(InterleaveDeadInsts.begin(), InterleaveDeadInsts.end());
523684
return true;
524685
}

0 commit comments

Comments
 (0)