6060#include " llvm/IR/Instruction.h"
6161#include " llvm/IR/Instructions.h"
6262#include " llvm/IR/IntrinsicInst.h"
63+ #include " llvm/IR/PatternMatch.h"
6364#include " llvm/InitializePasses.h"
6465#include " llvm/Pass.h"
6566#include " llvm/Support/Casting.h"
@@ -478,23 +479,179 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
478479 return true ;
479480}
480481
482+ // For an (de)interleave tree like this:
483+ //
484+ // A C B D
485+ // |___| |___|
486+ // |_____|
487+ // |
488+ // A B C D
489+ //
490+ // We will get ABCD at the end while the leaf operands/results
491+ // are ACBD, which are also what we initially collected in
492+ // getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI
493+ // hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need
494+ // to reorder them by interleaving these values.
495+ static void interleaveLeafValues (MutableArrayRef<Value *> SubLeaves) {
496+ unsigned NumLeaves = SubLeaves.size ();
497+ if (NumLeaves == 2 )
498+ return ;
499+
500+ assert (isPowerOf2_32 (NumLeaves) && NumLeaves > 1 );
501+
502+ const unsigned HalfLeaves = NumLeaves / 2 ;
503+ // Visit the sub-trees.
504+ interleaveLeafValues (SubLeaves.take_front (HalfLeaves));
505+ interleaveLeafValues (SubLeaves.drop_front (HalfLeaves));
506+
507+ SmallVector<Value *, 8 > Buffer;
508+ // a0 a1 a2 a3 b0 b1 b2 b3
509+ // -> a0 b0 a1 b1 a2 b2 a3 b3
510+ for (unsigned i = 0U ; i < NumLeaves; ++i)
511+ Buffer.push_back (SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0 )]);
512+
513+ llvm::copy (Buffer, SubLeaves.begin ());
514+ }
515+
516+ static bool
517+ getVectorInterleaveFactor (IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
518+ SmallVectorImpl<Instruction *> &DeadInsts) {
519+ assert (II->getIntrinsicID () == Intrinsic::vector_interleave2);
520+
521+ // Visit with BFS
522+ SmallVector<IntrinsicInst *, 8 > Queue;
523+ Queue.push_back (II);
524+ while (!Queue.empty ()) {
525+ IntrinsicInst *Current = Queue.front ();
526+ Queue.erase (Queue.begin ());
527+
528+ // All the intermediate intrinsics will be deleted.
529+ DeadInsts.push_back (Current);
530+
531+ for (unsigned I = 0 ; I < 2 ; ++I) {
532+ Value *Op = Current->getOperand (I);
533+ if (auto *OpII = dyn_cast<IntrinsicInst>(Op))
534+ if (OpII->getIntrinsicID () == Intrinsic::vector_interleave2) {
535+ Queue.push_back (OpII);
536+ continue ;
537+ }
538+
539+ // If this is not a perfectly balanced tree, the leaf
540+ // result types would be different.
541+ if (!Operands.empty () && Op->getType () != Operands.back ()->getType ())
542+ return false ;
543+
544+ Operands.push_back (Op);
545+ }
546+ }
547+
548+ const unsigned Factor = Operands.size ();
549+ // Currently we only recognize power-of-two factors.
550+ // FIXME: should we assert here instead?
551+ if (Factor <= 1 || !isPowerOf2_32 (Factor))
552+ return false ;
553+
554+ interleaveLeafValues (Operands);
555+ return true ;
556+ }
557+
558+ static bool
559+ getVectorDeinterleaveFactor (IntrinsicInst *II,
560+ SmallVectorImpl<Value *> &Results,
561+ SmallVectorImpl<Instruction *> &DeadInsts) {
562+ assert (II->getIntrinsicID () == Intrinsic::vector_deinterleave2);
563+ using namespace PatternMatch ;
564+ if (!II->hasNUses (2 ))
565+ return false ;
566+
567+ // Visit with BFS
568+ SmallVector<IntrinsicInst *, 8 > Queue;
569+ Queue.push_back (II);
570+ while (!Queue.empty ()) {
571+ IntrinsicInst *Current = Queue.front ();
572+ Queue.erase (Queue.begin ());
573+ assert (Current->hasNUses (2 ));
574+
575+ // All the intermediate intrinsics will be deleted from the bottom-up.
576+ DeadInsts.insert (DeadInsts.begin (), Current);
577+
578+ ExtractValueInst *LHS = nullptr , *RHS = nullptr ;
579+ for (User *Usr : Current->users ()) {
580+ if (!isa<ExtractValueInst>(Usr))
581+ return 0 ;
582+
583+ auto *EV = cast<ExtractValueInst>(Usr);
584+ // Intermediate ExtractValue instructions will also be deleted.
585+ DeadInsts.insert (DeadInsts.begin (), EV);
586+ ArrayRef<unsigned > Indices = EV->getIndices ();
587+ if (Indices.size () != 1 )
588+ return false ;
589+
590+ if (Indices[0 ] == 0 && !LHS)
591+ LHS = EV;
592+ else if (Indices[0 ] == 1 && !RHS)
593+ RHS = EV;
594+ else
595+ return false ;
596+ }
597+
598+ // We have legal indices. At this point we're either going
599+ // to continue the traversal or push the leaf values into Results.
600+ for (ExtractValueInst *EV : {LHS, RHS}) {
601+ // Continue the traversal. We're playing safe here and matching only the
602+ // expression consisting of a perfectly balanced binary tree in which all
603+ // intermediate values are only used once.
604+ if (EV->hasOneUse () &&
605+ match (EV->user_back (),
606+ m_Intrinsic<Intrinsic::vector_deinterleave2>()) &&
607+ EV->user_back ()->hasNUses (2 )) {
608+ auto *EVUsr = cast<IntrinsicInst>(EV->user_back ());
609+ Queue.push_back (EVUsr);
610+ continue ;
611+ }
612+
613+ // If this is not a perfectly balanced tree, the leaf
614+ // result types would be different.
615+ if (!Results.empty () && EV->getType () != Results.back ()->getType ())
616+ return false ;
617+
618+ // Save the leaf value.
619+ Results.push_back (EV);
620+ }
621+ }
622+
623+ const unsigned Factor = Results.size ();
624+ // Currently we only recognize power-of-two factors.
625+ // FIXME: should we assert here instead?
626+ if (Factor <= 1 || !isPowerOf2_32 (Factor))
627+ return 0 ;
628+
629+ interleaveLeafValues (Results);
630+ return true ;
631+ }
632+
481633bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic (
482634 IntrinsicInst *DI, SmallSetVector<Instruction *, 32 > &DeadInsts) {
483635 LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand (0 ));
484636
485637 if (!LI || !LI->hasOneUse () || !LI->isSimple ())
486638 return false ;
487639
488- LLVM_DEBUG (dbgs () << " IA: Found a deinterleave intrinsic: " << *DI << " \n " );
640+ SmallVector<Value *, 8 > DeinterleaveValues;
641+ SmallVector<Instruction *, 8 > DeinterleaveDeadInsts;
642+ if (!getVectorDeinterleaveFactor (DI, DeinterleaveValues,
643+ DeinterleaveDeadInsts))
644+ return false ;
645+
646+ LLVM_DEBUG (dbgs () << " IA: Found a deinterleave intrinsic: " << *DI
647+ << " with factor = " << DeinterleaveValues.size () << " \n " );
489648
490649 // Try and match this with target specific intrinsics.
491- SmallVector<Instruction *, 4 > DeinterleaveDeadInsts;
492- if (!TLI->lowerDeinterleaveIntrinsicToLoad (DI, LI, DeinterleaveDeadInsts))
650+ if (!TLI->lowerDeinterleaveIntrinsicToLoad (LI, DeinterleaveValues))
493651 return false ;
494652
495653 DeadInsts.insert (DeinterleaveDeadInsts.begin (), DeinterleaveDeadInsts.end ());
496654 // We now have a target-specific load, so delete the old one.
497- DeadInsts.insert (DI);
498655 DeadInsts.insert (LI);
499656 return true ;
500657}
@@ -509,16 +666,20 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
509666 if (!SI || !SI->isSimple ())
510667 return false ;
511668
512- LLVM_DEBUG (dbgs () << " IA: Found an interleave intrinsic: " << *II << " \n " );
669+ SmallVector<Value *, 8 > InterleaveValues;
670+ SmallVector<Instruction *, 8 > InterleaveDeadInsts;
671+ if (!getVectorInterleaveFactor (II, InterleaveValues, InterleaveDeadInsts))
672+ return false ;
673+
674+ LLVM_DEBUG (dbgs () << " IA: Found an interleave intrinsic: " << *II
675+ << " with factor = " << InterleaveValues.size () << " \n " );
513676
514- SmallVector<Instruction *, 4 > InterleaveDeadInsts;
515677 // Try and match this with target specific intrinsics.
516- if (!TLI->lowerInterleaveIntrinsicToStore (II, SI, InterleaveDeadInsts ))
678+ if (!TLI->lowerInterleaveIntrinsicToStore (SI, InterleaveValues ))
517679 return false ;
518680
519681 // We now have a target-specific store, so delete the old one.
520682 DeadInsts.insert (SI);
521- DeadInsts.insert (II);
522683 DeadInsts.insert (InterleaveDeadInsts.begin (), InterleaveDeadInsts.end ());
523684 return true ;
524685}
0 commit comments