60
60
#include " llvm/IR/Instruction.h"
61
61
#include " llvm/IR/Instructions.h"
62
62
#include " llvm/IR/IntrinsicInst.h"
63
+ #include " llvm/IR/PatternMatch.h"
63
64
#include " llvm/InitializePasses.h"
64
65
#include " llvm/Pass.h"
65
66
#include " llvm/Support/Casting.h"
@@ -478,23 +479,184 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
478
479
return true ;
479
480
}
480
481
482
+ // For an (de)interleave tree like this:
483
+ //
484
+ // A C B D
485
+ // |___| |___|
486
+ // |_____|
487
+ // |
488
+ // A B C D
489
+ //
490
+ // We will get ABCD at the end while the leaf operands/results
491
+ // are ACBD, which are also what we initially collected in
492
+ // getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI
493
+ // hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need
494
+ // to reorder them by interleaving these values.
495
+ static void interleaveLeafValues (MutableArrayRef<Value *> SubLeaves) {
496
+ int NumLeaves = SubLeaves.size ();
497
+ if (NumLeaves == 2 )
498
+ return ;
499
+
500
+ assert (isPowerOf2_32 (NumLeaves) && NumLeaves > 1 );
501
+
502
+ const int HalfLeaves = NumLeaves / 2 ;
503
+ // Visit the sub-trees.
504
+ interleaveLeafValues (SubLeaves.take_front (HalfLeaves));
505
+ interleaveLeafValues (SubLeaves.drop_front (HalfLeaves));
506
+
507
+ SmallVector<Value *, 8 > Buffer;
508
+ // The step is alternating between +half and -half+1. We exit the
509
+ // loop right before the last element because given the fact that
510
+ // SubLeaves always has an even number of elements, the last element
511
+ // will never be moved and the last to be visited. This simplifies
512
+ // the exit condition.
513
+ for (int i = 0 ; i < NumLeaves - 1 ;
514
+ (i < HalfLeaves) ? i += HalfLeaves : i += (1 - HalfLeaves))
515
+ Buffer.push_back (SubLeaves[i]);
516
+
517
+ llvm::copy (Buffer, SubLeaves.begin ());
518
+ }
519
+
520
+ static bool
521
+ getVectorInterleaveFactor (IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
522
+ SmallVectorImpl<Instruction *> &DeadInsts) {
523
+ if (II->getIntrinsicID () != Intrinsic::vector_interleave2)
524
+ return false ;
525
+
526
+ // Visit with BFS
527
+ SmallVector<IntrinsicInst *, 8 > Queue;
528
+ Queue.push_back (II);
529
+ while (!Queue.empty ()) {
530
+ IntrinsicInst *Current = Queue.front ();
531
+ Queue.erase (Queue.begin ());
532
+
533
+ // All the intermediate intrinsics will be deleted.
534
+ DeadInsts.push_back (Current);
535
+
536
+ for (unsigned I = 0 ; I < 2 ; ++I) {
537
+ Value *Op = Current->getOperand (I);
538
+ if (auto *OpII = dyn_cast<IntrinsicInst>(Op))
539
+ if (OpII->getIntrinsicID () == Intrinsic::vector_interleave2) {
540
+ Queue.push_back (OpII);
541
+ continue ;
542
+ }
543
+
544
+ // If this is not a perfectly balanced tree, the leaf
545
+ // result types would be different.
546
+ if (!Operands.empty () && Op->getType () != Operands.back ()->getType ())
547
+ return false ;
548
+
549
+ Operands.push_back (Op);
550
+ }
551
+ }
552
+
553
+ const unsigned Factor = Operands.size ();
554
+ // Currently we only recognize power-of-two factors.
555
+ // FIXME: should we assert here instead?
556
+ if (Factor <= 1 || !isPowerOf2_32 (Factor))
557
+ return false ;
558
+
559
+ interleaveLeafValues (Operands);
560
+ return true ;
561
+ }
562
+
563
+ static bool
564
+ getVectorDeinterleaveFactor (IntrinsicInst *II,
565
+ SmallVectorImpl<Value *> &Results,
566
+ SmallVectorImpl<Instruction *> &DeadInsts) {
567
+ using namespace PatternMatch ;
568
+ if (II->getIntrinsicID () != Intrinsic::vector_deinterleave2 ||
569
+ !II->hasNUses (2 ))
570
+ return false ;
571
+
572
+ // Visit with BFS
573
+ SmallVector<IntrinsicInst *, 8 > Queue;
574
+ Queue.push_back (II);
575
+ while (!Queue.empty ()) {
576
+ IntrinsicInst *Current = Queue.front ();
577
+ Queue.erase (Queue.begin ());
578
+ assert (Current->hasNUses (2 ));
579
+
580
+ // All the intermediate intrinsics will be deleted from the bottom-up.
581
+ DeadInsts.insert (DeadInsts.begin (), Current);
582
+
583
+ ExtractValueInst *LHS = nullptr , *RHS = nullptr ;
584
+ for (User *Usr : Current->users ()) {
585
+ if (!isa<ExtractValueInst>(Usr))
586
+ return 0 ;
587
+
588
+ auto *EV = cast<ExtractValueInst>(Usr);
589
+ // Intermediate ExtractValue instructions will also be deleted.
590
+ DeadInsts.insert (DeadInsts.begin (), EV);
591
+ ArrayRef<unsigned > Indices = EV->getIndices ();
592
+ if (Indices.size () != 1 )
593
+ return false ;
594
+
595
+ if (Indices[0 ] == 0 && !LHS)
596
+ LHS = EV;
597
+ else if (Indices[0 ] == 1 && !RHS)
598
+ RHS = EV;
599
+ else
600
+ return false ;
601
+ }
602
+
603
+ // We have legal indices. At this point we're either going
604
+ // to continue the traversal or push the leaf values into Results.
605
+ for (ExtractValueInst *EV : {LHS, RHS}) {
606
+ // Continue the traversal. We're playing safe here and matching only the
607
+ // expression consisting of a perfectly balanced binary tree in which all
608
+ // intermediate values are only used once.
609
+ if (EV->hasOneUse () &&
610
+ match (EV->user_back (),
611
+ m_Intrinsic<Intrinsic::vector_deinterleave2>()) &&
612
+ EV->user_back ()->hasNUses (2 )) {
613
+ auto *EVUsr = cast<IntrinsicInst>(EV->user_back ());
614
+ Queue.push_back (EVUsr);
615
+ continue ;
616
+ }
617
+
618
+ // If this is not a perfectly balanced tree, the leaf
619
+ // result types would be different.
620
+ if (!Results.empty () && EV->getType () != Results.back ()->getType ())
621
+ return false ;
622
+
623
+ // Save the leaf value.
624
+ Results.push_back (EV);
625
+ }
626
+ }
627
+
628
+ const unsigned Factor = Results.size ();
629
+ // Currently we only recognize power-of-two factors.
630
+ // FIXME: should we assert here instead?
631
+ if (Factor <= 1 || !isPowerOf2_32 (Factor))
632
+ return 0 ;
633
+
634
+ interleaveLeafValues (Results);
635
+ return true ;
636
+ }
637
+
481
638
// Tries to replace a vector.deinterleave2 tree fed by a load with a
// target-specific load via TLI->lowerDeinterleaveIntrinsicToLoad.
//
// Bails out (returns false) unless operand 0 of DI is a simple LoadInst with a
// single use, getVectorDeinterleaveFactor matches a tree rooted at DI, and the
// TLI hook succeeds. On success the instructions collected by
// getVectorDeinterleaveFactor and the original load are added to DeadInsts and
// true is returned.
//
// NOTE: this span is a rendered diff hunk. Bare numbers are old/new line
// numbers, lines starting with '-' are removed by the patch and lines starting
// with '+' are added by it.
bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic (
482
639
IntrinsicInst *DI, SmallSetVector<Instruction *, 32 > &DeadInsts) {
483
640
LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand (0 ));
484
641
485
642
if (!LI || !LI->hasOneUse () || !LI->isSimple ())
486
643
return false ;
487
644
488
- LLVM_DEBUG (dbgs () << " IA: Found a deinterleave intrinsic: " << *DI << " \n " );
645
+ SmallVector<Value *, 8 > DeinterleaveValues;
646
+ SmallVector<Instruction *, 8 > DeinterleaveDeadInsts;
647
+ if (!getVectorDeinterleaveFactor (DI, DeinterleaveValues,
648
+ DeinterleaveDeadInsts))
649
+ return false ;
650
+
651
+ LLVM_DEBUG (dbgs () << " IA: Found a deinterleave intrinsic: " << *DI
652
+ << " with factor = " << DeinterleaveValues.size () << " \n " );
489
653
490
654
// Try and match this with target specific intrinsics.
491
- SmallVector<Instruction *, 4 > DeinterleaveDeadInsts;
492
- if (!TLI->lowerDeinterleaveIntrinsicToLoad (DI, LI, DeinterleaveDeadInsts))
655
// With the patch applied the hook receives the reordered leaf values rather
// than the dead-instruction list.
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad (DI, LI, DeinterleaveValues))
493
656
return false ;
494
657
495
658
DeadInsts.insert (DeinterleaveDeadInsts.begin (), DeinterleaveDeadInsts.end ());
496
659
// We now have a target-specific load, so delete the old one.
497
- DeadInsts.insert (DI);
498
660
DeadInsts.insert (LI);
499
661
return true ;
500
662
}
@@ -509,16 +671,20 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
509
671
if (!SI || !SI->isSimple ())
510
672
return false ;
511
673
512
- LLVM_DEBUG (dbgs () << " IA: Found an interleave intrinsic: " << *II << " \n " );
674
+ SmallVector<Value *, 8 > InterleaveValues;
675
+ SmallVector<Instruction *, 8 > InterleaveDeadInsts;
676
+ if (!getVectorInterleaveFactor (II, InterleaveValues, InterleaveDeadInsts))
677
+ return false ;
678
+
679
+ LLVM_DEBUG (dbgs () << " IA: Found an interleave intrinsic: " << *II
680
+ << " with factor = " << InterleaveValues.size () << " \n " );
513
681
514
- SmallVector<Instruction *, 4 > InterleaveDeadInsts;
515
682
// Try and match this with target specific intrinsics.
516
- if (!TLI->lowerInterleaveIntrinsicToStore (II, SI, InterleaveDeadInsts ))
683
+ if (!TLI->lowerInterleaveIntrinsicToStore (II, SI, InterleaveValues ))
517
684
return false ;
518
685
519
686
// We now have a target-specific store, so delete the old one.
520
687
DeadInsts.insert (SI);
521
- DeadInsts.insert (II);
522
688
DeadInsts.insert (InterleaveDeadInsts.begin (), InterleaveDeadInsts.end ());
523
689
return true ;
524
690
}
0 commit comments