#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
@@ -478,23 +479,179 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
  return true;
}

+// For a (de)interleave tree like this:
+//
+//   A   C B   D
+//   |___| |___|
+//     |_____|
+//        |
+//     A B C D
+//
+// We will get ABCD at the end while the leaf operands/results
+// are ACBD, which are also what we initially collected in
+// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI
+// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need
+// to reorder them by interleaving these values.
+static void interleaveLeafValues(MutableArrayRef<Value *> SubLeaves) {
+  unsigned NumLeaves = SubLeaves.size();
+  if (NumLeaves == 2)
+    return;
+
+  assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1);
+
+  const unsigned HalfLeaves = NumLeaves / 2;
+  // Visit the sub-trees.
+  interleaveLeafValues(SubLeaves.take_front(HalfLeaves));
+  interleaveLeafValues(SubLeaves.drop_front(HalfLeaves));
+
+  SmallVector<Value *, 8> Buffer;
+  // a0 a1 a2 a3 b0 b1 b2 b3
+  // -> a0 b0 a1 b1 a2 b2 a3 b3
+  for (unsigned i = 0U; i < NumLeaves; ++i)
+    Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]);
+
+  llvm::copy(Buffer, SubLeaves.begin());
+}
+
+static bool
+getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
+                          SmallVectorImpl<Instruction *> &DeadInsts) {
+  assert(II->getIntrinsicID() == Intrinsic::vector_interleave2);
+
+  // Visit with BFS
+  SmallVector<IntrinsicInst *, 8> Queue;
+  Queue.push_back(II);
+  while (!Queue.empty()) {
+    IntrinsicInst *Current = Queue.front();
+    Queue.erase(Queue.begin());
+
+    // All the intermediate intrinsics will be deleted.
+    DeadInsts.push_back(Current);
+
+    for (unsigned I = 0; I < 2; ++I) {
+      Value *Op = Current->getOperand(I);
+      if (auto *OpII = dyn_cast<IntrinsicInst>(Op))
+        if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) {
+          Queue.push_back(OpII);
+          continue;
+        }
+
+      // If this is not a perfectly balanced tree, the leaf
+      // result types would be different.
+      if (!Operands.empty() && Op->getType() != Operands.back()->getType())
+        return false;
+
+      Operands.push_back(Op);
+    }
+  }
+
+  const unsigned Factor = Operands.size();
+  // Currently we only recognize power-of-two factors.
+  // FIXME: should we assert here instead?
+  if (Factor <= 1 || !isPowerOf2_32(Factor))
+    return false;
+
+  interleaveLeafValues(Operands);
+  return true;
+}
+
+static bool
+getVectorDeinterleaveFactor(IntrinsicInst *II,
+                            SmallVectorImpl<Value *> &Results,
+                            SmallVectorImpl<Instruction *> &DeadInsts) {
+  assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2);
+  using namespace PatternMatch;
+  if (!II->hasNUses(2))
+    return false;
+
+  // Visit with BFS
+  SmallVector<IntrinsicInst *, 8> Queue;
+  Queue.push_back(II);
+  while (!Queue.empty()) {
+    IntrinsicInst *Current = Queue.front();
+    Queue.erase(Queue.begin());
+    assert(Current->hasNUses(2));
+
+    // All the intermediate intrinsics will be deleted from the bottom-up.
+    DeadInsts.insert(DeadInsts.begin(), Current);
+
+    ExtractValueInst *LHS = nullptr, *RHS = nullptr;
+    for (User *Usr : Current->users()) {
+      if (!isa<ExtractValueInst>(Usr))
+        return false;
+
+      auto *EV = cast<ExtractValueInst>(Usr);
+      // Intermediate ExtractValue instructions will also be deleted.
+      DeadInsts.insert(DeadInsts.begin(), EV);
+      ArrayRef<unsigned> Indices = EV->getIndices();
+      if (Indices.size() != 1)
+        return false;
+
+      if (Indices[0] == 0 && !LHS)
+        LHS = EV;
+      else if (Indices[0] == 1 && !RHS)
+        RHS = EV;
+      else
+        return false;
+    }
+
+    // We have legal indices. At this point we're either going
+    // to continue the traversal or push the leaf values into Results.
+    for (ExtractValueInst *EV : {LHS, RHS}) {
+      // Continue the traversal. We're playing safe here and matching only the
+      // expression consisting of a perfectly balanced binary tree in which all
+      // intermediate values are only used once.
+      if (EV->hasOneUse() &&
+          match(EV->user_back(),
+                m_Intrinsic<Intrinsic::vector_deinterleave2>()) &&
+          EV->user_back()->hasNUses(2)) {
+        auto *EVUsr = cast<IntrinsicInst>(EV->user_back());
+        Queue.push_back(EVUsr);
+        continue;
+      }
+
+      // If this is not a perfectly balanced tree, the leaf
+      // result types would be different.
+      if (!Results.empty() && EV->getType() != Results.back()->getType())
+        return false;
+
+      // Save the leaf value.
+      Results.push_back(EV);
+    }
+  }
+
+  const unsigned Factor = Results.size();
+  // Currently we only recognize power-of-two factors.
+  // FIXME: should we assert here instead?
+  if (Factor <= 1 || !isPowerOf2_32(Factor))
+    return false;
+
+  interleaveLeafValues(Results);
+  return true;
+}
+
bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
    IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
  LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0));

  if (!LI || !LI->hasOneUse() || !LI->isSimple())
    return false;

-  LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+  SmallVector<Value *, 8> DeinterleaveValues;
+  SmallVector<Instruction *, 8> DeinterleaveDeadInsts;
+  if (!getVectorDeinterleaveFactor(DI, DeinterleaveValues,
+                                   DeinterleaveDeadInsts))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI
+                    << " with factor = " << DeinterleaveValues.size() << "\n");

  // Try and match this with target specific intrinsics.
-  SmallVector<Instruction *, 4> DeinterleaveDeadInsts;
-  if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI, DeinterleaveDeadInsts))
+  if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues))
    return false;

  DeadInsts.insert(DeinterleaveDeadInsts.begin(), DeinterleaveDeadInsts.end());
  // We now have a target-specific load, so delete the old one.
-  DeadInsts.insert(DI);
  DeadInsts.insert(LI);
  return true;
}
@@ -509,16 +666,20 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
  if (!SI || !SI->isSimple())
    return false;

-  LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
+  SmallVector<Value *, 8> InterleaveValues;
+  SmallVector<Instruction *, 8> InterleaveDeadInsts;
+  if (!getVectorInterleaveFactor(II, InterleaveValues, InterleaveDeadInsts))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II
+                    << " with factor = " << InterleaveValues.size() << "\n");

-  SmallVector<Instruction *, 4> InterleaveDeadInsts;
  // Try and match this with target specific intrinsics.
-  if (!TLI->lowerInterleaveIntrinsicToStore(II, SI, InterleaveDeadInsts))
+  if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues))
    return false;

  // We now have a target-specific store, so delete the old one.
  DeadInsts.insert(SI);
-  DeadInsts.insert(II);
  DeadInsts.insert(InterleaveDeadInsts.begin(), InterleaveDeadInsts.end());
  return true;
}
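
Below is a minimal standalone sketch, not part of the patch, illustrating the leaf reordering that interleaveLeafValues performs: for a factor-4 tree the leaves are collected as ACBD and must be rearranged to ABCD before they reach the TLI hooks. Plain chars stand in for the pass's Value * leaves, and the helper name and index-range interface are illustrative assumptions.

// Illustrative only: mirrors the recursion in interleaveLeafValues using a
// plain std::vector<char> instead of MutableArrayRef<Value *>.
#include <cassert>
#include <cstdio>
#include <vector>

static void interleaveLeaves(std::vector<char> &Leaves, size_t Begin,
                             size_t End) {
  size_t NumLeaves = End - Begin;
  if (NumLeaves == 2)
    return;
  // Only perfectly balanced (power-of-two) trees are handled, as in the pass.
  assert(NumLeaves > 1 && (NumLeaves & (NumLeaves - 1)) == 0);

  size_t Half = NumLeaves / 2;
  // Fix up both halves first (the two sub-trees).
  interleaveLeaves(Leaves, Begin, Begin + Half);
  interleaveLeaves(Leaves, Begin + Half, End);

  // a0 a1 ... b0 b1 ...  ->  a0 b0 a1 b1 ...
  std::vector<char> Buffer;
  for (size_t I = 0; I < NumLeaves; ++I)
    Buffer.push_back(Leaves[Begin + I / 2 + (I % 2 ? Half : 0)]);
  for (size_t I = 0; I < NumLeaves; ++I)
    Leaves[Begin + I] = Buffer[I];
}

int main() {
  // Leaf order produced by walking the factor-4 tree from the comment: ACBD.
  std::vector<char> Leaves = {'A', 'C', 'B', 'D'};
  interleaveLeaves(Leaves, 0, Leaves.size());
  // Prints "A B C D", the order the TLI hooks expect.
  for (char L : Leaves)
    std::printf("%c ", L);
  std::printf("\n");
  return 0;
}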