diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 59743dbe4d2ea..9411dc66b2931 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -94,6 +94,7 @@ class TargetRegisterClass; class TargetRegisterInfo; class TargetTransformInfo; class Value; +class VPIntrinsic; namespace Sched { @@ -3152,6 +3153,30 @@ class TargetLoweringBase { return false; } + /// Lower an interleaved load to target specific intrinsics. Return + /// true on success. + /// + /// \p Load is a vp.load instruction. + /// \p Mask is a mask value + /// \p DeinterleaveRes is a list of deinterleaved results. + virtual bool + lowerDeinterleavedIntrinsicToVPLoad(VPIntrinsic *Load, Value *Mask, + ArrayRef DeinterleaveRes) const { + return false; + } + + /// Lower an interleaved store to target specific intrinsics. Return + /// true on success. + /// + /// \p Store is the vp.store instruction. + /// \p Mask is a mask value + /// \p InterleaveOps is a list of values being interleaved. + virtual bool + lowerInterleavedIntrinsicToVPStore(VPIntrinsic *Store, Value *Mask, + ArrayRef InterleaveOps) const { + return false; + } + /// Lower a deinterleave intrinsic to a target specific load intrinsic. /// Return true on success. Currently only supports /// llvm.vector.deinterleave2 diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 3f6a69ecb7d72..3261f2858b236 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -630,11 +630,37 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, return true; } +// Return the corresponded deinterleaved mask, or nullptr if there is no valid +// mask. +static Value *getMask(Value *WideMask, unsigned Factor, + VectorType *LeafValueTy) { + using namespace llvm::PatternMatch; + if (auto *IMI = dyn_cast(WideMask)) { + SmallVector Operands; + SmallVector DeadInsts; + if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { + assert(!Operands.empty()); + if (Operands.size() == Factor && llvm::all_equal(Operands)) + return Operands[0]; + } + } + + if (match(WideMask, m_AllOnes())) { + // Scale the vector length of all-ones mask. + ElementCount OrigEC = + cast(WideMask->getType())->getElementCount(); + assert(OrigEC.getKnownMinValue() % Factor == 0); + return ConstantVector::getSplat(OrigEC.divideCoefficientBy(Factor), + cast(WideMask)->getSplatValue()); + } + + return nullptr; +} + bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( IntrinsicInst *DI, SmallSetVector &DeadInsts) { - LoadInst *LI = dyn_cast(DI->getOperand(0)); - - if (!LI || !LI->hasOneUse() || !LI->isSimple()) + Value *LoadedVal = DI->getOperand(0); + if (!LoadedVal->hasOneUse() || !isa(LoadedVal)) return false; SmallVector DeinterleaveValues; @@ -643,16 +669,43 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( DeinterleaveDeadInsts)) return false; - LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI - << " with factor = " << DeinterleaveValues.size() << "\n"); + const unsigned Factor = DeinterleaveValues.size(); - // Try and match this with target specific intrinsics. - if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues)) - return false; + if (auto *VPLoad = dyn_cast(LoadedVal)) { + if (VPLoad->getIntrinsicID() != Intrinsic::vp_load) + return false; + // Check mask operand. Handle both all-true and interleaved mask. 
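+    // getMask either peels the common operand off a vector.interleave tree of
+    // matching factor, or re-splats an all-ones mask at the narrower segment
+    // element count; any other wide mask is rejected.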
+ Value *WideMask = VPLoad->getOperand(1); + Value *Mask = getMask(WideMask, Factor, + cast(DeinterleaveValues[0]->getType())); + if (!Mask) + return false; + + LLVM_DEBUG(dbgs() << "IA: Found a vp.load with deinterleave intrinsic " + << *DI << " and factor = " << Factor << "\n"); + + // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special + // TLI function to emit target-specific interleaved instruction. + if (!TLI->lowerDeinterleavedIntrinsicToVPLoad(VPLoad, Mask, + DeinterleaveValues)) + return false; + + } else { + auto *LI = cast(LoadedVal); + if (!LI->isSimple()) + return false; + + LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI + << " and factor = " << Factor << "\n"); + + // Try and match this with target specific intrinsics. + if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues)) + return false; + } DeadInsts.insert(DeinterleaveDeadInsts.begin(), DeinterleaveDeadInsts.end()); // We now have a target-specific load, so delete the old one. - DeadInsts.insert(LI); + DeadInsts.insert(cast(LoadedVal)); return true; } @@ -660,10 +713,8 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( IntrinsicInst *II, SmallSetVector &DeadInsts) { if (!II->hasOneUse()) return false; - - StoreInst *SI = dyn_cast(*(II->users().begin())); - - if (!SI || !SI->isSimple()) + Value *StoredBy = II->user_back(); + if (!isa(StoredBy)) return false; SmallVector InterleaveValues; @@ -671,15 +722,41 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( if (!getVectorInterleaveFactor(II, InterleaveValues, InterleaveDeadInsts)) return false; - LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II - << " with factor = " << InterleaveValues.size() << "\n"); + const unsigned Factor = InterleaveValues.size(); - // Try and match this with target specific intrinsics. - if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues)) - return false; + if (auto *VPStore = dyn_cast(StoredBy)) { + if (VPStore->getIntrinsicID() != Intrinsic::vp_store) + return false; + + Value *WideMask = VPStore->getOperand(2); + Value *Mask = getMask(WideMask, Factor, + cast(InterleaveValues[0]->getType())); + if (!Mask) + return false; + + LLVM_DEBUG(dbgs() << "IA: Found a vp.store with interleave intrinsic " + << *II << " and factor = " << Factor << "\n"); + + // Since lowerInterleavedStore expects Shuffle and StoreInst, use special + // TLI function to emit target-specific interleaved instruction. + if (!TLI->lowerInterleavedIntrinsicToVPStore(VPStore, Mask, + InterleaveValues)) + return false; + } else { + auto *SI = cast(StoredBy); + if (!SI->isSimple()) + return false; + + LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic " << *II + << " and factor = " << Factor << "\n"); + + // Try and match this with target specific intrinsics. + if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues)) + return false; + } // We now have a target-specific store, so delete the old one. 
- DeadInsts.insert(SI); + DeadInsts.insert(cast(StoredBy)); DeadInsts.insert(InterleaveDeadInsts.begin(), InterleaveDeadInsts.end()); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 295fd315c56da..fc59b7e690ba1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -22529,6 +22530,231 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( return true; } +static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { + assert(N); + if (N == 1) + return true; + + if (isPowerOf2_32(N)) { + KnownBits KB = llvm::computeKnownBits(V, DL); + return KB.countMinTrailingZeros() >= Log2_32(N); + } + + using namespace PatternMatch; + // Right now we're only recognizing the simplest pattern. + uint64_t C; + return match(V, m_c_Mul(m_Value(), m_ConstantInt(C))) && C && C % N == 0; +} + +/// Lower an interleaved vp.load into a vlsegN intrinsic. +/// +/// E.g. Lower an interleaved vp.load (Factor = 2): +/// %l = call @llvm.vp.load.nxv64i8.p0(ptr %ptr, +/// %mask, +/// i32 %wide.rvl) +/// %dl = tail call { , } +/// @llvm.vector.deinterleave2.nxv64i8( +/// %l) +/// %r0 = extractvalue { , } %dl, 0 +/// %r1 = extractvalue { , } %dl, 1 +/// +/// Into: +/// %rvl = udiv %wide.rvl, 2 +/// %sl = call { , } +/// @llvm.riscv.vlseg2.mask.nxv32i8.i64( undef, +/// undef, +/// ptr %ptr, +/// %mask, +/// i64 %rvl, +/// i64 1) +/// %r0 = extractvalue { , } %sl, 0 +/// %r1 = extractvalue { , } %sl, 1 +/// +/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be +/// removed by the caller +/// TODO: We probably can loosen the dependency on matching extractvalue when +/// dealing with factor of 2 (extractvalue is still required for most of other +/// factors though). +bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( + VPIntrinsic *Load, Value *Mask, + ArrayRef DeinterleaveResults) const { + assert(Mask && "Expect a valid mask"); + assert(Load->getIntrinsicID() == Intrinsic::vp_load && + "Unexpected intrinsic"); + + const unsigned Factor = DeinterleaveResults.size(); + + auto *WideVTy = dyn_cast(Load->getType()); + // TODO: Support fixed vectors. + if (!WideVTy) + return false; + + unsigned WideNumElements = WideVTy->getElementCount().getKnownMinValue(); + assert(WideNumElements % Factor == 0 && + "ElementCount of a wide load must be divisible by interleave factor"); + auto *VTy = + VectorType::get(WideVTy->getScalarType(), WideNumElements / Factor, + WideVTy->isScalableTy()); + auto &DL = Load->getModule()->getDataLayout(); + Align Alignment = Load->getParamAlign(0).value_or( + DL.getABITypeAlign(WideVTy->getElementType())); + if (!isLegalInterleavedAccessType( + VTy, Factor, Alignment, + Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL)) + return false; + + IRBuilder<> Builder(Load); + Value *WideEVL = Load->getArgOperand(2); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. 
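+  // For example, with Factor = 2 and an EVL of 5 the narrowed EVL would be 2,
+  // so only four of the five requested elements would be covered.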
+ if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) + return false; + + auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); + Value *EVL = Builder.CreateZExt( + Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), + XLenTy); + + static const Intrinsic::ID IntrMaskIds[] = { + Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, + Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, + Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, + Intrinsic::riscv_vlseg8_mask, + }; + + unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); + unsigned NumElts = VTy->getElementCount().getKnownMinValue(); + Type *VecTupTy = TargetExtType::get( + Load->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), + NumElts * SEW / 8), + Factor); + + Value *PoisonVal = PoisonValue::get(VecTupTy); + + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( + Load->getModule(), IntrMaskIds[Factor - 2], + {VecTupTy, Mask->getType(), EVL->getType()}); + + Value *Operands[] = { + PoisonVal, + Load->getArgOperand(0), + Mask, + EVL, + ConstantInt::get(XLenTy, RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC), + ConstantInt::get(XLenTy, Log2_64(SEW))}; + + CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); + + SmallVector AggrTypes{Factor, VTy}; + Value *Return = + PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); + Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( + Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy}); + for (unsigned i = 0; i < Factor; ++i) { + Value *VecExtract = + Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)}); + Return = Builder.CreateInsertValue(Return, VecExtract, i); + } + + for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) { + // We have to create a brand new ExtractValue to replace each + // of these old ExtractValue instructions. + Value *NewEV = + Builder.CreateExtractValue(Return, {static_cast(Idx)}); + DIO->replaceAllUsesWith(NewEV); + } + + return true; +} + +/// Lower an interleaved vp.store into a vssegN intrinsic. +/// +/// E.g. Lower an interleaved vp.store (Factor = 2): +/// +/// %is = tail call +/// @llvm.vector.interleave2.nxv64i8( +/// %load0, +/// %load1 +/// %wide.rvl = shl nuw nsw i32 %rvl, 1 +/// tail call void @llvm.vp.store.nxv64i8.p0( +/// %is, ptr %ptr, +/// %mask, +/// i32 %wide.rvl) +/// +/// Into: +/// call void @llvm.riscv.vsseg2.mask.nxv32i8.i64( +/// %load1, +/// %load2, ptr %ptr, +/// %mask, +/// i64 %rvl) +bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( + VPIntrinsic *Store, Value *Mask, + ArrayRef InterleaveOperands) const { + assert(Mask && "Expect a valid mask"); + assert(Store->getIntrinsicID() == Intrinsic::vp_store && + "Unexpected intrinsic"); + + const unsigned Factor = InterleaveOperands.size(); + + auto *VTy = dyn_cast(InterleaveOperands[0]->getType()); + // TODO: Support fixed vectors. + if (!VTy) + return false; + + const DataLayout &DL = Store->getDataLayout(); + Align Alignment = Store->getParamAlign(1).value_or( + DL.getABITypeAlign(VTy->getElementType())); + if (!isLegalInterleavedAccessType( + VTy, Factor, Alignment, + Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL)) + return false; + + IRBuilder<> Builder(Store); + Value *WideEVL = Store->getArgOperand(3); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. 
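+  // Same reasoning as the load path: truncating division of a non-multiple
+  // EVL by Factor would silently drop the trailing lanes of the vp.store.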
+ if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor)) + return false; + + auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); + Value *EVL = Builder.CreateZExt( + Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), + XLenTy); + + static const Intrinsic::ID IntrMaskIds[] = { + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask, + }; + + unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); + unsigned NumElts = VTy->getElementCount().getKnownMinValue(); + Type *VecTupTy = TargetExtType::get( + Store->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Store->getContext()), + NumElts * SEW / 8), + Factor); + + Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration( + Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy}); + Value *StoredVal = PoisonValue::get(VecTupTy); + for (unsigned i = 0; i < Factor; ++i) + StoredVal = Builder.CreateCall( + VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)}); + + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( + Store->getModule(), IntrMaskIds[Factor - 2], + {VecTupTy, Mask->getType(), EVL->getType()}); + + Value *Operands[] = {StoredVal, Store->getArgOperand(1), Mask, EVL, + ConstantInt::get(XLenTy, Log2_64(SEW))}; + + Builder.CreateCall(VssegNFunc, Operands); + return true; +} + MachineInstr * RISCVTargetLowering::EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 77605a3076a80..e9dd8ff96fa37 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -910,6 +910,14 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleaveIntrinsicToStore( StoreInst *SI, ArrayRef InterleaveValues) const override; + bool lowerDeinterleavedIntrinsicToVPLoad( + VPIntrinsic *Load, Value *Mask, + ArrayRef DeinterleaveRes) const override; + + bool lowerInterleavedIntrinsicToVPStore( + VPIntrinsic *Store, Value *Mask, + ArrayRef InterleaveOps) const override; + bool supportKCFIBundles() const override { return true; } SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll new file mode 100644 index 0000000000000..e481891dfd52f --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -0,0 +1,816 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK,RV32 %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK,RV64 %s + +define {, } @load_factor2_v2(ptr %ptr, i32 %evl) { +; RV32-LABEL: load_factor2_v2: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg2e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor2_v2: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg2e32.v v8, (a0) +; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 + %wide.masked.load = call 
@llvm.vp.load.nxv4i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) + %deinterleaved.results = call { , } @llvm.vector.deinterleave2.nxv4i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } poison, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret { , } %res1 +} + +define {, , , } @load_factor4_v2(ptr %ptr, i32 %evl) { +; RV32-LABEL: load_factor4_v2: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg4e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor4_v2: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 34 +; RV64-NEXT: srli a1, a1, 34 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg4e32.v v8, (a0) +; RV64-NEXT: ret + %rvl = mul i32 %evl, 4 + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) + %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 1 + %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) + %t0 = extractvalue { , } %d1, 0 + %t2 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) + %t1 = extractvalue { , } %d2, 0 + %t3 = extractvalue { , } %d2, 1 + + %res0 = insertvalue { , , , } poison, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 +} + +define {, , , , , , , } @load_factor8_v2(ptr %ptr, i32 %evl) { +; RV32-LABEL: load_factor8_v2: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: srli a1, a1, 3 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg8e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor8_v2: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 35 +; RV64-NEXT: srli a1, a1, 35 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg8e32.v v8, (a0) +; RV64-NEXT: ret + %rvl = mul i32 %evl, 8 + %wide.masked.load = call @llvm.vp.load.nxv16i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) + %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 1 + %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0) + %d1.0 = extractvalue { , } %d1, 0 + %d1.1 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1) + %d2.0 = extractvalue { , } %d2, 0 + %d2.1 = extractvalue { , } %d2, 1 + + %d3 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.0) + %t0 = extractvalue { , } %d3, 0 + %t4 = extractvalue { , } %d3, 1 + %d4 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) + %t2 = extractvalue { , } %d4, 0 + %t6 = extractvalue { , } %d4, 1 + %d5 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.0) + %t1 = extractvalue { , } %d5, 0 + %t5 = extractvalue { , } %d5, 1 + %d6 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.1) + %t3 = extractvalue { , } %d6, 0 + %t7 = extractvalue { , } %d6, 1 + + %res0 = insertvalue { , , , , , , , } poison, %t0, 0 + %res1 = insertvalue { , , , , , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , , , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , , , , , } %res2, %t3, 3 + %res4 = insertvalue { , , , , , , , } %res3, %t4, 4 + %res5 = insertvalue { , , , , , , , } %res4, %t5, 5 + %res6 = insertvalue { , , , , , , , } %res5, %t6, 6 + %res7 = insertvalue { , , , , , , , } %res6, %t7, 
7 + ret { , , , , , , , } %res7 +} + +define void @store_factor2_v2( %v0, %v1, ptr %ptr, i32 %evl) { +; RV32-LABEL: store_factor2_v2: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vsseg2e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: store_factor2_v2: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsseg2e32.v v8, (a0) +; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 + %interleaved.vec = call @llvm.vector.interleave2.nxv2i32( %v0, %v1) + call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) + ret void +} + +define void @store_factor4_v2( %v0, %v1, ptr %ptr, i32 %evl) { +; RV32-LABEL: store_factor4_v2: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmv1r.v v10, v8 +; RV32-NEXT: vmv1r.v v11, v9 +; RV32-NEXT: vsseg4e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: store_factor4_v2: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 35 +; RV64-NEXT: srli a1, a1, 34 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmv1r.v v10, v8 +; RV64-NEXT: vmv1r.v v11, v9 +; RV64-NEXT: vsseg4e32.v v8, (a0) +; RV64-NEXT: ret + %rvl = mul i32 %evl, 8 + %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) + %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) + %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) + call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec2, ptr %ptr, splat (i1 true), i32 %rvl) + ret void +} + +define void @store_factor8_v2( %v0, %v1, ptr %ptr, i32 %evl) { +; RV32-LABEL: store_factor8_v2: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: srli a1, a1, 3 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmv1r.v v10, v8 +; RV32-NEXT: vmv1r.v v11, v9 +; RV32-NEXT: vmv1r.v v12, v8 +; RV32-NEXT: vmv1r.v v13, v9 +; RV32-NEXT: vmv1r.v v14, v8 +; RV32-NEXT: vmv1r.v v15, v9 +; RV32-NEXT: vsseg8e32.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: store_factor8_v2: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 35 +; RV64-NEXT: srli a1, a1, 35 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmv1r.v v10, v8 +; RV64-NEXT: vmv1r.v v11, v9 +; RV64-NEXT: vmv1r.v v12, v8 +; RV64-NEXT: vmv1r.v v13, v9 +; RV64-NEXT: vmv1r.v v14, v8 +; RV64-NEXT: vmv1r.v v15, v9 +; RV64-NEXT: vsseg8e32.v v8, (a0) +; RV64-NEXT: ret + %rvl = mul i32 %evl, 8 + %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) + %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) + %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) + %interleaved.vec3 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) + %interleaved.vec4 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) + %interleaved.vec5 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec3, %interleaved.vec4) + %interleaved.vec6 = call @llvm.vector.interleave2.nxv8i32( %interleaved.vec2, %interleaved.vec5) + call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec6, ptr %ptr, splat (i1 true), i32 %rvl) + ret void +} + +define {, } @masked_load_factor2_v2( %mask, ptr %ptr, i32 %evl) { +; RV32-LABEL: masked_load_factor2_v2: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg2e32.v v8, (a0), 
v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: masked_load_factor2_v2: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg2e32.v v8, (a0), v0.t +; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 + %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) + %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) + %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } poison, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret { , } %res1 +} + +define {, , , } @masked_load_factor4_v2( %mask, ptr %ptr, i32 %evl) { +; RV32-LABEL: masked_load_factor4_v2: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg4e32.v v8, (a0), v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: masked_load_factor4_v2: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 34 +; RV64-NEXT: srli a1, a1, 34 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg4e32.v v8, (a0), v0.t +; RV64-NEXT: ret + %rvl = mul i32 %evl, 4 + %interleaved.mask0 = call @llvm.vector.interleave2.nxv4i1( %mask, %mask) + %interleaved.mask1 = call @llvm.vector.interleave2.nxv4i1( %mask, %mask) + %interleaved.mask2 = call @llvm.vector.interleave2.nxv8i1( %interleaved.mask0, %interleaved.mask1) + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, %interleaved.mask2, i32 %rvl) + %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 1 + %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) + %t0 = extractvalue { , } %d1, 0 + %t2 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) + %t1 = extractvalue { , } %d2, 0 + %t3 = extractvalue { , } %d2, 1 + + %res0 = insertvalue { , , , } poison, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 +} + +define void @masked_store_factor2_v2( %mask, %v0, %v1, ptr %ptr, i32 %evl) { +; RV32-LABEL: masked_store_factor2_v2: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv1r.v v9, v8 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vsseg2e32.v v8, (a0), v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: masked_store_factor2_v2: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv1r.v v9, v8 +; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t +; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 + %interleaved.mask = tail call @llvm.vector.interleave2.nxv2i1( %mask, %mask) + %interleaved.vec = tail call @llvm.vector.interleave2.nxv2i32( %v0, %v0) + tail call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) + ret void +} + +define void @masked_load_store_factor2_v2_shared_mask( %mask, ptr %ptr, i32 %evl) { +; RV32-LABEL: masked_load_store_factor2_v2_shared_mask: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg2e32.v v8, (a0), 
v0.t +; RV32-NEXT: vsseg2e32.v v8, (a0), v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: masked_load_store_factor2_v2_shared_mask: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg2e32.v v8, (a0), v0.t +; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t +; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 + %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) + %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) + %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %interleaved.vec = tail call @llvm.vector.interleave2.nxv4i32( %t0, %t1) + tail call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) + ret void +} + +define i32 @masked_load_store_factor2_v2_shared_mask_extract( %mask, ptr %ptr, i32 %evl) { +; RV32-LABEL: masked_load_store_factor2_v2_shared_mask_extract: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV32-NEXT: vmv1r.v v8, v0 +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: li a2, -1 +; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; RV32-NEXT: vmerge.vim v11, v9, 1, v0 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: vwaddu.vv v12, v11, v11 +; RV32-NEXT: vwmaccu.vx v12, a2, v11 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV32-NEXT: vslidedown.vx v11, v12, a3 +; RV32-NEXT: vmerge.vim v10, v10, 1, v0 +; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV32-NEXT: vmsne.vi v0, v11, 0 +; RV32-NEXT: add a2, a3, a3 +; RV32-NEXT: vmerge.vim v9, v9, 1, v0 +; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; RV32-NEXT: vslideup.vx v10, v9, a3 +; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: slli a2, a1, 1 +; RV32-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; RV32-NEXT: vle32.v v10, (a0), v0.t +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wx v13, v10, a1 +; RV32-NEXT: vmv.x.s a1, v10 +; RV32-NEXT: vnsrl.wi v12, v10, 0 +; RV32-NEXT: srli a2, a2, 1 +; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; RV32-NEXT: vsseg2e32.v v12, (a0), v0.t +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: masked_load_store_factor2_v2_shared_mask_extract: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV64-NEXT: vmv1r.v v8, v0 +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: li a2, -1 +; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a4, a1, 33 +; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64-NEXT: vmerge.vim v11, v9, 1, v0 +; RV64-NEXT: srli a3, a3, 2 +; RV64-NEXT: vwaddu.vv v12, v11, v11 +; RV64-NEXT: vwmaccu.vx v12, a2, v11 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vslidedown.vx v11, v12, a3 +; RV64-NEXT: vmerge.vim v10, v10, 1, v0 +; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64-NEXT: vmsne.vi v0, v11, 0 +; RV64-NEXT: add a1, a3, a3 +; RV64-NEXT: vmerge.vim v9, v9, 1, v0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vslideup.vx v10, v9, a3 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: srli 
a1, a4, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV64-NEXT: vle32.v v10, (a0), v0.t +; RV64-NEXT: li a1, 32 +; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v13, v10, a1 +; RV64-NEXT: vmv.x.s a1, v10 +; RV64-NEXT: vnsrl.wi v12, v10, 0 +; RV64-NEXT: srli a4, a4, 33 +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vsetvli zero, a4, e32, m1, ta, ma +; RV64-NEXT: vsseg2e32.v v12, (a0), v0.t +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 + %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) + %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) + %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %r0 = extractelement %wide.masked.load, i32 0 + %interleaved.vec = tail call @llvm.vector.interleave2.nxv4i32( %t0, %t1) + tail call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) + ret i32 %r0 +} + +define void @masked_store_factor4_v2( %mask, %v0, %v1, ptr %ptr, i32 %evl) { +; RV32-LABEL: masked_store_factor4_v2: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmv1r.v v10, v8 +; RV32-NEXT: vmv1r.v v11, v9 +; RV32-NEXT: vsseg4e32.v v8, (a0), v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: masked_store_factor4_v2: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 34 +; RV64-NEXT: srli a1, a1, 34 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmv1r.v v10, v8 +; RV64-NEXT: vmv1r.v v11, v9 +; RV64-NEXT: vsseg4e32.v v8, (a0), v0.t +; RV64-NEXT: ret + %rvl = mul i32 %evl, 4 + %interleaved.mask0 = call @llvm.vector.interleave2.nxv2i1( %mask, %mask) + %interleaved.mask1 = call @llvm.vector.interleave2.nxv2i1( %mask, %mask) + %interleaved.mask2 = call @llvm.vector.interleave2.nxv4i1( %interleaved.mask0, %interleaved.mask1) + %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) + %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) + %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) + call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec2, ptr %ptr, %interleaved.mask2, i32 %rvl) + ret void +} + +; Negative tests + +; We should not transform this function because the deinterleave tree is not in a desired form. 
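+; Both %d0.0 and %d0.1 extract index 0 of %d0, and %t1 and %t3 both extract
+; index 1 of %d2, so the extractvalue indices do not form a factor-4 tree.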
+define {, , , } @incorrect_extract_value_index(ptr %ptr, i32 %evl) { +; RV32-LABEL: incorrect_extract_value_index: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vnsrl.wi v12, v8, 0 +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wx v9, v12, a0 +; RV32-NEXT: vnsrl.wi v8, v12, 0 +; RV32-NEXT: vmv.v.v v10, v9 +; RV32-NEXT: vmv.v.v v11, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: incorrect_extract_value_index: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 34 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV64-NEXT: vnsrl.wi v12, v8, 0 +; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v9, v12, a0 +; RV64-NEXT: vnsrl.wi v8, v12, 0 +; RV64-NEXT: vmv.v.v v10, v9 +; RV64-NEXT: vmv.v.v v11, v9 +; RV64-NEXT: ret + %rvl = mul i32 %evl, 4 + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) + %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 0 + %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) + %t0 = extractvalue { , } %d1, 0 + %t2 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) + %t1 = extractvalue { , } %d2, 1 + %t3 = extractvalue { , } %d2, 1 + + %res0 = insertvalue { , , , } poison, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 +} + +; We should not transform this function because the expression is not a balanced tree. 
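+; %t0 is taken directly from the first deinterleave2 level, while the other
+; leaves come from one and two levels deeper, so the leaf factors do not match.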
+define {, , , } @not_balanced_load_tree(ptr %ptr, i32 %evl) { +; RV32-LABEL: not_balanced_load_tree: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vnsrl.wx v8, v12, a0 +; RV32-NEXT: vnsrl.wi v16, v12, 0 +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wi v10, v16, 0 +; RV32-NEXT: vnsrl.wx v11, v16, a0 +; RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; RV32-NEXT: vnsrl.wx v12, v11, a0 +; RV32-NEXT: vnsrl.wi v11, v11, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: not_balanced_load_tree: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 34 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0) +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV64-NEXT: vnsrl.wx v8, v12, a0 +; RV64-NEXT: vnsrl.wi v16, v12, 0 +; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wi v10, v16, 0 +; RV64-NEXT: vnsrl.wx v11, v16, a0 +; RV64-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; RV64-NEXT: vnsrl.wx v12, v11, a0 +; RV64-NEXT: vnsrl.wi v11, v11, 0 +; RV64-NEXT: ret + %rvl = mul i32 %evl, 4 + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) + %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %t0 = extractvalue { , } %d0, 1 + %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) + %t1 = extractvalue { , } %d1, 0 + %d1.1 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) + %t2 = extractvalue { , } %d2, 0 + %t3 = extractvalue { , } %d2, 1 + + %res0 = insertvalue { , , , } poison, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 +} + +define void @not_balanced_store_tree( %v0, %v1, %v2, ptr %ptr, i32 %evl) { +; RV32-LABEL: not_balanced_store_tree: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; RV32-NEXT: vwaddu.vv v12, v8, v8 +; RV32-NEXT: li a2, -1 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: vwmaccu.vx v12, a2, v8 +; RV32-NEXT: srli a3, a3, 3 +; RV32-NEXT: vsetvli a4, zero, e32, m1, ta, ma +; RV32-NEXT: vslidedown.vx v8, v12, a3 +; RV32-NEXT: add a4, a3, a3 +; RV32-NEXT: vsetvli zero, a4, e32, m1, ta, ma +; RV32-NEXT: vslideup.vx v12, v8, a3 +; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV32-NEXT: vwaddu.vv v14, v12, v9 +; RV32-NEXT: vwmaccu.vx v14, a2, v9 +; RV32-NEXT: vsetvli a3, zero, e32, m2, ta, ma +; RV32-NEXT: vwaddu.vv v16, v14, v10 +; RV32-NEXT: vwmaccu.vx v16, a2, v10 +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-NEXT: vse32.v v16, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: not_balanced_store_tree: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; RV64-NEXT: vwaddu.vv v12, v8, v8 +; RV64-NEXT: li a2, -1 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a1, a1, 34 +; RV64-NEXT: vwmaccu.vx v12, a2, v8 +; RV64-NEXT: srli a3, a3, 3 +; RV64-NEXT: vsetvli a4, zero, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vx v8, v12, a3 +; RV64-NEXT: add a4, a3, a3 +; RV64-NEXT: vsetvli zero, a4, e32, m1, ta, ma +; RV64-NEXT: vslideup.vx v12, v8, a3 +; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV64-NEXT: vwaddu.vv v14, v12, v9 +; RV64-NEXT: vwmaccu.vx v14, a2, v9 +; RV64-NEXT: vsetvli a3, zero, e32, m2, ta, ma +; 
RV64-NEXT: vwaddu.vv v16, v14, v10 +; RV64-NEXT: vwmaccu.vx v16, a2, v10 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV64-NEXT: vse32.v v16, (a0) +; RV64-NEXT: ret + %rvl = mul i32 %evl, 4 + %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) + %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %interleaved.vec0, %v1) + %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec1, %v2) + call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec2, ptr %ptr, splat (i1 true), i32 %rvl) + ret void +} + +; We only support scalable vectors for now. +define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %ptr, i32 %evl) { +; RV32-LABEL: not_scalable_vectors: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wx v12, v8, a0 +; RV32-NEXT: vnsrl.wi v11, v8, 0 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vnsrl.wx v10, v11, a0 +; RV32-NEXT: vnsrl.wi v8, v11, 0 +; RV32-NEXT: vnsrl.wx v11, v12, a0 +; RV32-NEXT: vnsrl.wi v9, v12, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: not_scalable_vectors: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 34 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v12, v8, a0 +; RV64-NEXT: vnsrl.wi v11, v8, 0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vnsrl.wx v10, v11, a0 +; RV64-NEXT: vnsrl.wi v8, v11, 0 +; RV64-NEXT: vnsrl.wx v11, v12, a0 +; RV64-NEXT: vnsrl.wi v9, v12, 0 +; RV64-NEXT: ret + %rvl = mul i32 %evl, 4 + %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %rvl) + %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load) + %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0 + %d0.1 = extractvalue { <4 x i32>, <4 x i32> } %d0, 1 + %d1 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.0) + %t0 = extractvalue { <2 x i32>, <2 x i32> } %d1, 0 + %t2 = extractvalue { <2 x i32>, <2 x i32> } %d1, 1 + %d2 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.1) + %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0 + %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1 + + %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> %t0, 0 + %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 + %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2 + %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3 + ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3 +} + +define {, } @not_same_mask( %mask0, %mask1, ptr %ptr, i32 %evl) { +; RV32-LABEL: not_same_mask: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV32-NEXT: vmv1r.v v9, v0 +; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: li a2, -1 +; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; RV32-NEXT: vmerge.vim v11, v8, 1, v0 +; RV32-NEXT: vmv1r.v v0, v9 +; RV32-NEXT: vmerge.vim v9, v8, 1, v0 +; RV32-NEXT: srli a3, a3, 2 +; RV32-NEXT: vwaddu.vv v12, v9, v11 +; 
RV32-NEXT: vwmaccu.vx v12, a2, v11 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV32-NEXT: vslidedown.vx v9, v12, a3 +; RV32-NEXT: vmerge.vim v10, v10, 1, v0 +; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV32-NEXT: vmsne.vi v0, v9, 0 +; RV32-NEXT: add a2, a3, a3 +; RV32-NEXT: vmerge.vim v8, v8, 1, v0 +; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; RV32-NEXT: vslideup.vx v10, v8, a3 +; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV32-NEXT: vle32.v v10, (a0), v0.t +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wx v9, v10, a0 +; RV32-NEXT: vnsrl.wi v8, v10, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: not_same_mask: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV64-NEXT: vmv1r.v v9, v0 +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: li a2, -1 +; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; RV64-NEXT: vmerge.vim v11, v8, 1, v0 +; RV64-NEXT: vmv1r.v v0, v9 +; RV64-NEXT: vmerge.vim v9, v8, 1, v0 +; RV64-NEXT: srli a3, a3, 2 +; RV64-NEXT: vwaddu.vv v12, v9, v11 +; RV64-NEXT: vwmaccu.vx v12, a2, v11 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV64-NEXT: vslidedown.vx v9, v12, a3 +; RV64-NEXT: vmerge.vim v10, v10, 1, v0 +; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV64-NEXT: vmsne.vi v0, v9, 0 +; RV64-NEXT: add a2, a3, a3 +; RV64-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; RV64-NEXT: vslideup.vx v10, v8, a3 +; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV64-NEXT: vle32.v v10, (a0), v0.t +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v9, v10, a0 +; RV64-NEXT: vnsrl.wi v8, v10, 0 +; RV64-NEXT: ret + %rvl = mul i32 %evl, 2 + %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask0, %mask1) + %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) + %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 0 + %t1 = extractvalue { , } %deinterleaved.results, 1 + %res0 = insertvalue { , } poison, %t0, 0 + %res1 = insertvalue { , } %res0, %t1, 1 + ret { , } %res1 +} + +; EVL should be a multiple of factor +define {, , , } @invalid_evl(ptr %ptr, i32 %evl) { +; RV32-LABEL: invalid_evl: +; RV32: # %bb.0: +; RV32-NEXT: ori a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vnsrl.wx v12, v8, a0 +; RV32-NEXT: vnsrl.wi v14, v8, 0 +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vnsrl.wx v10, v14, a0 +; RV32-NEXT: vnsrl.wi v8, v14, 0 +; RV32-NEXT: vnsrl.wx v11, v12, a0 +; RV32-NEXT: vnsrl.wi v9, v12, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: invalid_evl: +; RV64: # %bb.0: +; RV64-NEXT: ori a1, a1, 1 +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; 
RV64-NEXT: vnsrl.wx v12, v8, a0 +; RV64-NEXT: vnsrl.wi v14, v8, 0 +; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: vnsrl.wx v10, v14, a0 +; RV64-NEXT: vnsrl.wi v8, v14, 0 +; RV64-NEXT: vnsrl.wx v11, v12, a0 +; RV64-NEXT: vnsrl.wi v9, v12, 0 +; RV64-NEXT: ret + %rvl = or i32 %evl, 1 + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) + %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) + %d0.0 = extractvalue { , } %d0, 0 + %d0.1 = extractvalue { , } %d0, 1 + %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) + %t0 = extractvalue { , } %d1, 0 + %t2 = extractvalue { , } %d1, 1 + %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) + %t1 = extractvalue { , } %d2, 0 + %t3 = extractvalue { , } %d2, 1 + + %res0 = insertvalue { , , , } poison, %t0, 0 + %res1 = insertvalue { , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , } %res2, %t3, 3 + ret { , , , } %res3 +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}}