diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 33c85c7ba9d29..8eede401d0fa2 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -23943,6 +23943,92 @@ Examples:
       %also.r = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %ptr, i32 2, <8 x i1> %mask, <8 x i8> poison)
 
+.. _int_experimental_vp_ff_load:
+
+'``llvm.experimental.vp.load.ff``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare {<4 x float>, i32} @llvm.experimental.vp.load.ff.v4f32.p0(ptr %ptr, <4 x i1> %mask, i32 %evl)
+      declare {<vscale x 2 x i16>, i32} @llvm.experimental.vp.load.ff.nxv2i16.p0(ptr %ptr, <vscale x 2 x i1> %mask, i32 %evl)
+      declare {<8 x float>, i32} @llvm.experimental.vp.load.ff.v8f32.p1(ptr addrspace(1) %ptr, <8 x i1> %mask, i32 %evl)
+      declare {<vscale x 1 x i64>, i32} @llvm.experimental.vp.load.ff.nxv1i64.p6(ptr addrspace(6) %ptr, <vscale x 1 x i1> %mask, i32 %evl)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vp.load.ff.*``' intrinsic is similar to
+'``llvm.vp.load.*``', but does not trap if there are fewer than ``evl``
+readable lanes at the pointer. '``ff``' stands for fault-first or
+fault-only-first.
+
+Arguments:
+""""""""""
+
+The first argument is the base pointer for the load. The second argument is a
+vector of boolean values with the same number of elements as the first return
+type. The third is the explicit vector length of the operation. The first
+return type and the underlying type of the base pointer are the same vector
+type.
+
+The :ref:`align <attr_align>` parameter attribute can be provided for the
+first argument.
+
+Semantics:
+""""""""""
+
+The '``llvm.experimental.vp.load.ff``' intrinsic is designed for reading
+vector lanes in a single IR operation when the number of lanes that can be
+read is not known and can only be determined by looking at the data. This is
+useful for vectorizing strcmp- or strlen-like loops where the data contains a
+null terminator. Some targets have a fault-only-first load instruction that
+this intrinsic can be lowered to. Other targets may support this intrinsic
+differently, for example by lowering to a single scalar load guarded by
+``evl!=0`` and ``mask[0]==1`` and indicating that only 1 lane could be read.
+
+Like '``llvm.vp.load``', this intrinsic reads memory based on a ``mask`` and
+an ``evl``. If ``evl`` is non-zero and the first lane is masked-on, then the
+first lane of the vector needs to be inbounds of an allocation. The remaining
+masked-on lanes with index less than ``evl`` do not need to be inbounds of
+the same allocation or of any allocation.
+
+The second return value from the intrinsic indicates the index of the first
+lane that could not be read for some reason, or ``evl`` if all lanes could be
+read. Lanes at this index or higher in the first return value are
+:ref:`poison values <poisonvalues>`. If ``evl`` is non-zero, the result in
+the second return value must be at least 1, even if the first lane is
+masked-off.
+
+The second result is usually less than ``evl`` when an exception would occur
+for reading that lane, but it can be reduced for any reason. This facilitates
+emulating this intrinsic when the hardware only supports narrower vector
+types natively or when the hardware does not support fault-only-first loads.
+
+Masked-on lanes that are not inbounds of the allocation that contains the
+first lane are :ref:`poison values <poisonvalues>`. There should be a marker
+in the allocation that indicates where valid data stops, such as a null
+terminator. The terminator should be checked for after calling this intrinsic
+to prevent using any lanes past the terminator. Even if the second return
+value is less than ``evl``, the terminator value may not have been read.
+
+This intrinsic will typically be called in a loop until a terminator is
+found. The second result indicates how many elements are valid to search for
+the terminator. If the terminator is not found, the pointer should be
+advanced by the number of elements in the second result and the intrinsic
+called again.
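+As an illustrative sketch only (not mandated by this intrinsic's definition;
+block and value names are placeholders), a strlen-style loop over a
+null-terminated string might use an all-ones mask and request 8 lanes per
+iteration:
+
+.. code-block:: text
+
+      loop:
+        %p = phi ptr [ %str, %entry ], [ %p.next, %not.found ]
+        %ff = call {<8 x i8>, i32} @llvm.experimental.vp.load.ff.v8i8.p0(ptr %p, <8 x i1> splat (i1 true), i32 8)
+        %data = extractvalue {<8 x i8>, i32} %ff, 0
+        %n = extractvalue {<8 x i8>, i32} %ff, 1
+        ; Only the first %n lanes of %data are valid. Search them for a zero
+        ; byte; if none is found, advance the pointer by %n and loop again.
+        %p.next = getelementptr i8, ptr %p, i32 %n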
+The default alignment is taken as the ABI alignment of the first return
+type as specified by the :ref:`datalayout string <langref_datalayout>`.
+
+Examples:
+"""""""""
+
+.. code-block:: text
+
+      %r = call {<8 x i8>, i32} @llvm.experimental.vp.load.ff.v8i8.p0(ptr align 2 %ptr, <8 x i1> %mask, i32 %evl)
+
 .. _int_vp_store:
 
 '``llvm.vp.store``' Intrinsic
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 15a2370e5d8b8..3fc45237fd2e5 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1572,6 +1572,8 @@ class SelectionDAG {
   SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl,
                              ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
                              ISD::MemIndexType IndexType);
+  SDValue getLoadFFVP(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
+                      SDValue Mask, SDValue EVL, MachineMemOperand *MMO);
 
   SDValue getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr, EVT MemVT,
                       MachineMemOperand *MMO);
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 20283ad8f2689..007055d88424b 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -3057,6 +3057,23 @@ class MaskedHistogramSDNode : public MaskedGatherScatterSDNode {
   }
 };
 
+class VPLoadFFSDNode : public MemSDNode {
+public:
+  friend class SelectionDAG;
+
+  VPLoadFFSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+                 MachineMemOperand *MMO)
+      : MemSDNode(ISD::VP_LOAD_FF, Order, dl, VTs, MemVT, MMO) {}
+
+  const SDValue &getBasePtr() const { return getOperand(1); }
+  const SDValue &getMask() const { return getOperand(2); }
+  const SDValue &getVectorLength() const { return getOperand(3); }
+
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == ISD::VP_LOAD_FF;
+  }
+};
+
 class FPStateAccessSDNode : public MemSDNode {
 public:
   friend class SelectionDAG;
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 62239ca705b9e..f28d21af9f5ab 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1912,6 +1912,12 @@ def int_vp_load : DefaultAttrsIntrinsic<[ llvm_anyvector_ty],
                                           llvm_i32_ty],
                                         [ NoCapture<ArgIndex<0>>, IntrNoSync, IntrReadMem, IntrWillReturn, IntrArgMemOnly ]>;
 
+def int_experimental_vp_load_ff : DefaultAttrsIntrinsic<[ llvm_anyvector_ty, llvm_i32_ty ],
+                                                        [ llvm_anyptr_ty,
+                                                          LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                                          llvm_i32_ty],
+                                                        [ NoCapture<ArgIndex<0>>, IntrNoSync, IntrReadMem, IntrWillReturn, IntrArgMemOnly ]>;
+
 def int_vp_gather: DefaultAttrsIntrinsic<[ llvm_anyvector_ty],
                                          [ LLVMVectorOfAnyPointersToElt<0>,
                                            LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index 55f4719da7c8b..c210e15341697 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -587,6 +587,12 @@ VP_PROPERTY_FUNCTIONAL_OPC(Load)
VP_PROPERTY_FUNCTIONAL_INTRINSIC(masked_load) END_REGISTER_VP(vp_load, VP_LOAD) +BEGIN_REGISTER_VP_INTRINSIC(experimental_vp_load_ff, 1, 2) +// val,chain = VP_LOAD_FF chain,base,mask,evl +BEGIN_REGISTER_VP_SDNODE(VP_LOAD_FF, -1, experimental_vp_load_ff, 2, 3) +HELPER_MAP_VPID_TO_VPSD(experimental_vp_load_ff, VP_LOAD_FF) +VP_PROPERTY_NO_FUNCTIONAL +END_REGISTER_VP(experimental_vp_load_ff, VP_LOAD_FF) // llvm.experimental.vp.strided.load(ptr,stride,mask,vlen) BEGIN_REGISTER_VP_INTRINSIC(experimental_vp_strided_load, 2, 3) // chain = EXPERIMENTAL_VP_STRIDED_LOAD chain,base,offset,stride,mask,evl diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 74d7210743372..fde49c2aebea9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -958,6 +958,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi); + void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *SLD, SDValue &Lo, SDValue &Hi); void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi); @@ -1060,6 +1061,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N); SDValue WidenVecRes_LOAD(SDNode* N); SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N); + SDValue WidenVecRes_VP_LOAD_FF(VPLoadFFSDNode *N); SDValue WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N); SDValue WidenVecRes_VECTOR_COMPRESS(SDNode *N); SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 9d42ec2fdf859..fbd85d59927ff 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1163,6 +1163,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::VP_LOAD: SplitVecRes_VP_LOAD(cast(N), Lo, Hi); break; + case ISD::VP_LOAD_FF: + SplitVecRes_VP_LOAD_FF(cast(N), Lo, Hi); + break; case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: SplitVecRes_VP_STRIDED_LOAD(cast(N), Lo, Hi); break; @@ -2232,6 +2235,45 @@ void DAGTypeLegalizer::SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, ReplaceValueWith(SDValue(LD, 1), Ch); } +void DAGTypeLegalizer::SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, + SDValue &Hi) { + SDLoc dl(LD); + auto [LoVT, HiVT] = DAG.GetSplitDestVTs(LD->getValueType(0)); + + SDValue Ch = LD->getChain(); + SDValue Ptr = LD->getBasePtr(); + Align Alignment = LD->getOriginalAlign(); + SDValue Mask = LD->getMask(); + SDValue EVL = LD->getVectorLength(); + + // Split Mask operand + SDValue MaskLo, MaskHi; + if (Mask.getOpcode() == ISD::SETCC) { + SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi); + } else { + if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) + GetSplitVector(Mask, MaskLo, MaskHi); + else + std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); + } + + // Split EVL operand + auto [EVLLo, EVLHi] = DAG.SplitEVL(EVL, LD->getValueType(0), dl); + + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + LD->getPointerInfo(), MachineMemOperand::MOLoad, + LocationSize::beforeOrAfterPointer(), Alignment, LD->getAAInfo(), + LD->getRanges()); + + Lo = 
DAG.getLoadFFVP(LoVT, dl, Ch, Ptr, MaskLo, EVLLo, MMO); + + // Fill the upper half with poison. + Hi = DAG.getUNDEF(HiVT); + + ReplaceValueWith(SDValue(LD, 1), Lo.getValue(1)); + ReplaceValueWith(SDValue(LD, 2), Lo.getValue(2)); +} + void DAGTypeLegalizer::SplitVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *SLD, SDValue &Lo, SDValue &Hi) { assert(SLD->isUnindexed() && @@ -4599,6 +4641,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::VP_LOAD: Res = WidenVecRes_VP_LOAD(cast(N)); break; + case ISD::VP_LOAD_FF: + Res = WidenVecRes_VP_LOAD_FF(cast(N)); + break; case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: Res = WidenVecRes_VP_STRIDED_LOAD(cast(N)); break; @@ -6063,6 +6108,29 @@ SDValue DAGTypeLegalizer::WidenVecRes_VP_LOAD(VPLoadSDNode *N) { return Res; } +SDValue DAGTypeLegalizer::WidenVecRes_VP_LOAD_FF(VPLoadFFSDNode *N) { + EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDValue Mask = N->getMask(); + SDValue EVL = N->getVectorLength(); + SDLoc dl(N); + + // The mask should be widened as well + assert(getTypeAction(Mask.getValueType()) == + TargetLowering::TypeWidenVector && + "Unable to widen binary VP op"); + Mask = GetWidenedVector(Mask); + assert(Mask.getValueType().getVectorElementCount() == + TLI.getTypeToTransformTo(*DAG.getContext(), Mask.getValueType()) + .getVectorElementCount() && + "Unable to widen vector load"); + + SDValue Res = DAG.getLoadFFVP(WidenVT, dl, N->getChain(), N->getBasePtr(), + Mask, EVL, N->getMemOperand()); + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + ReplaceValueWith(SDValue(N, 2), Res.getValue(2)); + return Res; +} + SDValue DAGTypeLegalizer::WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N) { SDLoc DL(N); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index df30148b78b65..f90ffb99999a3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -846,6 +846,14 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { ID.AddInteger(ELD->getMemOperand()->getFlags()); break; } + case ISD::VP_LOAD_FF: { + const VPLoadFFSDNode *LD = cast(N); + ID.AddInteger(LD->getMemoryVT().getRawBits()); + ID.AddInteger(LD->getRawSubclassData()); + ID.AddInteger(LD->getPointerInfo().getAddrSpace()); + ID.AddInteger(LD->getMemOperand()->getFlags()); + break; + } case ISD::VP_STORE: { const VPStoreSDNode *EST = cast(N); ID.AddInteger(EST->getMemoryVT().getRawBits()); @@ -10123,6 +10131,34 @@ SDValue SelectionDAG::getMaskedHistogram(SDVTList VTs, EVT MemVT, return V; } +SDValue SelectionDAG::getLoadFFVP(EVT VT, const SDLoc &dl, SDValue Chain, + SDValue Ptr, SDValue Mask, SDValue EVL, + MachineMemOperand *MMO) { + SDVTList VTs = getVTList(VT, EVL.getValueType(), MVT::Other); + SDValue Ops[] = {Chain, Ptr, Mask, EVL}; + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::VP_LOAD_FF, VTs, Ops); + ID.AddInteger(VT.getRawBits()); + ID.AddInteger(getSyntheticNodeSubclassData(dl.getIROrder(), + VTs, VT, MMO)); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); + void *IP = nullptr; + if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { + cast(E)->refineAlignment(MMO); + return SDValue(E, 0); + } + auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), VTs, + VT, MMO); + createOperands(N, Ops); + + CSEMap.InsertNode(N, IP); + InsertNode(N); + SDValue V(N, 0); + NewSDValueDbgMsg(V, "Creating new node: ", this); + return V; +} + SDValue 
SelectionDAG::getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 86b99a5210924..6528001563576 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8473,6 +8473,34 @@ void SelectionDAGBuilder::visitVPLoad( setValue(&VPIntrin, LD); } +void SelectionDAGBuilder::visitVPLoadFF( + const VPIntrinsic &VPIntrin, EVT VT, EVT EVLVT, + const SmallVectorImpl &OpValues) { + assert(OpValues.size() == 3); + SDLoc DL = getCurSDLoc(); + Value *PtrOperand = VPIntrin.getArgOperand(0); + MaybeAlign Alignment = VPIntrin.getPointerAlignment(); + AAMDNodes AAInfo = VPIntrin.getAAMetadata(); + const MDNode *Ranges = VPIntrin.getMetadata(LLVMContext::MD_range); + SDValue LD; + // Do not serialize variable-length loads of constant memory with + // anything. + if (!Alignment) + Alignment = DAG.getEVTAlign(VT); + MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); + bool AddToChain = !BatchAA || !BatchAA->pointsToConstantMemory(ML); + SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode(); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad, + LocationSize::beforeOrAfterPointer(), *Alignment, AAInfo, Ranges); + LD = DAG.getLoadFFVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2], + MMO); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, EVLVT, LD.getValue(1)); + if (AddToChain) + PendingLoads.push_back(LD.getValue(2)); + setValue(&VPIntrin, DAG.getMergeValues({LD.getValue(0), Trunc}, DL)); +} + void SelectionDAGBuilder::visitVPGather( const VPIntrinsic &VPIntrin, EVT VT, const SmallVectorImpl &OpValues) { @@ -8706,6 +8734,9 @@ void SelectionDAGBuilder::visitVectorPredicationIntrinsic( case ISD::VP_LOAD: visitVPLoad(VPIntrin, ValueVTs[0], OpValues); break; + case ISD::VP_LOAD_FF: + visitVPLoadFF(VPIntrin, ValueVTs[0], ValueVTs[1], OpValues); + break; case ISD::VP_GATHER: visitVPGather(VPIntrin, ValueVTs[0], OpValues); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 35c15bc269d4b..9b8d859a13f3b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -632,6 +632,8 @@ class SelectionDAGBuilder { void visitVectorExtractLastActive(const CallInst &I, unsigned Intrinsic); void visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT, const SmallVectorImpl &OpValues); + void visitVPLoadFF(const VPIntrinsic &VPIntrin, EVT VT, EVT EVLVT, + const SmallVectorImpl &OpValues); void visitVPStore(const VPIntrinsic &VPIntrin, const SmallVectorImpl &OpValues); void visitVPGather(const VPIntrinsic &VPIntrin, EVT VT, diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 256bce1abe71f..7fbf201141fc8 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -448,6 +448,7 @@ VPIntrinsic::getMemoryPointerParamPos(Intrinsic::ID VPID) { case Intrinsic::experimental_vp_strided_store: return 1; case Intrinsic::vp_load: + case Intrinsic::experimental_vp_load_ff: case Intrinsic::vp_gather: case Intrinsic::experimental_vp_strided_load: return 0; @@ -671,6 +672,10 @@ Function *VPIntrinsic::getOrInsertDeclarationForParams( VPFunc = 
Intrinsic::getOrInsertDeclaration( M, VPID, {ReturnType, Params[0]->getType()}); break; + case Intrinsic::experimental_vp_load_ff: + VPFunc = Intrinsic::getOrInsertDeclaration( + M, VPID, {ReturnType->getStructElementType(0), Params[0]->getType()}); + break; case Intrinsic::experimental_vp_strided_load: VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {ReturnType, Params[0]->getType(), Params[1]->getType()}); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 6a259e4b0334c..6a104683f2887 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -880,6 +880,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, {ISD::VP_LOAD, ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom); + setOperationAction(ISD::VP_LOAD_FF, VT, Custom); setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR, ISD::SCALAR_TO_VECTOR}, @@ -1053,6 +1054,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, {ISD::VP_LOAD, ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom); + setOperationAction(ISD::VP_LOAD_FF, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); @@ -1123,6 +1125,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom); + setOperationAction(ISD::VP_LOAD_FF, VT, Custom); setOperationAction(ISD::FNEG, VT, Expand); setOperationAction(ISD::FABS, VT, Expand); @@ -1291,6 +1294,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::EXPERIMENTAL_VP_STRIDED_STORE, ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom); + setOperationAction(ISD::VP_LOAD_FF, VT, Custom); setOperationAction({ISD::ADD, ISD::MUL, ISD::SUB, ISD::AND, ISD::OR, ISD::XOR, ISD::SDIV, ISD::SREM, ISD::UDIV, @@ -1379,6 +1383,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_SCATTER, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, ISD::EXPERIMENTAL_VP_STRIDED_STORE}, VT, Custom); + setOperationAction(ISD::VP_LOAD_FF, VT, Custom); setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT, @@ -7617,6 +7622,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::MLOAD: case ISD::VP_LOAD: return lowerMaskedLoad(Op, DAG); + case ISD::VP_LOAD_FF: + return lowerLoadFF(Op, DAG); case ISD::MSTORE: case ISD::VP_STORE: return lowerMaskedStore(Op, DAG); @@ -11966,6 +11973,52 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op, return DAG.getMergeValues({Result, Chain}, DL); } +SDValue RISCVTargetLowering::lowerLoadFF(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getResNo() == 0); + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + + const auto *VPLoadFF = cast(Op); + EVT MemVT = VPLoadFF->getMemoryVT(); + MachineMemOperand *MMO = VPLoadFF->getMemOperand(); + SDValue Chain = VPLoadFF->getChain(); + SDValue BasePtr = VPLoadFF->getBasePtr(); + + SDValue Mask = VPLoadFF->getMask(); + SDValue VL = VPLoadFF->getVectorLength(); + + MVT XLenVT = Subtarget.getXLenVT(); + + MVT ContainerVT = VT; + if (VT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(VT); + MVT MaskVT = getMaskTypeFor(ContainerVT); + Mask = 
convertToScalableVector(MaskVT, Mask, DAG, Subtarget); + } + + unsigned IntID = Intrinsic::riscv_vleff_mask; + SDValue Ops[] = { + Chain, + DAG.getTargetConstant(IntID, DL, XLenVT), + DAG.getUNDEF(ContainerVT), + BasePtr, + Mask, + VL, + DAG.getTargetConstant(RISCVVType::TAIL_AGNOSTIC, DL, XLenVT)}; + + SDVTList VTs = DAG.getVTList({ContainerVT, Op->getValueType(1), MVT::Other}); + + SDValue Result = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, MemVT, MMO); + SDValue OutVL = Result.getValue(1); + Chain = Result.getValue(2); + + if (VT.isFixedLengthVector()) + Result = convertFromScalableVector(VT, Result, DAG, Subtarget); + + return DAG.getMergeValues({Result, OutVL, Chain}, DL); +} + SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 26b888653c81d..8bba8c50ba862 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -991,6 +991,7 @@ class RISCVTargetLowering : public TargetLowering { SDValue lowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerABS(SDValue Op, SelectionDAG &DAG) const; SDValue lowerMaskedLoad(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerLoadFF(SDValue Op, SelectionDAG &DAG) const; SDValue lowerMaskedStore(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVectorCompress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorFCOPYSIGNToRVV(SDValue Op, diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vploadff.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vploadff.ll new file mode 100644 index 0000000000000..4705f9d96191b --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vploadff.ll @@ -0,0 +1,586 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v \ +; RUN: -verify-machineinstrs < %s | FileCheck %s + +define { <2 x i8>, i32 } @vploadff_v2i8(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x i8>, i32 } @llvm.experimental.vp.load.ff.v2i8.p0(ptr %ptr, <2 x i1> %m, i32 %evl) + ret { <2 x i8>, i32 } %load +} + +define { <2 x i8>, i32 } @vploadff_v2i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x i8>, i32 } @llvm.experimental.vp.load.ff.v2i8.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl) + ret { <2 x i8>, i32 } %load +} + +define { <4 x i8>, i32 } @vploadff_v4i8(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x i8>, i32 } @llvm.experimental.vp.load.ff.v4i8.p0(ptr %ptr, <4 x i1> %m, i32 %evl) + 
ret { <4 x i8>, i32 } %load +} + +define { <4 x i8>, i32 } @vploadff_v4i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x i8>, i32 } @llvm.experimental.vp.load.ff.v4i8.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl) + ret { <4 x i8>, i32 } %load +} + +define { <8 x i8>, i32 } @vploadff_v8i8(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x i8>, i32 } @llvm.experimental.vp.load.ff.v8i8.p0(ptr %ptr, <8 x i1> %m, i32 %evl) + ret { <8 x i8>, i32 } %load +} + +define { <8 x i8>, i32 } @vploadff_v8i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x i8>, i32 } @llvm.experimental.vp.load.ff.v8i8.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl) + ret { <8 x i8>, i32 } %load +} + +define { <2 x i16>, i32 } @vploadff_v2i16(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x i16>, i32 } @llvm.experimental.vp.load.ff.v2i16.p0(ptr %ptr, <2 x i1> %m, i32 %evl) + ret { <2 x i16>, i32 } %load +} + +define { <2 x i16>, i32 } @vploadff_v2i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x i16>, i32 } @llvm.experimental.vp.load.ff.v2i16.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl) + ret { <2 x i16>, i32 } %load +} + +define { <4 x i16>, i32 } @vploadff_v4i16(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x i16>, i32 } @llvm.experimental.vp.load.ff.v4i16.p0(ptr %ptr, <4 x i1> %m, i32 %evl) + ret { <4 x i16>, i32 } %load +} + +define { <4 x i16>, i32 } @vploadff_v4i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x i16>, i32 } @llvm.experimental.vp.load.ff.v4i16.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl) + ret { <4 x i16>, i32 } %load +} + +define { <8 x i16>, i32 } @vploadff_v8i16(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x i16>, i32 } @llvm.experimental.vp.load.ff.v8i16.p0(ptr %ptr, <8 x i1> %m, i32 %evl) + ret { <8 x i16>, i32 } %load +} + +define { <8 x i16>, i32 } @vploadff_v8i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8i16_allones_mask: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x i16>, i32 } @llvm.experimental.vp.load.ff.v8i16.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl) + ret { <8 x i16>, i32 } %load +} + +define { <2 x i32>, i32 } @vploadff_v2i32(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x i32>, i32 } @llvm.experimental.vp.load.ff.v2i32.p0(ptr %ptr, <2 x i1> %m, i32 %evl) + ret { <2 x i32>, i32 } %load +} + +define { <2 x i32>, i32 } @vploadff_v2i32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2i32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x i32>, i32 } @llvm.experimental.vp.load.ff.v2i32.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl) + ret { <2 x i32>, i32 } %load +} + +define { <4 x i32>, i32 } @vploadff_v4i32(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x i32>, i32 } @llvm.experimental.vp.load.ff.v4i32.p0(ptr %ptr, <4 x i1> %m, i32 %evl) + ret { <4 x i32>, i32 } %load +} + +define { <4 x i32>, i32 } @vploadff_v4i32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4i32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x i32>, i32 } @llvm.experimental.vp.load.ff.v4i32.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl) + ret { <4 x i32>, i32 } %load +} + +define { <8 x i32>, i32 } @vploadff_v8i32(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x i32>, i32 } @llvm.experimental.vp.load.ff.v8i32.p0(ptr %ptr, <8 x i1> %m, i32 %evl) + ret { <8 x i32>, i32 } %load +} + +define { <8 x i32>, i32 } @vploadff_v8i32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8i32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x i32>, i32 } @llvm.experimental.vp.load.ff.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl) + ret { <8 x i32>, i32 } %load +} + +define { <2 x i64>, i32 } @vploadff_v2i64(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x i64>, i32 } @llvm.experimental.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> %m, i32 %evl) + ret { <2 x i64>, i32 } %load +} + +define { <2 x i64>, i32 } @vploadff_v2i64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2i64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x 
i64>, i32 } @llvm.experimental.vp.load.ff.v2i64.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl) + ret { <2 x i64>, i32 } %load +} + +define { <4 x i64>, i32 } @vploadff_v4i64(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x i64>, i32 } @llvm.experimental.vp.load.ff.v4i64.p0(ptr %ptr, <4 x i1> %m, i32 %evl) + ret { <4 x i64>, i32 } %load +} + +define { <4 x i64>, i32 } @vploadff_v4i64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4i64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x i64>, i32 } @llvm.experimental.vp.load.ff.v4i64.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl) + ret { <4 x i64>, i32 } %load +} + +define { <8 x i64>, i32 } @vploadff_v8i64(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x i64>, i32 } @llvm.experimental.vp.load.ff.v8i64.p0(ptr %ptr, <8 x i1> %m, i32 %evl) + ret { <8 x i64>, i32 } %load +} + +define { <8 x i64>, i32 } @vploadff_v8i64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8i64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x i64>, i32 } @llvm.experimental.vp.load.ff.v8i64.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl) + ret { <8 x i64>, i32 } %load +} + +define { <32 x i64>, i32 } @vploadff_v32i64(ptr %ptr, <32 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v32i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a3, 16 +; CHECK-NEXT: bltu a2, a3, .LBB24_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: .LBB24_2: +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a1), v0.t +; CHECK-NEXT: csrr a1, vl +; CHECK-NEXT: sw a1, 256(a0) +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret + %load = call { <32 x i64>, i32 } @llvm.experimental.vp.load.ff.v32i64.p0(ptr %ptr, <32 x i1> %m, i32 %evl) + ret { <32 x i64>, i32 } %load +} + +define { <32 x i64>, i32 } @vploadff_v32i64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v32i64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: li a3, 16 +; CHECK-NEXT: bltu a2, a3, .LBB25_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a2, 16 +; CHECK-NEXT: .LBB25_2: +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a1) +; CHECK-NEXT: csrr a1, vl +; CHECK-NEXT: sw a1, 256(a0) +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: ret + %load = call { <32 x i64>, i32 } @llvm.experimental.vp.load.ff.v32i64.p0(ptr %ptr, <32 x i1> splat (i1 true), i32 %evl) + ret { <32 x i64>, i32 } %load +} + +define { <2 x half>, i32 } @vploadff_v2f16(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x half>, i32 } @llvm.experimental.vp.load.ff.v2f16.p0(ptr %ptr, <2 x i1> 
%m, i32 %evl) + ret { <2 x half>, i32 } %load +} + +define { <2 x half>, i32 } @vploadff_v2f16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2f16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x half>, i32 } @llvm.experimental.vp.load.ff.v2f16.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl) + ret { <2 x half>, i32 } %load +} + +define { <4 x half>, i32 } @vploadff_v4f16(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x half>, i32 } @llvm.experimental.vp.load.ff.v4f16.p0(ptr %ptr, <4 x i1> %m, i32 %evl) + ret { <4 x half>, i32 } %load +} + +define { <4 x half>, i32 } @vploadff_v4f16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4f16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x half>, i32 } @llvm.experimental.vp.load.ff.v4f16.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl) + ret { <4 x half>, i32 } %load +} + +define { <8 x half>, i32 } @vploadff_v8f16(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x half>, i32 } @llvm.experimental.vp.load.ff.v8f16.p0(ptr %ptr, <8 x i1> %m, i32 %evl) + ret { <8 x half>, i32 } %load +} + +define { <8 x half>, i32 } @vploadff_v8f16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8f16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x half>, i32 } @llvm.experimental.vp.load.ff.v8f16.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl) + ret { <8 x half>, i32 } %load +} + +define { <2 x float>, i32 } @vploadff_v2f32(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x float>, i32 } @llvm.experimental.vp.load.ff.v2f32.p0(ptr %ptr, <2 x i1> %m, i32 %evl) + ret { <2 x float>, i32 } %load +} + +define { <2 x float>, i32 } @vploadff_v2f32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2f32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x float>, i32 } @llvm.experimental.vp.load.ff.v2f32.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl) + ret { <2 x float>, i32 } %load +} + +define { <4 x float>, i32 } @vploadff_v4f32(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x float>, i32 } @llvm.experimental.vp.load.ff.v4f32.p0(ptr %ptr, <4 x i1> %m, i32 %evl) + ret { <4 x float>, i32 } %load +} + +define { <4 x float>, i32 } @vploadff_v4f32_allones_mask(ptr %ptr, i32 
zeroext %evl) { +; CHECK-LABEL: vploadff_v4f32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x float>, i32 } @llvm.experimental.vp.load.ff.v4f32.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl) + ret { <4 x float>, i32 } %load +} + +define { <8 x float>, i32 } @vploadff_v8f32(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x float>, i32 } @llvm.experimental.vp.load.ff.v8f32.p0(ptr %ptr, <8 x i1> %m, i32 %evl) + ret { <8 x float>, i32 } %load +} + +define { <8 x float>, i32 } @vploadff_v8f32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8f32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x float>, i32 } @llvm.experimental.vp.load.ff.v8f32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl) + ret { <8 x float>, i32 } %load +} + +define { <2 x double>, i32 } @vploadff_v2f64(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x double>, i32 } @llvm.experimental.vp.load.ff.v2f64.p0(ptr %ptr, <2 x i1> %m, i32 %evl) + ret { <2 x double>, i32 } %load +} + +define { <2 x double>, i32 } @vploadff_v2f64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2f64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x double>, i32 } @llvm.experimental.vp.load.ff.v2f64.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl) + ret { <2 x double>, i32 } %load +} + +define { <4 x double>, i32 } @vploadff_v4f64(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x double>, i32 } @llvm.experimental.vp.load.ff.v4f64.p0(ptr %ptr, <4 x i1> %m, i32 %evl) + ret { <4 x double>, i32 } %load +} + +define { <4 x double>, i32 } @vploadff_v4f64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4f64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x double>, i32 } @llvm.experimental.vp.load.ff.v4f64.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl) + ret { <4 x double>, i32 } %load +} + +define { <8 x double>, i32 } @vploadff_v8f64(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x double>, i32 } @llvm.experimental.vp.load.ff.v8f64.p0(ptr %ptr, <8 x i1> %m, i32 %evl) + ret { <8 x double>, i32 } %load +} + +define { <8 x double>, i32 } @vploadff_v8f64_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8f64_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vle64ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x double>, i32 } @llvm.experimental.vp.load.ff.v8f64.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl) + ret { <8 x double>, i32 } %load +} + +define { <2 x bfloat>, i32 } @vploadff_v2bf16(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x bfloat>, i32 } @llvm.experimental.vp.load.ff.v2bf16.p0(ptr %ptr, <2 x i1> %m, i32 %evl) + ret { <2 x bfloat>, i32 } %load +} + +define { <2 x bfloat>, i32 } @vploadff_v2bf16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v2bf16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <2 x bfloat>, i32 } @llvm.experimental.vp.load.ff.v2bf16.p0(ptr %ptr, <2 x i1> splat (i1 true), i32 %evl) + ret { <2 x bfloat>, i32 } %load +} + +define { <4 x bfloat>, i32 } @vploadff_v4bf16(ptr %ptr, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x bfloat>, i32 } @llvm.experimental.vp.load.ff.v4bf16.p0(ptr %ptr, <4 x i1> %m, i32 %evl) + ret { <4 x bfloat>, i32 } %load +} + +define { <4 x bfloat>, i32 } @vploadff_v4bf16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v4bf16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <4 x bfloat>, i32 } @llvm.experimental.vp.load.ff.v4bf16.p0(ptr %ptr, <4 x i1> splat (i1 true), i32 %evl) + ret { <4 x bfloat>, i32 } %load +} + +define { <8 x bfloat>, i32 } @vploadff_v8bf16(ptr %ptr, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x bfloat>, i32 } @llvm.experimental.vp.load.ff.v8bf16.p0(ptr %ptr, <8 x i1> %m, i32 %evl) + ret { <8 x bfloat>, i32 } %load +} + +define { <8 x bfloat>, i32 } @vploadff_v8bf16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v8bf16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <8 x bfloat>, i32 } @llvm.experimental.vp.load.ff.v8bf16.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %evl) + ret { <8 x bfloat>, i32 } %load +} + +define { <7 x i8>, i32 } @vploadff_v7i8(ptr %ptr, <7 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_v7i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { <7 x i8>, i32 } @llvm.experimental.vp.load.ff.v7i8.p0(ptr %ptr, <7 x i1> %m, i32 %evl) + ret { <7 x i8>, i32 } %load +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vploadff.ll b/llvm/test/CodeGen/RISCV/rvv/vploadff.ll new file mode 100644 index 0000000000000..461cbf616fb84 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vploadff.ll @@ -0,0 +1,1008 @@ +; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v \ +; RUN: -verify-machineinstrs < %s | FileCheck %s + +define { , i32 } @vploadff_nxv1i8(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv1i8.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv1i8.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2i8(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv2i8.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv2i8.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4i8(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv4i8.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv4i8.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8i8(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv8i8.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv8i8.p0(ptr %ptr, splat (i1 true), i32 
%evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv16i8(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv16i8.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv16i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv16i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv16i8.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv32i8(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv32i8.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv32i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv32i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv32i8.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv64i8(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv64i8.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv64i8_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv64i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv64i8.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define @vploadff_nxv128i8(ptr %ptr, ptr %evl_out, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv128i8: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: bltu a2, a3, .LBB14_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a2, a3 +; CHECK-NEXT: .LBB14_2: +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: sw a0, 0(a1) +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv128i8.p0(ptr %ptr, %m, i32 %evl) + %result0 = extractvalue { , i32 } %load, 0 + %result1 = extractvalue { , i32 } %load, 1 + store i32 %result1, ptr %evl_out + ret %result0 +} + +define @vploadff_nxv128i8_allones_mask(ptr %ptr, ptr %evl_out, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv128i8_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: bltu a2, a3, .LBB15_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a2, a3 +; CHECK-NEXT: .LBB15_2: +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vle8ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: sw a0, 0(a1) +; CHECK-NEXT: ret + %load = call { , i32 
} @llvm.experimental.vp.load.ff.nxv128i8.p0(ptr %ptr, splat (i1 true), i32 %evl) + %result0 = extractvalue { , i32 } %load, 0 + %result1 = extractvalue { , i32 } %load, 1 + store i32 %result1, ptr %evl_out + ret %result0 +} + +define { , i32 } @vploadff_nxv1i16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv1i16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv1i16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2i16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv2i16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv2i16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4i16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv4i16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv4i16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8i16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv8i16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv8i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv8i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv8i16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv16i16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), 
v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv16i16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv16i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv16i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv16i16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv32i16(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv32i16.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv32i16_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv32i16_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vle16ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv32i16.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1i32(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv1i32.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv1i32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv1i32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv1i32.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2i32(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv2i32.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv2i32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv2i32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv2i32.p0(ptr %ptr, splat (i1 true), i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4i32(ptr %ptr, %m, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vl +; CHECK-NEXT: ret + %load = call { , i32 } @llvm.experimental.vp.load.ff.nxv4i32.p0(ptr %ptr, %m, i32 %evl) + ret { , i32 } %load +} + +define { , i32 } @vploadff_nxv4i32_allones_mask(ptr %ptr, i32 zeroext %evl) { +; CHECK-LABEL: vploadff_nxv4i32_allones_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vle32ff.v v8, (a0) +; CHECK-NEXT: csrr a0, vl +; 
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 4 x i32>, i32 } @llvm.experimental.vp.load.ff.nxv4i32.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 4 x i32>, i32 } %load
+}
+
+define { <vscale x 8 x i32>, i32 } @vploadff_nxv8i32(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 8 x i32>, i32 } @llvm.experimental.vp.load.ff.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+  ret { <vscale x 8 x i32>, i32 } %load
+}
+
+define { <vscale x 8 x i32>, i32 } @vploadff_nxv8i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i32_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vle32ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 8 x i32>, i32 } @llvm.experimental.vp.load.ff.nxv8i32.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 8 x i32>, i32 } %load
+}
+
+define { <vscale x 16 x i32>, i32 } @vploadff_nxv16i32(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 16 x i32>, i32 } @llvm.experimental.vp.load.ff.nxv16i32.p0(ptr %ptr, <vscale x 16 x i1> %m, i32 %evl)
+  ret { <vscale x 16 x i32>, i32 } %load
+}
+
+define { <vscale x 16 x i32>, i32 } @vploadff_nxv16i32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16i32_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vle32ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 16 x i32>, i32 } @llvm.experimental.vp.load.ff.nxv16i32.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 16 x i32>, i32 } %load
+}
+
+define { <vscale x 1 x i64>, i32 } @vploadff_nxv1i64(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 1 x i64>, i32 } @llvm.experimental.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+  ret { <vscale x 1 x i64>, i32 } %load
+}
+
+define { <vscale x 1 x i64>, i32 } @vploadff_nxv1i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1i64_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 1 x i64>, i32 } @llvm.experimental.vp.load.ff.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 1 x i64>, i32 } %load
+}
+
+define { <vscale x 2 x i64>, i32 } @vploadff_nxv2i64(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 2 x i64>, i32 } @llvm.experimental.vp.load.ff.nxv2i64.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+  ret { <vscale x 2 x i64>, i32 } %load
+}
+
+define { <vscale x 2 x i64>, i32 } @vploadff_nxv2i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2i64_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 2 x i64>, i32 } @llvm.experimental.vp.load.ff.nxv2i64.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 2 x i64>, i32 } %load
+}
+
+define { <vscale x 4 x i64>, i32 } @vploadff_nxv4i64(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 4 x i64>, i32 } @llvm.experimental.vp.load.ff.nxv4i64.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+  ret { <vscale x 4 x i64>, i32 } %load
+}
+
+define { <vscale x 4 x i64>, i32 } @vploadff_nxv4i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4i64_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 4 x i64>, i32 } @llvm.experimental.vp.load.ff.nxv4i64.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 4 x i64>, i32 } %load
+}
+
+define { <vscale x 8 x i64>, i32 } @vploadff_nxv8i64(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 8 x i64>, i32 } @llvm.experimental.vp.load.ff.nxv8i64.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+  ret { <vscale x 8 x i64>, i32 } %load
+}
+
+define { <vscale x 8 x i64>, i32 } @vploadff_nxv8i64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8i64_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 8 x i64>, i32 } @llvm.experimental.vp.load.ff.nxv8i64.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 8 x i64>, i32 } %load
+}
+
+define { <vscale x 1 x half>, i32 } @vploadff_nxv1f16(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 1 x half>, i32 } @llvm.experimental.vp.load.ff.nxv1f16.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+  ret { <vscale x 1 x half>, i32 } %load
+}
+
+define { <vscale x 1 x half>, i32 } @vploadff_nxv1f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f16_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 1 x half>, i32 } @llvm.experimental.vp.load.ff.nxv1f16.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 1 x half>, i32 } %load
+}
+
+define { <vscale x 2 x half>, i32 } @vploadff_nxv2f16(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 2 x half>, i32 } @llvm.experimental.vp.load.ff.nxv2f16.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+  ret { <vscale x 2 x half>, i32 } %load
+}
+
+define { <vscale x 2 x half>, i32 } @vploadff_nxv2f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f16_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 2 x half>, i32 } @llvm.experimental.vp.load.ff.nxv2f16.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 2 x half>, i32 } %load
+}
+
+define { <vscale x 4 x half>, i32 } @vploadff_nxv4f16(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 4 x half>, i32 } @llvm.experimental.vp.load.ff.nxv4f16.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+  ret { <vscale x 4 x half>, i32 } %load
+}
+
+define { <vscale x 4 x half>, i32 } @vploadff_nxv4f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f16_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 4 x half>, i32 } @llvm.experimental.vp.load.ff.nxv4f16.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 4 x half>, i32 } %load
+}
+
+define { <vscale x 8 x half>, i32 } @vploadff_nxv8f16(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 8 x half>, i32 } @llvm.experimental.vp.load.ff.nxv8f16.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+  ret { <vscale x 8 x half>, i32 } %load
+}
+
+define { <vscale x 8 x half>, i32 } @vploadff_nxv8f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f16_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 8 x half>, i32 } @llvm.experimental.vp.load.ff.nxv8f16.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 8 x half>, i32 } %load
+}
+
+define { <vscale x 16 x half>, i32 } @vploadff_nxv16f16(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 16 x half>, i32 } @llvm.experimental.vp.load.ff.nxv16f16.p0(ptr %ptr, <vscale x 16 x i1> %m, i32 %evl)
+  ret { <vscale x 16 x half>, i32 } %load
+}
+
+define { <vscale x 16 x half>, i32 } @vploadff_nxv16f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16f16_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 16 x half>, i32 } @llvm.experimental.vp.load.ff.nxv16f16.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 16 x half>, i32 } %load
+}
+
+define { <vscale x 32 x half>, i32 } @vploadff_nxv32f16(ptr %ptr, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 32 x half>, i32 } @llvm.experimental.vp.load.ff.nxv32f16.p0(ptr %ptr, <vscale x 32 x i1> %m, i32 %evl)
+  ret { <vscale x 32 x half>, i32 } %load
+}
+
+define { <vscale x 32 x half>, i32 } @vploadff_nxv32f16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32f16_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 32 x half>, i32 } @llvm.experimental.vp.load.ff.nxv32f16.p0(ptr %ptr, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 32 x half>, i32 } %load
+}
+
+define { <vscale x 1 x float>, i32 } @vploadff_nxv1f32(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 1 x float>, i32 } @llvm.experimental.vp.load.ff.nxv1f32.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+  ret { <vscale x 1 x float>, i32 } %load
+}
+
+define { <vscale x 1 x float>, i32 } @vploadff_nxv1f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f32_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 1 x float>, i32 } @llvm.experimental.vp.load.ff.nxv1f32.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 1 x float>, i32 } %load
+}
+
+define { <vscale x 2 x float>, i32 } @vploadff_nxv2f32(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 2 x float>, i32 } @llvm.experimental.vp.load.ff.nxv2f32.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+  ret { <vscale x 2 x float>, i32 } %load
+}
+
+define { <vscale x 2 x float>, i32 } @vploadff_nxv2f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f32_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vle32ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 2 x float>, i32 } @llvm.experimental.vp.load.ff.nxv2f32.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 2 x float>, i32 } %load
+}
+
+define { <vscale x 4 x float>, i32 } @vploadff_nxv4f32(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 4 x float>, i32 } @llvm.experimental.vp.load.ff.nxv4f32.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+  ret { <vscale x 4 x float>, i32 } %load
+}
+
+define { <vscale x 4 x float>, i32 } @vploadff_nxv4f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f32_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vle32ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 4 x float>, i32 } @llvm.experimental.vp.load.ff.nxv4f32.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 4 x float>, i32 } %load
+}
+
+define { <vscale x 8 x float>, i32 } @vploadff_nxv8f32(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 8 x float>, i32 } @llvm.experimental.vp.load.ff.nxv8f32.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+  ret { <vscale x 8 x float>, i32 } %load
+}
+
+define { <vscale x 8 x float>, i32 } @vploadff_nxv8f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f32_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vle32ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 8 x float>, i32 } @llvm.experimental.vp.load.ff.nxv8f32.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 8 x float>, i32 } %load
+}
+
+define { <vscale x 16 x float>, i32 } @vploadff_nxv16f32(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vle32ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 16 x float>, i32 } @llvm.experimental.vp.load.ff.nxv16f32.p0(ptr %ptr, <vscale x 16 x i1> %m, i32 %evl)
+  ret { <vscale x 16 x float>, i32 } %load
+}
+
+define { <vscale x 16 x float>, i32 } @vploadff_nxv16f32_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16f32_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vle32ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 16 x float>, i32 } @llvm.experimental.vp.load.ff.nxv16f32.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 16 x float>, i32 } %load
+}
+
+define { <vscale x 1 x double>, i32 } @vploadff_nxv1f64(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 1 x double>, i32 } @llvm.experimental.vp.load.ff.nxv1f64.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+  ret { <vscale x 1 x double>, i32 } %load
+}
+
+define { <vscale x 1 x double>, i32 } @vploadff_nxv1f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1f64_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 1 x double>, i32 } @llvm.experimental.vp.load.ff.nxv1f64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 1 x double>, i32 } %load
+}
+
+define { <vscale x 2 x double>, i32 } @vploadff_nxv2f64(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 2 x double>, i32 } @llvm.experimental.vp.load.ff.nxv2f64.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+  ret { <vscale x 2 x double>, i32 } %load
+}
+
+define { <vscale x 2 x double>, i32 } @vploadff_nxv2f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2f64_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 2 x double>, i32 } @llvm.experimental.vp.load.ff.nxv2f64.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 2 x double>, i32 } %load
+}
+
+define { <vscale x 4 x double>, i32 } @vploadff_nxv4f64(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 4 x double>, i32 } @llvm.experimental.vp.load.ff.nxv4f64.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+  ret { <vscale x 4 x double>, i32 } %load
+}
+
+define { <vscale x 4 x double>, i32 } @vploadff_nxv4f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4f64_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 4 x double>, i32 } @llvm.experimental.vp.load.ff.nxv4f64.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 4 x double>, i32 } %load
+}
+
+define { <vscale x 8 x double>, i32 } @vploadff_nxv8f64(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 8 x double>, i32 } @llvm.experimental.vp.load.ff.nxv8f64.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+  ret { <vscale x 8 x double>, i32 } %load
+}
+
+define { <vscale x 8 x double>, i32 } @vploadff_nxv8f64_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8f64_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 8 x double>, i32 } @llvm.experimental.vp.load.ff.nxv8f64.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 8 x double>, i32 } %load
+}
+
+define { <vscale x 1 x bfloat>, i32 } @vploadff_nxv1bf16(ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 1 x bfloat>, i32 } @llvm.experimental.vp.load.ff.nxv1bf16.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl)
+  ret { <vscale x 1 x bfloat>, i32 } %load
+}
+
+define { <vscale x 1 x bfloat>, i32 } @vploadff_nxv1bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv1bf16_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 1 x bfloat>, i32 } @llvm.experimental.vp.load.ff.nxv1bf16.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 1 x bfloat>, i32 } %load
+}
+
+define { <vscale x 2 x bfloat>, i32 } @vploadff_nxv2bf16(ptr %ptr, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 2 x bfloat>, i32 } @llvm.experimental.vp.load.ff.nxv2bf16.p0(ptr %ptr, <vscale x 2 x i1> %m, i32 %evl)
+  ret { <vscale x 2 x bfloat>, i32 } %load
+}
+
+define { <vscale x 2 x bfloat>, i32 } @vploadff_nxv2bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv2bf16_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 2 x bfloat>, i32 } @llvm.experimental.vp.load.ff.nxv2bf16.p0(ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 2 x bfloat>, i32 } %load
+}
+
+define { <vscale x 4 x bfloat>, i32 } @vploadff_nxv4bf16(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 4 x bfloat>, i32 } @llvm.experimental.vp.load.ff.nxv4bf16.p0(ptr %ptr, <vscale x 4 x i1> %m, i32 %evl)
+  ret { <vscale x 4 x bfloat>, i32 } %load
+}
+
+define { <vscale x 4 x bfloat>, i32 } @vploadff_nxv4bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv4bf16_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 4 x bfloat>, i32 } @llvm.experimental.vp.load.ff.nxv4bf16.p0(ptr %ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 4 x bfloat>, i32 } %load
+}
+
+define { <vscale x 8 x bfloat>, i32 } @vploadff_nxv8bf16(ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 8 x bfloat>, i32 } @llvm.experimental.vp.load.ff.nxv8bf16.p0(ptr %ptr, <vscale x 8 x i1> %m, i32 %evl)
+  ret { <vscale x 8 x bfloat>, i32 } %load
+}
+
+define { <vscale x 8 x bfloat>, i32 } @vploadff_nxv8bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv8bf16_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 8 x bfloat>, i32 } @llvm.experimental.vp.load.ff.nxv8bf16.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 8 x bfloat>, i32 } %load
+}
+
+define { <vscale x 16 x bfloat>, i32 } @vploadff_nxv16bf16(ptr %ptr, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 16 x bfloat>, i32 } @llvm.experimental.vp.load.ff.nxv16bf16.p0(ptr %ptr, <vscale x 16 x i1> %m, i32 %evl)
+  ret { <vscale x 16 x bfloat>, i32 } %load
+}
+
+define { <vscale x 16 x bfloat>, i32 } @vploadff_nxv16bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv16bf16_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 16 x bfloat>, i32 } @llvm.experimental.vp.load.ff.nxv16bf16.p0(ptr %ptr, <vscale x 16 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 16 x bfloat>, i32 } %load
+}
+
+define { <vscale x 32 x bfloat>, i32 } @vploadff_nxv32bf16(ptr %ptr, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 32 x bfloat>, i32 } @llvm.experimental.vp.load.ff.nxv32bf16.p0(ptr %ptr, <vscale x 32 x i1> %m, i32 %evl)
+  ret { <vscale x 32 x bfloat>, i32 } %load
+}
+
+define { <vscale x 32 x bfloat>, i32 } @vploadff_nxv32bf16_allones_mask(ptr %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv32bf16_allones_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vle16ff.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 32 x bfloat>, i32 } @llvm.experimental.vp.load.ff.nxv32bf16.p0(ptr %ptr, <vscale x 32 x i1> splat (i1 true), i32 %evl)
+  ret { <vscale x 32 x bfloat>, i32 } %load
+}
+
+define { <vscale x 3 x i8>, i32 } @vploadff_nxv3i8(ptr %ptr, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vploadff_nxv3i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vle8ff.v v8, (a0), v0.t
+; CHECK-NEXT:    csrr a0, vl
+; CHECK-NEXT:    ret
+  %load = call { <vscale x 3 x i8>, i32 } @llvm.experimental.vp.load.ff.nxv3i8.p0(ptr %ptr, <vscale x 3 x i1> %m, i32 %evl)
+  ret { <vscale x 3 x i8>, i32 } %load
+}
diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp
index d6ad7599ce461..3644995b4054d 100644
--- a/llvm/unittests/IR/VPIntrinsicTest.cpp
+++ b/llvm/unittests/IR/VPIntrinsicTest.cpp
@@ -100,6 +100,9 @@ class VPIntrinsicTest : public testing::Test {
            "i32*>, <8 x i1>, i32) ";
     Str << " declare <8 x i32> @llvm.vp.load.v8i32.p0v8i32(<8 x i32>*, <8 x "
            "i1>, i32) ";
+    Str << " declare {<8 x i32>, i32} "
+           "@llvm.experimental.vp.load.ff.v8i32.p0v8i32(<8 x "
+           "i32>*, <8 x i1>, i32) ";
     Str << "declare <8 x i32> "
            "@llvm.experimental.vp.strided.load.v8i32.i32(i32*, i32, <8 "
            "x i1>, i32) ";