From ac430b45f908894325326cf4e41a4fa2b4c7300c Mon Sep 17 00:00:00 2001
From: Thomas Raoux
Date: Tue, 5 Sep 2023 18:02:05 -0700
Subject: [PATCH 01/11] [NVPTX] Make i16x2 a native type and add support for
 instructions supporting it

On sm_90 some instructions now support i16x2, which allows the hardware
to execute add, min, and max instructions more efficiently. In order to
support that, we need to make i16x2 a native type in the backend. This
makes the necessary changes to make i16x2 a native type and adds support
for the instructions that natively support i16x2.

This caused a negative test in the NVPTX SLP vectorizer to start
passing. The test was changed into a positive one, as the IR is now
correctly vectorized.
---
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp   |  24 +-
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp   | 133 ++++-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       |  37 +-
 llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td    |   2 +-
 llvm/test/CodeGen/NVPTX/dag-cse.ll            |   4 +-
 llvm/test/CodeGen/NVPTX/f16x2-instructions.ll |  10 +-
 llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 533 ++++++++++++++++++
 .../NVPTX/load-with-non-coherent-cache.ll     |   4 +-
 llvm/test/CodeGen/NVPTX/param-load-store.ll   |  15 +-
 llvm/test/CodeGen/NVPTX/vec-param-load.ll     |  12 +-
 ...non-vectorizable-intrinsic-inseltpoison.ll |  57 --
 .../NVPTX/non-vectorizable-intrinsic.ll       |  57 --
 .../NVPTX/vectorizable-intrinsic.ll           |  42 ++
 13 files changed, 751 insertions(+), 179 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
 delete mode 100644 llvm/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic-inseltpoison.ll
 delete mode 100644 llvm/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic.ll
 create mode 100644 llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 99a7fdb9d1e21..0091acc456eb4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -615,7 +615,7 @@ bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
   // We only care about f16x2 as it's the only real vector type we
   // need to deal with.
   MVT VT = Vector.getSimpleValueType();
-  if (!(VT == MVT::v2f16 || VT == MVT::v2bf16))
+  if (!(VT == MVT::v2f16 || VT == MVT::v2bf16 || VT == MVT::v2i16))
     return false;
   // Find and record all uses of this vector that extract element 0 or 1.
   SmallVector<SDNode *, 4> E0, E1;
@@ -828,6 +828,7 @@ pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
     return Opcode_i16;
   case MVT::v2f16:
   case MVT::v2bf16:
+  case MVT::v2i16:
     return Opcode_i32;
   case MVT::f32:
     return Opcode_f32;
@@ -909,9 +910,10 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   // Vector Setting
   unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
   if (SimpleVT.isVector()) {
-    assert((LoadedVT == MVT::v2f16 || LoadedVT == MVT::v2bf16) &&
+    assert((LoadedVT == MVT::v2f16 || LoadedVT == MVT::v2bf16 ||
+            LoadedVT == MVT::v2i16) &&
            "Unexpected vector type");
-    // v2f16/v2bf16 is loaded using ld.b32
+    // v2f16/v2bf16/v2i16 is loaded using ld.b32
     fromTypeWidth = 32;
   }
 
@@ -1064,7 +1066,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   // v8f16 is a special case. PTX doesn't have ld.v8.f16
   // instruction. Instead, we split the vector into v2f16 chunks and
   // load them with ld.v4.b32.
- if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16) { + if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16 || EltVT == MVT::v2i16) { assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode."); EltVT = MVT::i32; FromType = NVPTX::PTXLdStInstCode::Untyped; @@ -1262,10 +1264,11 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { EltVT = EltVT.getVectorElementType(); // vectors of f16 are loaded/stored as multiples of v2f16 elements. if ((EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) || - (EltVT == MVT::bf16 && N->getValueType(0) == MVT::v2bf16)) { - assert(NumElts % 2 == 0 && "Vector must have even number of elements"); - EltVT = N->getValueType(0); - NumElts /= 2; + (EltVT == MVT::bf16 && N->getValueType(0) == MVT::v2bf16) || + (EltVT == MVT::i16 && N->getValueType(0) == MVT::v2i16)) { + assert(NumElts % 2 == 0 && "Vector must have even number of elements"); + EltVT = N->getValueType(0); + NumElts /= 2; } } @@ -1678,7 +1681,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { MVT ScalarVT = SimpleVT.getScalarType(); unsigned toTypeWidth = ScalarVT.getSizeInBits(); if (SimpleVT.isVector()) { - assert((StoreVT == MVT::v2f16 || StoreVT == MVT::v2bf16) && + assert((StoreVT == MVT::v2f16 || StoreVT == MVT::v2bf16 || + StoreVT == MVT::v2i16) && "Unexpected vector type"); // v2f16 is stored using st.b32 toTypeWidth = 32; @@ -1847,7 +1851,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { // v8f16 is a special case. PTX doesn't have st.v8.f16 // instruction. Instead, we split the vector into v2f16 chunks and // store them with st.v4.b32. - if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16) { + if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16 || EltVT == MVT::v2i16) { assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode."); EltVT = MVT::i32; ToType = NVPTX::PTXLdStInstCode::Untyped; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index f12f4fe3af33f..3e36a72e1a53b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -133,6 +133,7 @@ static bool IsPTXVectorType(MVT VT) { case MVT::v4i8: case MVT::v2i16: case MVT::v4i16: + case MVT::v8i16: // <4 x i16x2> case MVT::v2i32: case MVT::v4i32: case MVT::v2i64: @@ -149,12 +150,13 @@ static bool IsPTXVectorType(MVT VT) { } } -static bool Isv2f16Orv2bf16Type(EVT VT) { - return (VT == MVT::v2f16 || VT == MVT::v2bf16); +static bool Isv2f16Orv2bf16Orv2i16Type(EVT VT) { + return (VT == MVT::v2f16 || VT == MVT::v2bf16 || VT == MVT::v2i16); } -static bool Isf16Orbf16Type(MVT VT) { - return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16); +static bool Is16bitsType(MVT VT) { + return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 || + VT.SimpleTy == MVT::i16); } /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive @@ -207,8 +209,13 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, // Vectors with an even number of f16 elements will be passed to // us as an array of v2f16/v2bf16 elements. We must match this so we // stay in sync with Ins/Outs. - if ((Isf16Orbf16Type(EltVT.getSimpleVT())) && NumElts % 2 == 0) { - EltVT = EltVT == MVT::f16 ? 
MVT::v2f16 : MVT::v2bf16; + if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) { + if (EltVT == MVT::f16) + EltVT = MVT::v2f16; + else if (EltVT == MVT::bf16) + EltVT = MVT::v2bf16; + else if (EltVT == MVT::i16) + EltVT = MVT::v2i16; NumElts /= 2; } for (unsigned j = 0; j != NumElts; ++j) { @@ -427,8 +434,26 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, Op, VT, IsOpSupported ? Action : NoBF16Action); }; + auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, + LegalizeAction NoI16x2Action) { + bool IsOpSupported = false; + // instructions are available on sm_90 only + switch (Op) { + case ISD::ADD: + case ISD::SMAX: + case ISD::SMIN: + case ISD::UMIN: + case ISD::UMAX: + case ISD::SUB: + IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80; + break; + } + setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action); + }; + addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass); addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass); + addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass); addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass); addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass); addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass); @@ -459,9 +484,17 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote); setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand); + + // Conversion to/from i16/i16x2 is always legal. + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand); + // Operations not directly supported by NVPTX. 
- for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, - MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::i32, MVT::i64}) { + for (MVT VT : + {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, MVT::f64, + MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64}) { setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::BR_CC, VT, Expand); } @@ -473,6 +506,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); @@ -493,10 +527,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::ROTR, MVT::i32, Legal); setOperationAction(ISD::ROTL, MVT::i16, Expand); + setOperationAction(ISD::ROTL, MVT::v2i16, Expand); setOperationAction(ISD::ROTR, MVT::i16, Expand); + setOperationAction(ISD::ROTR, MVT::v2i16, Expand); setOperationAction(ISD::ROTL, MVT::i8, Expand); setOperationAction(ISD::ROTR, MVT::i8, Expand); setOperationAction(ISD::BSWAP, MVT::i16, Expand); + setOperationAction(ISD::BSWAP, MVT::v2i16, Expand); setOperationAction(ISD::BSWAP, MVT::i32, Expand); setOperationAction(ISD::BSWAP, MVT::i64, Expand); @@ -584,6 +621,22 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::CTLZ, Ty, Legal); } + setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Expand); + setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Expand); + setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Expand); + setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Expand); + setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Expand); + setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand); + setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand); + + setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Expand); + setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Expand); + setI16x2OperationAction(ISD::AND, MVT::v2i16, Legal, Expand); + setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Expand); + setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Expand); + setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Expand); + setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Expand); + setOperationAction(ISD::ADDC, MVT::i32, Legal); setOperationAction(ISD::ADDE, MVT::i32, Legal); setOperationAction(ISD::SUBC, MVT::i32, Legal); @@ -596,6 +649,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, } setOperationAction(ISD::CTTZ, MVT::i16, Expand); + setOperationAction(ISD::CTTZ, MVT::v2i16, Expand); setOperationAction(ISD::CTTZ, MVT::i32, Expand); setOperationAction(ISD::CTTZ, MVT::i64, Expand); @@ -1318,7 +1372,7 @@ NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const { if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1) return TypeSplitVector; - if (Isv2f16Orv2bf16Type(VT)) + if (Isv2f16Orv2bf16Orv2i16Type(VT)) return TypeLegal; return TargetLoweringBase::getPreferredVectorAction(VT); } @@ -2098,15 +2152,31 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { // generates good SASS in both cases. 
 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
-  if (!(Isv2f16Orv2bf16Type(Op->getValueType(0)) &&
-        isa<ConstantFPSDNode>(Op->getOperand(0)) &&
-        isa<ConstantFPSDNode>(Op->getOperand(1))))
+  EVT VT = Op->getValueType(0);
+  if (!(Isv2f16Orv2bf16Orv2i16Type(VT)))
     return Op;
+  APInt E0;
+  APInt E1;
+  if (VT == MVT::v2f16 || VT == MVT::v2bf16) {
+    if (!(isa<ConstantFPSDNode>(Op->getOperand(0)) &&
+          isa<ConstantFPSDNode>(Op->getOperand(1))))
+      return Op;
+
+    E0 = cast<ConstantFPSDNode>(Op->getOperand(0))
+             ->getValueAPF()
+             .bitcastToAPInt();
+    E1 = cast<ConstantFPSDNode>(Op->getOperand(1))
+             ->getValueAPF()
+             .bitcastToAPInt();
+  } else {
+    assert(VT == MVT::v2i16);
+    if (!(isa<ConstantSDNode>(Op->getOperand(0)) &&
+          isa<ConstantSDNode>(Op->getOperand(1))))
+      return Op;
 
-  APInt E0 =
-      cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
-  APInt E1 =
-      cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
+    E0 = cast<ConstantSDNode>(Op->getOperand(0))->getAPIntValue();
+    E1 = cast<ConstantSDNode>(Op->getOperand(1))->getAPIntValue();
+  }
   SDValue Const = DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op),
                                   MVT::i32);
   return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const);
@@ -2122,7 +2192,8 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
   // Extract individual elements and select one of them.
   SDValue Vector = Op->getOperand(0);
   EVT VectorVT = Vector.getValueType();
-  assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
+  assert((VectorVT == MVT::v2f16 || VectorVT == MVT::v2i16) &&
+         "Unexpected vector type.");
   EVT EltVT = VectorVT.getVectorElementType();
 
   SDLoc dl(Op.getNode());
@@ -2470,7 +2541,7 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
 
   // v2f16 is legal, so we can't rely on legalizer to handle unaligned
   // loads and have to handle it here.
-  if (Isv2f16Orv2bf16Type(Op.getValueType())) {
+  if (Isv2f16Orv2bf16Orv2i16Type(Op.getValueType())) {
     LoadSDNode *Load = cast<LoadSDNode>(Op);
     EVT MemVT = Load->getMemoryVT();
     if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
@@ -2515,13 +2586,13 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 
   // v2f16 is legal, so we can't rely on legalizer to handle unaligned
   // stores and have to handle it here.
-  if (Isv2f16Orv2bf16Type(VT) &&
+  if (Isv2f16Orv2bf16Orv2i16Type(VT) &&
       !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                       VT, *Store->getMemOperand()))
     return expandUnalignedStore(Store, DAG);
 
-  // v2f16 and v2bf16 don't need special handling.
-  if (VT == MVT::v2f16 || VT == MVT::v2bf16)
+  // v2f16, v2bf16 and v2i16 don't need special handling.
+  if (VT == MVT::v2f16 || VT == MVT::v2bf16 || VT == MVT::v2i16)
     return SDValue();
 
   if (VT.isVector())
@@ -2562,6 +2633,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
   case MVT::v4f32:
   case MVT::v8f16:  // <4 x f16x2>
   case MVT::v8bf16: // <4 x bf16x2>
+  case MVT::v8i16:  // <4 x i16x2>
     // This is a "native" vector type
     break;
   }
@@ -2606,8 +2678,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
     // v8f16 is a special case. PTX doesn't have st.v8.f16
     // instruction. Instead, we split the vector into v2f16 chunks and
     // store them with st.v4.b32.
-    assert(Isf16Orbf16Type(EltVT.getSimpleVT()) &&
-           "Wrong type for the vector.");
+    assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
     Opcode = NVPTXISD::StoreV4;
     StoreF16x2 = true;
     break;
@@ -2793,7 +2864,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
         EVT LoadVT = EltVT;
         if (EltVT == MVT::i1)
           LoadVT = MVT::i8;
-        else if (Isv2f16Orv2bf16Type(EltVT))
+        else if (Isv2f16Orv2bf16Orv2i16Type(EltVT))
           // getLoad needs a vector type, but it can't handle
           // vectors which contain v2f16 or v2bf16 elements. So we must load
           // using i32 here and then bitcast back.
@@ -2819,7 +2890,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
           if (EltVT == MVT::i1)
             Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
           // v2f16 was loaded as an i32. Now we must bitcast it back.
-          else if (Isv2f16Orv2bf16Type(EltVT))
+          else if (Isv2f16Orv2bf16Orv2i16Type(EltVT))
             Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);
 
           // If a promoted integer type is used, truncate down to the original
@@ -5198,6 +5269,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
   case MVT::v4f16:
   case MVT::v4f32:
   case MVT::v8f16: // <4 x f16x2>
+  case MVT::v8i16: // <4 x i16x2>
     // This is a "native" vector type
     break;
   }
@@ -5250,11 +5322,16 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
     // v8f16 is a special case. PTX doesn't have ld.v8.f16
     // instruction. Instead, we split the vector into v2f16 chunks and
     // load them with ld.v4.b32.
-    assert(Isf16Orbf16Type(EltVT.getSimpleVT()) &&
-           "Unsupported v8 vector type.");
+    assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
     LoadF16x2 = true;
     Opcode = NVPTXISD::LoadV4;
-    EVT VVT = (EltVT == MVT::f16) ? MVT::v2f16 : MVT::v2bf16;
+    EVT VVT;
+    if (EltVT == MVT::f16)
+      VVT = MVT::v2f16;
+    else if (EltVT == MVT::bf16)
+      VVT = MVT::v2bf16;
+    else if (EltVT == MVT::i16)
+      VVT = MVT::v2i16;
     EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
     LdResVTs = DAG.getVTList(ListVTs);
     break;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 4d4dcca2f53e6..3d5e84c2298ca 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -165,6 +165,7 @@ class ValueToRegClass<string name> {
   NVPTXRegClass ret = !cond(
     !eq(name, "i1"): Int1Regs,
     !eq(name, "i16"): Int16Regs,
+    !eq(name, "v2i16"): Int32Regs,
     !eq(name, "i32"): Int32Regs,
     !eq(name, "i64"): Int64Regs,
     !eq(name, "f16"): Int16Regs,
@@ -214,6 +215,12 @@ multiclass I3<string OpcStr, SDNode OpNode> {
                [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
 }
 
+class I16x2<string OpcStr, SDNode OpNode> :
+  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+            !strconcat(OpcStr, "16x2 \t$dst, $a, $b;"),
+            [(set Int32Regs:$dst, (OpNode (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)))]>,
+  Requires<[hasPTX<80>, hasSM<90>]>;
+
 // Template for instructions which take 3 int args.  The instructions are
 // named ".s32" (e.g. "addc.cc.s32").
 multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
@@ -747,6 +754,13 @@ def SELP_f16x2rr :
                 [(set Int32Regs:$dst,
                       (select Int1Regs:$p, (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>;
 
+def SELP_i16x2rr :
+    NVPTXInst<(outs Int32Regs:$dst),
+              (ins Int32Regs:$a, Int32Regs:$b, Int1Regs:$p),
+              "selp.b32 \t$dst, $a, $b, $p;",
+              [(set Int32Regs:$dst,
+                    (select Int1Regs:$p, (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)))]>;
+
 //-----------------------------------
 // Test Instructions
 //-----------------------------------
@@ -787,6 +801,9 @@ defm SUB_i1 : ADD_SUB_i1<sub>;
 defm ADD : I3<"add.s", add>;
 defm SUB : I3<"sub.s", sub>;
 
+def ADD16x2 : I16x2<"add.s", add>;
+def SUB16x2 : I16x2<"sub.s", sub>;
+
 // int32 and int64 addition and subtraction with carry-out.
 defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>;
 defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;
@@ -826,6 +843,12 @@ defm UMAX : I3<"max.u", umax>;
 defm SMIN : I3<"min.s", smin>;
 defm UMIN : I3<"min.u", umin>;
 
+def SMAX16x2 : I16x2<"max.s", smax>;
+def UMAX16x2 : I16x2<"max.u", umax>;
+def SMIN16x2 : I16x2<"min.s", smin>;
+def UMIN16x2 : I16x2<"min.u", umin>;
+
+
 //
 // Wide multiplication
 //
@@ -2633,7 +2656,7 @@ foreach vt = [f16, bf16] in {
   def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 Int16Regs:$src)>;
 }
 
-foreach vt = [v2f16, v2bf16] in {
+foreach vt = [v2f16, v2bf16, v2i16] in {
   def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 Int32Regs:$src)>;
 }
 
@@ -2948,6 +2971,11 @@ def: Pat<(i16 (bitconvert (vt Int16Regs:$a))),
           (ProxyRegI16 Int16Regs:$a)>;
 }
 
+def: Pat<(v2i16 (bitconvert (i32 Int32Regs:$a))),
+         (ProxyRegI32 Int32Regs:$a)>;
+def: Pat<(i32 (bitconvert (v2i16 Int32Regs:$a))),
+         (ProxyRegI32 Int32Regs:$a)>;
+
 // NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
 // we cannot specify floating-point literals in isel patterns. Therefore, we
 // use an integer selp to select either 1 or 0 and then cvt to floating-point.
@@ -3300,6 +3328,13 @@ def : Pat<(bf16 (extractelt (v2bf16 Int32Regs:$src), 1)), def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))), (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>; +def : Pat<(i16 (extractelt (v2i16 Int32Regs:$src), 0)), + (I32toI16L Int32Regs:$src)>; +def : Pat<(i16 (extractelt (v2i16 Int32Regs:$src), 1)), + (I32toI16H Int32Regs:$src)>; +def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))), + (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>; + // Count leading zeros let hasSideEffects = false in { def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a), diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td index b62460e8cd31f..ed9dabf39dd7a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -58,7 +58,7 @@ foreach i = 0...31 in { //===----------------------------------------------------------------------===// def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>; def Int16Regs : NVPTXRegClass<[i16, f16, bf16], 16, (add (sequence "RS%u", 0, 4))>; -def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16], 32, +def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16], 32, (add (sequence "R%u", 0, 4), VRFrame32, VRFrameLocal32)>; def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>; diff --git a/llvm/test/CodeGen/NVPTX/dag-cse.ll b/llvm/test/CodeGen/NVPTX/dag-cse.ll index 0b21cdebd87cd..bbfedf42ad548 100644 --- a/llvm/test/CodeGen/NVPTX/dag-cse.ll +++ b/llvm/test/CodeGen/NVPTX/dag-cse.ll @@ -12,8 +12,8 @@ ; CHECK: ld.global.v2.u8 {%[[B1:rs[0-9]+]], %[[B2:rs[0-9]+]]}, [a]; ; CHECK: st.global.v2.u8 [b], {%[[B1]], %[[B2]]}; ; -; CHECK: ld.global.v2.u16 {%[[C1:rs[0-9]+]], %[[C2:rs[0-9]+]]}, [a]; -; CHECK: st.global.v2.u16 [c], {%[[C1]], %[[C2]]}; +; CHECK: ld.global.u32 %[[C:r[0-9]+]], [a]; +; CHECK: st.global.u32 [c], %[[C]]; define void @test1() #0 { %1 = load <2 x i8>, ptr addrspace(1) @a, align 8 diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index dba6691e30aa2..4eb538628a8de 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -993,9 +993,7 @@ define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { ; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16( ; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0]; -; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A]] -; CHECK-DAG: mov.b32 {tmp, [[R1:%rs[0-9]+]]}, [[A]]; -; CHECK: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[A]] ; CHECK: ret; define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 { %r = bitcast <2 x half> %a to <2 x i16> @@ -1003,11 +1001,7 @@ define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 { } ; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf( -; CHECK: ld.param.v2.u16 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [test_bitcast_2xi16_to_2xhalf_param_0]; -; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[RS0]]; -; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[RS1]]; -; CHECK-DAG: shl.b32 [[R1H:%r[0-9]+]], [[R1]], 16; -; CHECK-DAG: or.b32 [[R:%r[0-9]+]], [[R0]], [[R1H]]; +; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_bitcast_2xi16_to_2xhalf_param_0]; ; CHECK: st.param.b32 [func_retval0+0], [[R]]; ; CHECK: ret; define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 { diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll 
b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
new file mode 100644
index 0000000000000..265e2b99af215
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
@@ -0,0 +1,533 @@
+; ## Support i16x2 instructions
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 -asm-verbose=false \
; RUN:   -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-I16x2 %s
; RUN: %if ptxas %{ \
; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -asm-verbose=false \
; RUN:   -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN:   | %ptxas-verify -arch=sm_53 \
; RUN: %}
; ## No support for i16x2 instructions
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
; RUN:   -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
; RUN:   -verify-machineinstrs \
; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOI16x2 %s
; RUN: %if ptxas %{ \
; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
; RUN:   -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
; RUN:   -verify-machineinstrs \
; RUN:   | %ptxas-verify -arch=sm_53 \
; RUN: %}

target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"

; CHECK-LABEL: test_ret_const(
; CHECK: mov.u32 [[R:%r[0-9]+]], 131073;
; CHECK: st.param.b32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define <2 x i16> @test_ret_const() #0 {
  ret <2 x i16> <i16 1, i16 2>
}

; CHECK-LABEL: test_extract_0(
; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_extract_0_param_0];
; CHECK: mov.b32 {[[RS:%rs[0-9]+]], tmp}, [[A]];
; CHECK: cvt.u32.u16 [[R:%r[0-9]+]], [[RS]];
; CHECK: st.param.b32 [func_retval0+0], [[R]];
; CHECK: ret;
define i16 @test_extract_0(<2 x i16> %a) #0 {
  %e = extractelement <2 x i16> %a, i32 0
  ret i16 %e
}

; CHECK-LABEL: test_extract_1(
; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_extract_1_param_0];
; CHECK: mov.b32 {tmp, [[RS:%rs[0-9]+]]}, [[A]];
; CHECK: cvt.u32.u16 [[R:%r[0-9]+]], [[RS]];
; CHECK: st.param.b32 [func_retval0+0], [[R]];
; CHECK: ret;
define i16 @test_extract_1(<2 x i16> %a) #0 {
  %e = extractelement <2 x i16> %a, i32 1
  ret i16 %e
}

; CHECK-LABEL: test_extract_i(
; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_extract_i_param_0];
; CHECK-DAG: ld.param.u64 [[IDX:%rd[0-9]+]], [test_extract_i_param_1];
; CHECK-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0;
; CHECK-DAG: mov.b32 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[A]];
; CHECK: selp.b16 [[RS:%rs[0-9]+]], [[E0]], [[E1]], [[PRED]];
; CHECK: cvt.u32.u16 [[R:%r[0-9]+]], [[RS]];
; CHECK: st.param.b32 [func_retval0+0], [[R]];
; CHECK: ret;
define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 {
  %e = extractelement <2 x i16> %a, i64 %idx
  ret i16 %e
}

; CHECK-LABEL: test_add(
; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_add_param_0];
; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_add_param_1];
;
; CHECK-I16x2-NEXT: add.s16x2 [[R:%r[0-9]+]], [[A]], [[B]];
;
; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
; CHECK-NOI16x2-DAG: add.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
; CHECK-NOI16x2-DAG: add.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]];
; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
;
; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 {
  %r = add <2 x i16> %a, %b
  ret <2 x i16> %r
}

; Check that we can lower add with immediate arguments.
; CHECK-LABEL: test_add_imm_0(
; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_add_imm_0_param_0];
;
; CHECK-I16x2: mov.u32 [[I:%r[0-9]+]], 131073;
; CHECK-I16x2: add.s16x2 [[R:%r[0-9]+]], [[A]], [[I]];
;
; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
; CHECK-NOI16x2-DAG: add.s16 [[RS2:%rs[0-9]+]], [[RS0]], 1;
; CHECK-NOI16x2-DAG: add.s16 [[RS3:%rs[0-9]+]], [[RS1]], 2;
; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS2]], [[RS3]]};
;
; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 {
  %r = add <2 x i16> <i16 1, i16 2>, %a
  ret <2 x i16> %r
}

; CHECK-LABEL: test_add_imm_1(
; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_add_imm_1_param_0];
;
; CHECK-I16x2: mov.u32 [[I:%r[0-9]+]], 131073;
; CHECK-I16x2: add.s16x2 [[R:%r[0-9]+]], [[A]], [[I]];
;
; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
; CHECK-NOI16x2-DAG: add.s16 [[RS2:%rs[0-9]+]], [[RS0]], 1;
; CHECK-NOI16x2-DAG: add.s16 [[RS3:%rs[0-9]+]], [[RS1]], 2;
; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS2]], [[RS3]]};
;
; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 {
  %r = add <2 x i16> %a, <i16 1, i16 2>
  ret <2 x i16> %r
}

; CHECK-LABEL: test_sub(
; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_sub_param_0];
;
; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_sub_param_1];
; CHECK-I16x2: sub.s16x2 [[R:%r[0-9]+]], [[A]], [[B]];
;
; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
; CHECK-NOI16x2-DAG: sub.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
; CHECK-NOI16x2-DAG: sub.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]];
; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
;
; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 {
  %r = sub <2 x i16> %a, %b
  ret <2 x i16> %r
}

; CHECK-LABEL: test_smax(
; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_smax_param_0];
;
; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_smax_param_1];
; CHECK-I16x2: max.s16x2 [[R:%r[0-9]+]], [[A]], [[B]];
;
; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
; CHECK-NOI16x2-DAG: setp.gt.s16 [[P0:%p[0-9]+]], [[RS0]], [[RS2]];
; CHECK-NOI16x2-DAG: setp.gt.s16 [[P1:%p[0-9]+]], [[RS1]], [[RS3]];
; CHECK-NOI16x2-DAG: selp.b16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]], [[P0]];
; CHECK-NOI16x2-DAG: selp.b16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]], [[P1]];
; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
;
; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 {
  %cmp = icmp sgt <2 x i16> %a, %b
  %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  ret <2 x i16> %r
}

; CHECK-LABEL: test_umax(
; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_umax_param_0];
;
; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_umax_param_1];
; CHECK-I16x2: max.u16x2 [[R:%r[0-9]+]], [[A]], [[B]];
;
; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
; CHECK-NOI16x2-DAG: setp.gt.u16
[[P0:%p[0-9]+]], [[RS0]], [[RS2]]; +; CHECK-NOI16x2-DAG: setp.gt.u16 [[P1:%p[0-9]+]], [[RS1]], [[RS3]]; +; CHECK-NOI16x2-DAG: selp.b16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]], [[P0]]; +; CHECK-NOI16x2-DAG: selp.b16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]], [[P1]]; +; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; +; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 { + %cmp = icmp ugt <2 x i16> %a, %b + %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b + ret <2 x i16> %r +} + +; CHECK-LABEL: test_smin( +; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_smin_param_0]; +; +; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_smin_param_1]; +; CHECK-I16x2: min.s16x2 [[R:%r[0-9]+]], [[A]], [[B]]; +; +; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; +; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; +; CHECK-NOI16x2-DAG: setp.le.s16 [[P0:%p[0-9]+]], [[RS0]], [[RS2]]; +; CHECK-NOI16x2-DAG: setp.le.s16 [[P1:%p[0-9]+]], [[RS1]], [[RS3]]; +; CHECK-NOI16x2-DAG: selp.b16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]], [[P0]]; +; CHECK-NOI16x2-DAG: selp.b16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]], [[P1]]; +; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; +; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 { + %cmp = icmp sle <2 x i16> %a, %b + %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b + ret <2 x i16> %r +} + +; CHECK-LABEL: test_umin( +; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_umin_param_0]; +; +; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_umin_param_1]; +; CHECK-I16x2: min.u16x2 [[R:%r[0-9]+]], [[A]], [[B]]; +; +; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; +; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; +; CHECK-NOI16x2-DAG: setp.le.u16 [[P0:%p[0-9]+]], [[RS0]], [[RS2]]; +; CHECK-NOI16x2-DAG: setp.le.u16 [[P1:%p[0-9]+]], [[RS1]], [[RS3]]; +; CHECK-NOI16x2-DAG: selp.b16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]], [[P0]]; +; CHECK-NOI16x2-DAG: selp.b16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]], [[P1]]; +; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; +; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 { + %cmp = icmp ule <2 x i16> %a, %b + %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b + ret <2 x i16> %r +} + +; CHECK-LABEL: test_mul( +; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_mul_param_0]; +; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_mul_param_1]; +; +; CHECK-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; +; CHECK-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; +; CHECK-DAG: mul.lo.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; +; CHECK-DAG: mul.lo.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; +; CHECK-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; +; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 { + %r = mul <2 x i16> %a, %b + ret <2 x i16> %r +} + + +; CHECK-LABEL: .func test_ldst_v2i16( +; CHECK-DAG: ld.param.u64 [[A:%rd[0-9]+]], [test_ldst_v2i16_param_0]; +; CHECK-DAG: ld.param.u64 [[B:%rd[0-9]+]], [test_ldst_v2i16_param_1]; +; CHECK-DAG: ld.u32 [[E:%r[0-9]+]], [[[A]]]; +; CHECK-DAG: st.u32 [[[B]]], [[E]]; +; CHECK: ret; +define void @test_ldst_v2i16(ptr %a, ptr %b) { + %t1 = load <2 x i16>, 
ptr %a
  store <2 x i16> %t1, ptr %b, align 16
  ret void
}

; CHECK-LABEL: .func test_ldst_v3i16(
; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3i16_param_0];
; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3i16_param_1];
; -- v3 is inconvenient to capture as it's lowered as ld.b64 plus a fair
;    number of bit-shifting instructions that may change at LLVM's whim.
;    So we only verify that we issue the correct number of writes using the
;    correct offsets, but not the values we write.
; CHECK-DAG: ld.u64
; CHECK-DAG: st.u32 [%[[B]]],
; CHECK-DAG: st.u16 [%[[B]]+4],
; CHECK: ret;
define void @test_ldst_v3i16(ptr %a, ptr %b) {
  %t1 = load <3 x i16>, ptr %a
  store <3 x i16> %t1, ptr %b, align 16
  ret void
}

; CHECK-LABEL: .func test_ldst_v4i16(
; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v4i16_param_0];
; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v4i16_param_1];
; CHECK-DAG: ld.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [%[[A]]];
; CHECK-DAG: st.v4.u16 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
; CHECK: ret;
define void @test_ldst_v4i16(ptr %a, ptr %b) {
  %t1 = load <4 x i16>, ptr %a
  store <4 x i16> %t1, ptr %b, align 16
  ret void
}

; CHECK-LABEL: .func test_ldst_v8i16(
; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v8i16_param_0];
; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v8i16_param_1];
; CHECK-DAG: ld.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]];
; CHECK-DAG: st.v4.b32 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
; CHECK: ret;
define void @test_ldst_v8i16(ptr %a, ptr %b) {
  %t1 = load <8 x i16>, ptr %a
  store <8 x i16> %t1, ptr %b, align 16
  ret void
}

declare <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b) #0

; CHECK-LABEL: test_call(
; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_call_param_0];
; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_call_param_1];
; CHECK: {
; CHECK-DAG: .param .align 4 .b8 param0[4];
; CHECK-DAG: .param .align 4 .b8 param1[4];
; CHECK-DAG: st.param.b32 [param0+0], [[A]];
; CHECK-DAG: st.param.b32 [param1+0], [[B]];
; CHECK-DAG: .param .align 4 .b8 retval0[4];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_callee,
; CHECK: );
; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
; CHECK-NEXT: }
; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 {
  %r = call <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b)
  ret <2 x i16> %r
}

; CHECK-LABEL: test_call_flipped(
; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_call_flipped_param_0];
; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_call_flipped_param_1];
; CHECK: {
; CHECK-DAG: .param .align 4 .b8 param0[4];
; CHECK-DAG: .param .align 4 .b8 param1[4];
; CHECK-DAG: st.param.b32 [param0+0], [[B]];
; CHECK-DAG: st.param.b32 [param1+0], [[A]];
; CHECK-DAG: .param .align 4 .b8 retval0[4];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_callee,
; CHECK: );
; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
; CHECK-NEXT: }
; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
; CHECK-NEXT: ret;
define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
  %r = call <2 x i16> @test_callee(<2 x i16> %b, <2 x i16> %a)
  ret <2 x i16> %r
}

; CHECK-LABEL: test_tailcall_flipped(
; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_tailcall_flipped_param_0];
; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]],
[test_tailcall_flipped_param_1]; +; CHECK: { +; CHECK-DAG: .param .align 4 .b8 param0[4]; +; CHECK-DAG: .param .align 4 .b8 param1[4]; +; CHECK-DAG: st.param.b32 [param0+0], [[B]]; +; CHECK-DAG: st.param.b32 [param1+0], [[A]]; +; CHECK-DAG: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK: ); +; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 { + %r = tail call <2 x i16> @test_callee(<2 x i16> %b, <2 x i16> %a) + ret <2 x i16> %r +} + +; CHECK-LABEL: test_select( +; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_select_param_0]; +; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_select_param_1]; +; CHECK-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2] +; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; +; CHECK-NEXT: selp.b32 [[R:%r[0-9]+]], [[A]], [[B]], [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x i16> @test_select(<2 x i16> %a, <2 x i16> %b, i1 zeroext %c) #0 { + %r = select i1 %c, <2 x i16> %a, <2 x i16> %b + ret <2 x i16> %r +} + +; CHECK-LABEL: test_select_cc( +; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_select_cc_param_0]; +; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_select_cc_param_1]; +; CHECK-DAG: ld.param.u32 [[C:%r[0-9]+]], [test_select_cc_param_2]; +; CHECK-DAG: ld.param.u32 [[D:%r[0-9]+]], [test_select_cc_param_3]; +; CHECK-DAG: mov.b32 {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]] +; CHECK-DAG: mov.b32 {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]] +; CHECK-DAG: setp.ne.s16 [[P0:%p[0-9]+]], [[C0]], [[D0]] +; CHECK-DAG: setp.ne.s16 [[P1:%p[0-9]+]], [[C1]], [[D1]] +; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] +; CHECK-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]]; +; CHECK-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]]; +; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x i16> @test_select_cc(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) #0 { + %cc = icmp ne <2 x i16> %c, %d + %r = select <2 x i1> %cc, <2 x i16> %a, <2 x i16> %b + ret <2 x i16> %r +} + +; CHECK-LABEL: test_select_cc_i32_i16( +; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_select_cc_i32_i16_param_0]; +; CHECK-DAG: ld.param.v2.u32 {[[B0:%r[0-9]+]], [[B1:%r[0-9]+]]}, [test_select_cc_i32_i16_param_1]; +; CHECK-DAG: ld.param.u32 [[C:%r[0-9]+]], [test_select_cc_i32_i16_param_2]; +; CHECK-DAG: ld.param.u32 [[D:%r[0-9]+]], [test_select_cc_i32_i16_param_3]; +; CHECK-DAG: mov.b32 {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]] +; CHECK-DAG: mov.b32 {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]] +; CHECK-DAG: setp.ne.s16 [[P0:%p[0-9]+]], [[C0]], [[D0]] +; CHECK-DAG: setp.ne.s16 [[P1:%p[0-9]+]], [[C1]], [[D1]] +; CHECK-DAG: selp.b32 [[R0:%r[0-9]+]], [[A0]], [[B0]], [[P0]]; +; CHECK-DAG: selp.b32 [[R1:%r[0-9]+]], [[A1]], [[B1]], [[P1]]; +; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b, + <2 x i16> %c, <2 x i16> %d) #0 { + %cc = icmp ne <2 x i16> %c, %d + %r = select <2 x i1> %cc, <2 x i32> %a, <2 x i32> %b + ret <2 x i32> %r +} + +; CHECK-LABEL: test_select_cc_i16_i32( +; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], 
[test_select_cc_i16_i32_param_0]; +; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_select_cc_i16_i32_param_1]; +; CHECK-DAG: ld.param.v2.u32 {[[C0:%r[0-9]+]], [[C1:%r[0-9]+]]}, [test_select_cc_i16_i32_param_2]; +; CHECK-DAG: ld.param.v2.u32 {[[D0:%r[0-9]+]], [[D1:%r[0-9]+]]}, [test_select_cc_i16_i32_param_3]; +; CHECK-DAG: setp.ne.s32 [[P0:%p[0-9]+]], [[C0]], [[D0]] +; CHECK-DAG: setp.ne.s32 [[P1:%p[0-9]+]], [[C1]], [[D1]] +; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] +; CHECK-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]]; +; CHECK-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]]; +; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b, + <2 x i32> %c, <2 x i32> %d) #0 { + %cc = icmp ne <2 x i32> %c, %d + %r = select <2 x i1> %cc, <2 x i16> %a, <2 x i16> %b + ret <2 x i16> %r +} + + +; CHECK-LABEL: test_trunc_2xi32( +; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_trunc_2xi32_param_0]; +; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 { + %r = trunc <2 x i32> %a to <2 x i16> + ret <2 x i16> %r +} + +; CHECK-LABEL: test_trunc_2xi64( +; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_trunc_2xi64_param_0]; +; CHECK-DAG: cvt.u16.u64 [[R0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.u16.u64 [[R1:%rs[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x i16> @test_trunc_2xi64(<2 x i64> %a) #0 { + %r = trunc <2 x i64> %a to <2 x i16> + ret <2 x i16> %r +} + +; CHECK-LABEL: test_zext_2xi32( +; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_zext_2xi32_param_0]; +; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[A1]]; +; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK: ret; +define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 { + %r = zext <2 x i16> %a to <2 x i32> + ret <2 x i32> %r +} + +; CHECK-LABEL: test_zext_2xi64( +; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_zext_2xi64_param_0]; +; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.u64.u16 [[R0:%rd[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.u64.u16 [[R1:%rd[0-9]+]], [[A1]]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK: ret; +define <2 x i64> @test_zext_2xi64(<2 x i16> %a) #0 { + %r = zext <2 x i16> %a to <2 x i64> + ret <2 x i64> %r +} + +; CHECK-LABEL: test_bitcast_i32_to_2xi16( +; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_bitcast_i32_to_2xi16_param_0]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x i16> @test_bitcast_i32_to_2xi16(i32 %a) #0 { + %r = bitcast i32 %a to <2 x i16> + ret <2 x i16> %r +} + +; CHECK-LABEL: test_bitcast_2xi16_to_i32( +; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_bitcast_2xi16_to_i32_param_0]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define i32 @test_bitcast_2xi16_to_i32(<2 x i16> %a) #0 { + %r = bitcast <2 x i16> %a to i32 + ret i32 %r +} + +; CHECK-LABEL: test_shufflevector( +; CHECK: ld.param.u32 [[R:%r[0-9]+]], 
[test_shufflevector_param_0];
; CHECK: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[R]];
; CHECK: mov.b32 [[R1:%r[0-9]+]], {[[RS1]], [[RS0]]};
; CHECK: st.param.b32 [func_retval0+0], [[R1]];
; CHECK: ret;
define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 {
  %s = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i16> %s
}

; CHECK-LABEL: test_insertelement(
; CHECK: ld.param.u16 [[B:%rs[0-9]+]], [test_insertelement_param_1];
; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_insertelement_param_0];
; CHECK: { .reg .b16 tmp; mov.b32 {[[R0:%rs[0-9]+]], tmp}, [[A]]; }
; CHECK: mov.b32 [[R1:%r[0-9]+]], {[[R0]], [[B]]};
; CHECK: st.param.b32 [func_retval0+0], [[R1]];
; CHECK: ret;
define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 {
  %i = insertelement <2 x i16> %a, i16 %x, i64 1
  ret <2 x i16> %i
}

attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
index aa9a1280abcd6..9012339fb6b1e 100644
--- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
+++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -80,9 +80,9 @@ define void @foo7(ptr noalias readonly %from, ptr %to) {
 }
 
 ; SM20-LABEL: .visible .entry foo8(
-; SM20: ld.global.v2.u16
+; SM20: ld.global.u32
 ; SM35-LABEL: .visible .entry foo8(
-; SM35: ld.global.nc.v2.u16
+; SM35: ld.global.nc.u32
 define void @foo8(ptr noalias readonly %from, ptr %to) {
   %1 = load <2 x i16>, ptr %from
   store <2 x i16> %1, ptr %to
diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll
index 313a0915d2030..2d87271e30ae0 100644
--- a/llvm/test/CodeGen/NVPTX/param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll
@@ -326,7 +326,8 @@ define signext i16 @test_i16s(i16 signext %a) {
 ; CHECK-LABEL: test_v3i16(
 ; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8]
 ; CHECK-DAG: ld.param.u16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4];
-; CHECK-DAG: ld.param.v2.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0];
+; CHECK-DAG: ld.param.u32 [[R:%r[0-9]+]], [test_v3i16_param_0];
+; CHECK-DAG: mov.b32 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[R]];
 ; CHECK: .param .align 8 .b8 param0[8];
 ; CHECK: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
 ; CHECK: st.param.b16 [param0+4], [[E2]];
@@ -346,14 +347,14 @@ define <3 x i16> @test_v3i16(<3 x i16> %a) {
 ; CHECK: .func (.param .align 8 .b8 func_retval0[8])
 ; CHECK-LABEL: test_v4i16(
 ; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8]
-; CHECK: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i16_param_0]
+; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v4i16_param_0]
 ; CHECK: .param .align 8 .b8 param0[8];
-; CHECK: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
 ; CHECK: .param .align 8 .b8 retval0[8];
 ; CHECK: call.uni (retval0),
 ; CHECK-NEXT: test_v4i16,
-; CHECK: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
-; CHECK: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}
 ; CHECK-NEXT: ret;
 define <4 x i16> @test_v4i16(<4 x i16>
%a) { ; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16] ; CHECK-DAG: ld.param.u16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8]; ; CHECK-DAG: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0] +; CHECK-DAG: mov.b32 [[R0:%r[0-9]+]], {[[E0]], [[E1]]}; +; CHECK-DAG: mov.b32 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[R0]]; +; CHECK-DAG: mov.b32 [[R1:%r[0-9]+]], {[[E2]], [[E3]]}; +; CHECK-DAG: mov.b32 {[[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [[R1]]; ; CHECK: .param .align 16 .b8 param0[16]; ; CHECK-DAG: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; ; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; diff --git a/llvm/test/CodeGen/NVPTX/vec-param-load.ll b/llvm/test/CodeGen/NVPTX/vec-param-load.ll index f8b16c1feab5e..f4f5c26be3474 100644 --- a/llvm/test/CodeGen/NVPTX/vec-param-load.ll +++ b/llvm/test/CodeGen/NVPTX/vec-param-load.ll @@ -70,14 +70,10 @@ define <8 x i64> @test_v8i64(<8 x i64> %a) { define <16 x i16> @test_v16i16(<16 x i16> %a) { ; CHECK-LABEL: test_v16i16( -; CHECK-DAG: ld.param.v4.u16 {[[V_12_15:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+24]; -; CHECK-DAG: ld.param.v4.u16 {[[V_8_11:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16]; -; CHECK-DAG: ld.param.v4.u16 {[[V_4_7:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+8]; -; CHECK-DAG: ld.param.v4.u16 {[[V_0_3:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0]; -; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[V_0_3]]} -; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[V_4_7]]} -; CHECK-DAG: st.param.v4.b16 [func_retval0+16], {[[V_8_11]]} -; CHECK-DAG: st.param.v4.b16 [func_retval0+24], {[[V_12_15]]} +; CHECK-DAG: ld.param.v4.u32 {[[V_8_15:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16]; +; CHECK-DAG: ld.param.v4.u32 {[[V_0_7:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0]; +; CHECK-DAG: st.param.v4.b32 [func_retval0+0], {[[V_0_7]]} +; CHECK-DAG: st.param.v4.b32 [func_retval0+16], {[[V_8_15]]} ; CHECK: ret; ret <16 x i16> %a } diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic-inseltpoison.ll deleted file mode 100644 index aeb4aa8078aa2..0000000000000 --- a/llvm/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic-inseltpoison.ll +++ /dev/null @@ -1,57 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer -o - -S -slp-threshold=-1000 | FileCheck %s - -target datalayout = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64" -target triple = "nvptx--nvidiacl" - -; CTLZ cannot be vectorized currently because the second argument is a scalar -; for both the scalar and vector forms of the intrinsic. In the future it -; should be possible to vectorize such functions. -; Test causes an assert if LLVM tries to vectorize CTLZ. 
- -define <2 x i8> @cltz_test(<2 x i8> %x) #0 { -; CHECK-LABEL: @cltz_test( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0 -; CHECK-NEXT: [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false) -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i8> poison, i8 [[CALL_I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i8> [[X]], i32 1 -; CHECK-NEXT: [[CALL_I4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 false) -; CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <2 x i8> [[VECINIT]], i8 [[CALL_I4]], i32 1 -; CHECK-NEXT: ret <2 x i8> [[VECINIT2]] -; -entry: - %0 = extractelement <2 x i8> %x, i32 0 - %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false) - %vecinit = insertelement <2 x i8> poison, i8 %call.i, i32 0 - %1 = extractelement <2 x i8> %x, i32 1 - %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false) - %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1 - ret <2 x i8> %vecinit2 -} - -define <2 x i8> @cltz_test2(<2 x i8> %x) #1 { -; CHECK-LABEL: @cltz_test2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i8> [[X]], i32 1 -; CHECK-NEXT: [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false) -; CHECK-NEXT: [[CALL_I4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 false) -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i8> poison, i8 [[CALL_I]], i32 0 -; CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <2 x i8> [[VECINIT]], i8 [[CALL_I4]], i32 1 -; CHECK-NEXT: ret <2 x i8> [[VECINIT2]] -; -entry: - %0 = extractelement <2 x i8> %x, i32 0 - %1 = extractelement <2 x i8> %x, i32 1 - %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false) - %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false) - %vecinit = insertelement <2 x i8> poison, i8 %call.i, i32 0 - %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1 - ret <2 x i8> %vecinit2 -} - -declare i8 @llvm.ctlz.i8(i8, i1) #3 - -attributes #0 = { alwaysinline nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic.ll b/llvm/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic.ll deleted file mode 100644 index 4f92fa7c3d0f4..0000000000000 --- a/llvm/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic.ll +++ /dev/null @@ -1,57 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer -o - -S -slp-threshold=-1000 | FileCheck %s - -target datalayout = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64" -target triple = "nvptx--nvidiacl" - -; CTLZ cannot be vectorized currently because the second argument is a scalar -; for both the scalar and vector forms of the intrinsic. In the future it -; should be possible to vectorize such functions. -; Test causes an assert if LLVM tries to vectorize CTLZ. 
- -define <2 x i8> @cltz_test(<2 x i8> %x) #0 { -; CHECK-LABEL: @cltz_test( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0 -; CHECK-NEXT: [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false) -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i8> undef, i8 [[CALL_I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i8> [[X]], i32 1 -; CHECK-NEXT: [[CALL_I4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 false) -; CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <2 x i8> [[VECINIT]], i8 [[CALL_I4]], i32 1 -; CHECK-NEXT: ret <2 x i8> [[VECINIT2]] -; -entry: - %0 = extractelement <2 x i8> %x, i32 0 - %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false) - %vecinit = insertelement <2 x i8> undef, i8 %call.i, i32 0 - %1 = extractelement <2 x i8> %x, i32 1 - %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false) - %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1 - ret <2 x i8> %vecinit2 -} - -define <2 x i8> @cltz_test2(<2 x i8> %x) #1 { -; CHECK-LABEL: @cltz_test2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i8> [[X]], i32 1 -; CHECK-NEXT: [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false) -; CHECK-NEXT: [[CALL_I4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 false) -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i8> undef, i8 [[CALL_I]], i32 0 -; CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <2 x i8> [[VECINIT]], i8 [[CALL_I4]], i32 1 -; CHECK-NEXT: ret <2 x i8> [[VECINIT2]] -; -entry: - %0 = extractelement <2 x i8> %x, i32 0 - %1 = extractelement <2 x i8> %x, i32 1 - %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false) - %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false) - %vecinit = insertelement <2 x i8> undef, i8 %call.i, i32 0 - %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1 - ret <2 x i8> %vecinit2 -} - -declare i8 @llvm.ctlz.i8(i8, i1) #3 - -attributes #0 = { alwaysinline nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll b/llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll new file mode 100644 index 0000000000000..e6ad2adba1759 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -passes=slp-vectorizer -o - -S -slp-threshold=-1000 | FileCheck %s + +target datalayout = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64" +target triple = "nvptx--nvidiacl" + +; Test that CTLZ can be vectorized currently even though the second argument is a scalar + +define <2 x i8> @cltz_test(<2 x i8> %x) #0 { +; CHECK-LABEL: @cltz_test( +; CHECK: [[VEC:%.*]] = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %{{.*}}, i1 false) +; CHECK-NEXT: ret <2 x i8> [[VEC]] +; +entry: + %0 = extractelement <2 x i8> %x, i32 0 + %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false) + %vecinit = insertelement <2 x i8> undef, i8 %call.i, i32 0 + %1 = extractelement <2 x i8> %x, i32 1 + %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false) + %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1 + ret <2 x i8> %vecinit2 +} + + +define <2 x i8> @cltz_test_poison(<2 x i8> %x) #0 { +; CHECK-LABEL: @cltz_test_poison( +; CHECK: [[VEC:%.*]] = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %{{.*}}, i1 false) +; CHECK-NEXT: ret 
<2 x i8> [[VEC]]
+;
+entry:
+  %0 = extractelement <2 x i8> %x, i32 0
+  %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false)
+  %vecinit = insertelement <2 x i8> poison, i8 %call.i, i32 0
+  %1 = extractelement <2 x i8> %x, i32 1
+  %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false)
+  %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1
+  ret <2 x i8> %vecinit2
+}
+
+declare i8 @llvm.ctlz.i8(i8, i1) #3
+
+attributes #0 = { alwaysinline nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }

From 9520bf8499a81ba464224957ed798de48d184f8d Mon Sep 17 00:00:00 2001
From: Thomas Raoux
Date: Wed, 6 Sep 2023 11:33:10 -0700
Subject: [PATCH 02/11] Scalarize i16x2 ops when not natively supported
 instead of expanding

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp   | 58 ++++++++++++++-----
 llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 30 ++++------
 2 files changed, 55 insertions(+), 33 deletions(-)
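Note for reviewers: "scalarize" here means that each v2i16 operation with no
native i16x2 instruction is rebuilt as two i16 operations whose results are
repacked into a single 32-bit register, instead of letting the generic
legalizer expand it. The standalone C++ sketch below models that lowering on
plain integers; pack() and scalarized_add() are names invented for this note,
not backend or PTX APIs.

    // Minimal model of scalarizing a v2i16 add (illustration only).
    #include <cstdint>
    #include <cstdio>

    // Matches the PTX packing "mov.b32 r, {lo, hi}": lane 0 in the low half.
    static uint32_t pack(uint16_t lo, uint16_t hi) {
      return uint32_t(lo) | (uint32_t(hi) << 16);
    }

    static uint32_t scalarized_add(uint32_t a, uint32_t b) {
      uint16_t a0 = uint16_t(a), a1 = uint16_t(a >> 16); // extract both lanes
      uint16_t b0 = uint16_t(b), b1 = uint16_t(b >> 16);
      // One scalar add per lane, then rebuild the packed value.
      return pack(uint16_t(a0 + b0), uint16_t(a1 + b1));
    }

    int main() {
      // pack(1, 2) == 131073, the immediate the i16x2 tests check for.
      uint32_t r = scalarized_add(pack(1, 2), pack(3, 4));
      printf("%u %u\n", unsigned(r & 0xFFFFu), unsigned(r >> 16)); // "4 6"
      return 0;
    }

With native sm_90 support, the same operation is instead a single add.s16x2
on the 32-bit register, which is what the Legal action selects.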
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 3e36a72e1a53b..170a6b58df69c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -621,21 +621,21 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
     setOperationAction(ISD::CTLZ, Ty, Legal);
   }
 
-  setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Expand);
-  setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Expand);
-  setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Expand);
-  setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Expand);
-  setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Expand);
+  setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
+  setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
+  setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
+  setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
+  setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
   setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
   setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
 
-  setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Expand);
-  setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Expand);
-  setI16x2OperationAction(ISD::AND, MVT::v2i16, Legal, Expand);
-  setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Expand);
-  setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Expand);
-  setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Expand);
-  setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Expand);
+  setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
+  setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
+  setI16x2OperationAction(ISD::AND, MVT::v2i16, Legal, Custom);
+  setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
+  setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
+  setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
+  setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
 
   setOperationAction(ISD::ADDC, MVT::i32, Legal);
   setOperationAction(ISD::ADDE, MVT::i32, Legal);
@@ -2418,7 +2418,26 @@ SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
   return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
 }
 
-
+static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  if (Op.getValueType() != MVT::v2i16)
+    return Op;
+  EVT EltVT = Op.getValueType().getVectorElementType();
+  SmallVector<SDValue> VecElements;
+  for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
+    SmallVector<SDValue> ScalarArgs;
+    for (int J = 0, NumOp = Op.getNumOperands(); J < NumOp; J++) {
+      SDValue Ext =
+          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op->getOperand(J),
+                      DAG.getIntPtrConstant(I, DL));
+      ScalarArgs.push_back(Ext);
+    }
+    VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
+  }
+  SDValue V =
+      DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
+  return V;
+}
 
 SDValue
 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -2456,6 +2475,19 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerVAARG(Op, DAG);
   case ISD::VASTART:
     return LowerVASTART(Op, DAG);
+  case ISD::ABS:
+  case ISD::SMIN:
+  case ISD::SMAX:
+  case ISD::UMIN:
+  case ISD::UMAX:
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::AND:
+  case ISD::MUL:
+  case ISD::SHL:
+  case ISD::SREM:
+  case ISD::UREM:
+    return LowerVectorArith(Op, DAG);
   default:
     llvm_unreachable("Custom lowering not defined for operation");
   }
diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
index 265e2b99af215..8a8d7d913780c 100644
--- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
@@ -9,13 +9,11 @@
 ; RUN: %}
 ; ## No support for i16x2 instructions
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
-; RUN:   -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
-; RUN:   -verify-machineinstrs \
+; RUN:   -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
 ; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOI16x2 %s
 ; RUN: %if ptxas %{ \
 ; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
-; RUN:   -O0 -disable-post-ra -frame-pointer=all --nvptx-no-f16-math \
-; RUN:   -verify-machineinstrs \
+; RUN:   -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
 ; RUN:   | %ptxas-verify -arch=sm_53 \
 ; RUN: %}
 
@@ -148,10 +146,8 @@ define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 {
 ;
 ; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
 ; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
-; CHECK-NOI16x2-DAG: setp.gt.s16 [[P0:%p[0-9]+]], [[RS0]], [[RS2]];
-; CHECK-NOI16x2-DAG: setp.gt.s16 [[P1:%p[0-9]+]], [[RS1]], [[RS3]];
-; CHECK-NOI16x2-DAG: selp.b16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]], [[P0]];
-; CHECK-NOI16x2-DAG: selp.b16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]], [[P1]];
+; CHECK-NOI16x2-DAG: max.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
+; CHECK-NOI16x2-DAG: max.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]];
 ; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
 ;
 ; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
@@ -170,10 +166,8 @@ define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 {
 ;
 ; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
 ; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
-; CHECK-NOI16x2-DAG: setp.gt.u16 [[P0:%p[0-9]+]], [[RS0]], [[RS2]];
-; CHECK-NOI16x2-DAG: setp.gt.u16 [[P1:%p[0-9]+]], [[RS1]], [[RS3]];
-; CHECK-NOI16x2-DAG: selp.b16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]], [[P0]];
-; CHECK-NOI16x2-DAG: selp.b16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]], [[P1]];
+; CHECK-NOI16x2-DAG: max.u16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
+; CHECK-NOI16x2-DAG: max.u16 [[RS5:%rs[0-9]+]],
[[RS1]], [[RS3]]; ; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; ; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; @@ -192,10 +186,8 @@ define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 { ; ; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; ; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; -; CHECK-NOI16x2-DAG: setp.le.s16 [[P0:%p[0-9]+]], [[RS0]], [[RS2]]; -; CHECK-NOI16x2-DAG: setp.le.s16 [[P1:%p[0-9]+]], [[RS1]], [[RS3]]; -; CHECK-NOI16x2-DAG: selp.b16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]], [[P0]]; -; CHECK-NOI16x2-DAG: selp.b16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]], [[P1]]; +; CHECK-NOI16x2-DAG: min.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; +; CHECK-NOI16x2-DAG: min.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; ; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; ; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; @@ -214,10 +206,8 @@ define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 { ; ; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; ; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; -; CHECK-NOI16x2-DAG: setp.le.u16 [[P0:%p[0-9]+]], [[RS0]], [[RS2]]; -; CHECK-NOI16x2-DAG: setp.le.u16 [[P1:%p[0-9]+]], [[RS1]], [[RS3]]; -; CHECK-NOI16x2-DAG: selp.b16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]], [[P0]]; -; CHECK-NOI16x2-DAG: selp.b16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]], [[P1]]; +; CHECK-NOI16x2-DAG: min.u16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; +; CHECK-NOI16x2-DAG: min.u16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; ; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; ; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; From 1a4acc7f7c788599dbce99a6d511361c6a3115c2 Mon Sep 17 00:00:00 2001 From: Thomas Raoux Date: Wed, 6 Sep 2023 14:13:56 -0700 Subject: [PATCH 03/11] Address review comments --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 6 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 31 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 1 + llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 4 + llvm/lib/Target/NVPTX/NVPTXUtilities.h | 3 + llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 592 +++++++++--------- 6 files changed, 319 insertions(+), 318 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 0091acc456eb4..72c1b82f0fe4e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1681,10 +1681,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { MVT ScalarVT = SimpleVT.getScalarType(); unsigned toTypeWidth = ScalarVT.getSizeInBits(); if (SimpleVT.isVector()) { - assert((StoreVT == MVT::v2f16 || StoreVT == MVT::v2bf16 || - StoreVT == MVT::v2i16) && - "Unexpected vector type"); - // v2f16 is stored using st.b32 + assert(Isv2x16VT(StoreVT) && "Unexpected vector type"); + // v2x16 is stored using st.b32 toTypeWidth = 32; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 170a6b58df69c..2be7aa2f2efbf 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -150,10 +150,6 @@ static bool IsPTXVectorType(MVT VT) { } } -static bool Isv2f16Orv2bf16Orv2i16Type(EVT VT) { - return (VT == MVT::v2f16 || VT == MVT::v2bf16 || VT == MVT::v2i16); -} - static bool Is16bitsType(MVT VT) { return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 || VT.SimpleTy == MVT::i16); @@ -1372,7 +1368,7 @@ 
NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const { if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1) return TypeSplitVector; - if (Isv2f16Orv2bf16Orv2i16Type(VT)) + if (Isv2x16VT(VT)) return TypeLegal; return TargetLoweringBase::getPreferredVectorAction(VT); } @@ -2153,7 +2149,7 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op->getValueType(0); - if (!(Isv2f16Orv2bf16Orv2i16Type(VT))) + if (!(Isv2x16VT(VT))) return Op; APInt E0; APInt E1; @@ -2192,8 +2188,7 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // Extract individual elements and select one of them. SDValue Vector = Op->getOperand(0); EVT VectorVT = Vector.getValueType(); - assert((VectorVT == MVT::v2f16 || VectorVT == MVT::v2i16) && - "Unexpected vector type."); + assert(Isv2x16VT(VectorVT) && "Unexpected vector type."); EVT EltVT = VectorVT.getVectorElementType(); SDLoc dl(Op.getNode()); @@ -2571,9 +2566,9 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType() == MVT::i1) return LowerLOADi1(Op, DAG); - // v2f16 is legal, so we can't rely on legalizer to handle unaligned - // loads and have to handle it here. - if (Isv2f16Orv2bf16Orv2i16Type(Op.getValueType())) { + // v2f16/v2bf16/v2i16 are legal, so we can't rely on legalizer to handle + // unaligned loads and have to handle it here. + if (Isv2x16VT(Op.getValueType())) { LoadSDNode *Load = cast(Op); EVT MemVT = Load->getMemoryVT(); if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), @@ -2618,13 +2613,13 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // v2f16 is legal, so we can't rely on legalizer to handle unaligned // stores and have to handle it here. - if (Isv2f16Orv2bf16Orv2i16Type(VT) && + if (Isv2x16VT(VT) && !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), VT, *Store->getMemOperand())) return expandUnalignedStore(Store, DAG); // v2f16, v2bf16 and v2i16 don't need special handling. - if (VT == MVT::v2f16 || VT == MVT::v2bf16 || VT == MVT::v2i16) + if (Isv2x16VT(VT)) return SDValue(); if (VT.isVector()) @@ -2896,7 +2891,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( EVT LoadVT = EltVT; if (EltVT == MVT::i1) LoadVT = MVT::i8; - else if (Isv2f16Orv2bf16Orv2i16Type(EltVT)) + else if (Isv2x16VT(EltVT)) // getLoad needs a vector type, but it can't handle // vectors which contain v2f16 or v2bf16 elements. So we must load // using i32 here and then bitcast back. @@ -2922,7 +2917,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( if (EltVT == MVT::i1) Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); // v2f16 was loaded as an i32. Now we must bitcast it back. - else if (Isv2f16Orv2bf16Orv2i16Type(EltVT)) + else if (Isv2x16VT(EltVT)) Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt); // If a promoted integer type is used, truncate down to the original @@ -5335,7 +5330,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, unsigned Opcode = 0; SDVTList LdResVTs; - bool LoadF16x2 = false; + bool Load16x2 = false; switch (NumElts) { default: @@ -5355,7 +5350,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, // instruction. Instead, we split the vector into v2f16 chunks and // load them with ld.v4.b32. 
assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type."); - LoadF16x2 = true; + Load16x2 = true; Opcode = NVPTXISD::LoadV4; EVT VVT; if (EltVT == MVT::f16) @@ -5382,7 +5377,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, LD->getMemOperand()); SmallVector ScalarRes; - if (LoadF16x2) { + if (Load16x2) { // Split v2f16 subvectors back into individual elements. NumElts /= 2; for (unsigned i = 0; i < NumElts; ++i) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index ccd80359bf80b..87cd52c954a7b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -617,6 +617,7 @@ class NVPTXTargetLowering : public TargetLowering { Align getArgumentAlignment(SDValue Callee, const CallBase *CB, Type *Ty, unsigned Idx, const DataLayout &DL) const; }; + } // namespace llvm #endif diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 988910810da65..c3737f9fcca82 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -348,4 +348,8 @@ bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM) { !isKernelFunction(*F); } +bool Isv2x16VT(EVT VT) { + return (VT == MVT::v2f16 || VT == MVT::v2bf16 || VT == MVT::v2i16); +} + } // namespace llvm diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index f980ea3dec0b8..521f8198911f2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H #define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H +#include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IntrinsicInst.h" @@ -74,6 +75,8 @@ inline unsigned promoteScalarArgumentSize(unsigned size) { } bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM); + +bool Isv2x16VT(EVT VT); } #endif diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll index 8a8d7d913780c..5f27594ef29d7 100644 --- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll @@ -1,7 +1,7 @@ ; ## Support i16x2 instructions ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 -asm-verbose=false \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ -; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-I16x2 %s +; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes COMMON,I16x2 %s ; RUN: %if ptxas %{ \ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -asm-verbose=false \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ @@ -10,7 +10,7 @@ ; ## No support for i16x2 instructions ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ -; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes CHECK,CHECK-NOI16x2 %s +; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes COMMON,NO-I16x2 %s ; RUN: %if ptxas %{ \ ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ @@ -19,270 +19,270 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" -; CHECK-LABEL: test_ret_const( -; CHECK: mov.u32 [[R:%r[0-9+]]], 131073; -; CHECK: 
st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; +; COMMON-LABEL: test_ret_const( +; COMMON: mov.u32 [[R:%r[0-9+]]], 131073; +; COMMON: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: ret; define <2 x i16> @test_ret_const() #0 { ret <2 x i16> } -; CHECK-LABEL: test_extract_0( -; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_extract_0_param_0]; -; CHECK: mov.b32 {[[RS:%rs[0-9]+]], tmp}, [[A]]; -; CHECK: cvt.u32.u16 [[R:%r[0-9]+]], [[RS]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; +; COMMON-LABEL: test_extract_0( +; COMMON: ld.param.u32 [[A:%r[0-9]+]], [test_extract_0_param_0]; +; COMMON: mov.b32 {[[RS:%rs[0-9]+]], tmp}, [[A]]; +; COMMON: cvt.u32.u16 [[R:%r[0-9]+]], [[RS]]; +; COMMON: st.param.b32 [func_retval0+0], [[R]]; +; COMMON: ret; define i16 @test_extract_0(<2 x i16> %a) #0 { %e = extractelement <2 x i16> %a, i32 0 ret i16 %e } -; CHECK-LABEL: test_extract_1( -; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_extract_1_param_0]; -; CHECK: mov.b32 {tmp, [[RS:%rs[0-9]+]]}, [[A]]; -; CHECK: cvt.u32.u16 [[R:%r[0-9]+]], [[RS]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; +; COMMON-LABEL: test_extract_1( +; COMMON: ld.param.u32 [[A:%r[0-9]+]], [test_extract_1_param_0]; +; COMMON: mov.b32 {tmp, [[RS:%rs[0-9]+]]}, [[A]]; +; COMMON: cvt.u32.u16 [[R:%r[0-9]+]], [[RS]]; +; COMMON: st.param.b32 [func_retval0+0], [[R]]; +; COMMON: ret; define i16 @test_extract_1(<2 x i16> %a) #0 { %e = extractelement <2 x i16> %a, i32 1 ret i16 %e } -; CHECK-LABEL: test_extract_i( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_extract_i_param_0]; -; CHECK-DAG: ld.param.u64 [[IDX:%rd[0-9]+]], [test_extract_i_param_1]; -; CHECK-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0; -; CHECK-DAG: mov.b32 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[A]]; -; CHECK: selp.b16 [[RS:%rs[0-9]+]], [[E0]], [[E1]], [[PRED]]; -; CHECK: cvt.u32.u16 [[R:%r[0-9]+]], [[RS]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; +; COMMON-LABEL: test_extract_i( +; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_extract_i_param_0]; +; COMMON-DAG: ld.param.u64 [[IDX:%rd[0-9]+]], [test_extract_i_param_1]; +; COMMON-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0; +; COMMON-DAG: mov.b32 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[A]]; +; COMMON: selp.b16 [[RS:%rs[0-9]+]], [[E0]], [[E1]], [[PRED]]; +; COMMON: cvt.u32.u16 [[R:%r[0-9]+]], [[RS]]; +; COMMON: st.param.b32 [func_retval0+0], [[R]]; +; COMMON: ret; define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 { %e = extractelement <2 x i16> %a, i64 %idx ret i16 %e } -; CHECK-LABEL: test_add( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_add_param_0]; -; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_add_param_1]; +; COMMON-LABEL: test_add( +; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_add_param_0]; +; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_add_param_1]; ; -; CHECK-I16x2-NEXT: add.s16x2 [[R:%r[0-9]+]], [[A]], [[B]]; +; I16x2-NEXT: add.s16x2 [[R:%r[0-9]+]], [[A]], [[B]]; ; -; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; -; CHECK-NOI16x2-DAG: add.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; -; CHECK-NOI16x2-DAG: add.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; -; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; +; NO-I16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; +; NO-I16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; +; NO-I16x2-DAG: add.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; +; 
NO-I16x2-DAG: add.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; +; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; +; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: ret; define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 { %r = add <2 x i16> %a, %b ret <2 x i16> %r } ; Check that we can lower add with immediate arguments. -; CHECK-LABEL: test_add_imm_0( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_add_imm_0_param_0]; +; COMMON-LABEL: test_add_imm_0( +; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_add_imm_0_param_0]; ; -; CHECK-I16x2: mov.u32 [[I:%r[0-9+]]], 131073; -; CHECK-I16x2: add.s16x2 [[R:%r[0-9]+]], [[A]], [[I]]; +; I16x2: mov.u32 [[I:%r[0-9+]]], 131073; +; I16x2: add.s16x2 [[R:%r[0-9]+]], [[A]], [[I]]; ; -; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; CHECK-NOI16x2-DAG: add.s16 [[RS2:%rs[0-9]+]], [[RS0]], 1; -; CHECK-NOI16x2-DAG: add.s16 [[RS3:%rs[0-9]+]], [[RS1]], 2; -; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS2]], [[RS3]]}; +; NO-I16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; +; NO-I16x2-DAG: add.s16 [[RS2:%rs[0-9]+]], [[RS0]], 1; +; NO-I16x2-DAG: add.s16 [[RS3:%rs[0-9]+]], [[RS1]], 2; +; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS2]], [[RS3]]}; ; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; +; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: ret; define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 { %r = add <2 x i16> , %a ret <2 x i16> %r } -; CHECK-LABEL: test_add_imm_1( -; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_add_imm_1_param_0]; +; COMMON-LABEL: test_add_imm_1( +; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_add_imm_1_param_0]; ; -; CHECK-I16x2: mov.u32 [[I:%r[0-9+]]], 131073; -; CHECK-I16x2: add.s16x2 [[R:%r[0-9]+]], [[A]], [[I]]; +; I16x2: mov.u32 [[I:%r[0-9+]]], 131073; +; I16x2: add.s16x2 [[R:%r[0-9]+]], [[A]], [[I]]; ; -; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; CHECK-NOI16x2-DAG: add.s16 [[RS2:%rs[0-9]+]], [[RS0]], 1; -; CHECK-NOI16x2-DAG: add.s16 [[RS3:%rs[0-9]+]], [[RS1]], 2; -; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS2]], [[RS3]]}; +; NO-I16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; +; NO-I16x2-DAG: add.s16 [[RS2:%rs[0-9]+]], [[RS0]], 1; +; NO-I16x2-DAG: add.s16 [[RS3:%rs[0-9]+]], [[RS1]], 2; +; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS2]], [[RS3]]}; ; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; +; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: ret; define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 { %r = add <2 x i16> %a, ret <2 x i16> %r } -; CHECK-LABEL: test_sub( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_sub_param_0]; +; COMMON-LABEL: test_sub( +; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_sub_param_0]; ; -; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_sub_param_1]; -; CHECK-I16x2: sub.s16x2 [[R:%r[0-9]+]], [[A]], [[B]]; +; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_sub_param_1]; +; I16x2: sub.s16x2 [[R:%r[0-9]+]], [[A]], [[B]]; ; -; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; -; CHECK-NOI16x2-DAG: sub.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; -; CHECK-NOI16x2-DAG: sub.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; -; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; +; NO-I16x2-DAG: mov.b32 
{[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; +; NO-I16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; +; NO-I16x2-DAG: sub.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; +; NO-I16x2-DAG: sub.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; +; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; +; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: ret; define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 { %r = sub <2 x i16> %a, %b ret <2 x i16> %r } -; CHECK-LABEL: test_smax( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_smax_param_0]; +; COMMON-LABEL: test_smax( +; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_smax_param_0]; ; -; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_smax_param_1]; -; CHECK-I16x2: max.s16x2 [[R:%r[0-9]+]], [[A]], [[B]]; +; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_smax_param_1]; +; I16x2: max.s16x2 [[R:%r[0-9]+]], [[A]], [[B]]; ; -; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; -; CHECK-NOI16x2-DAG: max.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; -; CHECK-NOI16x2-DAG: max.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; -; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; +; NO-I16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; +; NO-I16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; +; NO-I16x2-DAG: max.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; +; NO-I16x2-DAG: max.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; +; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; +; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: ret; define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 { %cmp = icmp sgt <2 x i16> %a, %b %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b ret <2 x i16> %r } -; CHECK-LABEL: test_umax( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_umax_param_0]; +; COMMON-LABEL: test_umax( +; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_umax_param_0]; ; -; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_umax_param_1]; -; CHECK-I16x2: max.u16x2 [[R:%r[0-9]+]], [[A]], [[B]]; +; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_umax_param_1]; +; I16x2: max.u16x2 [[R:%r[0-9]+]], [[A]], [[B]]; ; -; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; -; CHECK-NOI16x2-DAG: max.u16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; -; CHECK-NOI16x2-DAG: max.u16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; -; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; +; NO-I16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; +; NO-I16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; +; NO-I16x2-DAG: max.u16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; +; NO-I16x2-DAG: max.u16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; +; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; +; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: ret; define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 { %cmp = icmp ugt <2 x i16> %a, %b %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b ret <2 x i16> %r } -; CHECK-LABEL: test_smin( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_smin_param_0]; +; COMMON-LABEL: test_smin( +; COMMON-DAG: ld.param.u32 
[[A:%r[0-9]+]], [test_smin_param_0]; ; -; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_smin_param_1]; -; CHECK-I16x2: min.s16x2 [[R:%r[0-9]+]], [[A]], [[B]]; +; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_smin_param_1]; +; I16x2: min.s16x2 [[R:%r[0-9]+]], [[A]], [[B]]; ; -; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; -; CHECK-NOI16x2-DAG: min.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; -; CHECK-NOI16x2-DAG: min.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; -; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; +; NO-I16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; +; NO-I16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; +; NO-I16x2-DAG: min.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; +; NO-I16x2-DAG: min.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; +; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; +; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: ret; define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 { %cmp = icmp sle <2 x i16> %a, %b %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b ret <2 x i16> %r } -; CHECK-LABEL: test_umin( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_umin_param_0]; +; COMMON-LABEL: test_umin( +; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_umin_param_0]; ; -; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_umin_param_1]; -; CHECK-I16x2: min.u16x2 [[R:%r[0-9]+]], [[A]], [[B]]; +; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_umin_param_1]; +; I16x2: min.u16x2 [[R:%r[0-9]+]], [[A]], [[B]]; ; -; CHECK-NOI16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; CHECK-NOI16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; -; CHECK-NOI16x2-DAG: min.u16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; -; CHECK-NOI16x2-DAG: min.u16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; -; CHECK-NOI16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; +; NO-I16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; +; NO-I16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; +; NO-I16x2-DAG: min.u16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; +; NO-I16x2-DAG: min.u16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; +; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; +; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: ret; define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 { %cmp = icmp ule <2 x i16> %a, %b %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b ret <2 x i16> %r } -; CHECK-LABEL: test_mul( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_mul_param_0]; -; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_mul_param_1]; +; COMMON-LABEL: test_mul( +; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_mul_param_0]; +; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_mul_param_1]; ; -; CHECK-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; -; CHECK-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; -; CHECK-DAG: mul.lo.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; -; CHECK-DAG: mul.lo.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; -; CHECK-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; +; COMMON-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]]; +; COMMON-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]]; +; COMMON-DAG: mul.lo.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]]; +; COMMON-DAG: 
mul.lo.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]]; +; COMMON-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]}; ; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; +; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: ret; define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 { %r = mul <2 x i16> %a, %b ret <2 x i16> %r } -; CHECK-LABEL: .func test_ldst_v2i16( -; CHECK-DAG: ld.param.u64 [[A:%rd[0-9]+]], [test_ldst_v2i16_param_0]; -; CHECK-DAG: ld.param.u64 [[B:%rd[0-9]+]], [test_ldst_v2i16_param_1]; -; CHECK-DAG: ld.u32 [[E:%r[0-9]+]], [[[A]]]; -; CHECK-DAG: st.u32 [[[B]]], [[E]]; -; CHECK: ret; +; COMMON-LABEL: .func test_ldst_v2i16( +; COMMON-DAG: ld.param.u64 [[A:%rd[0-9]+]], [test_ldst_v2i16_param_0]; +; COMMON-DAG: ld.param.u64 [[B:%rd[0-9]+]], [test_ldst_v2i16_param_1]; +; COMMON-DAG: ld.u32 [[E:%r[0-9]+]], [[[A]]]; +; COMMON-DAG: st.u32 [[[B]]], [[E]]; +; COMMON: ret; define void @test_ldst_v2i16(ptr %a, ptr %b) { %t1 = load <2 x i16>, ptr %a store <2 x i16> %t1, ptr %b, align 16 ret void } -; CHECK-LABEL: .func test_ldst_v3i16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3i16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3i16_param_1]; +; COMMON-LABEL: .func test_ldst_v3i16( +; COMMON-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3i16_param_0]; +; COMMON-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3i16_param_1]; ; -- v3 is inconvenient to capture as it's lowered as ld.b64 + fair ; number of bitshifting instructions that may change at llvm's whim. ; So we only verify that we only issue correct number of writes using ; correct offset, but not the values we write. -; CHECK-DAG: ld.u64 -; CHECK-DAG: st.u32 [%[[B]]], -; CHECK-DAG: st.u16 [%[[B]]+4], -; CHECK: ret; +; COMMON-DAG: ld.u64 +; COMMON-DAG: st.u32 [%[[B]]], +; COMMON-DAG: st.u16 [%[[B]]+4], +; COMMON: ret; define void @test_ldst_v3i16(ptr %a, ptr %b) { %t1 = load <3 x i16>, ptr %a store <3 x i16> %t1, ptr %b, align 16 ret void } -; CHECK-LABEL: .func test_ldst_v4i16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v4i16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v4i16_param_1]; -; CHECK-DAG: ld.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [%[[A]]]; -; CHECK-DAG: st.v4.u16 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: ret; +; COMMON-LABEL: .func test_ldst_v4i16( +; COMMON-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v4i16_param_0]; +; COMMON-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v4i16_param_1]; +; COMMON-DAG: ld.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [%[[A]]]; +; COMMON-DAG: st.v4.u16 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; COMMON: ret; define void @test_ldst_v4i16(ptr %a, ptr %b) { %t1 = load <4 x i16>, ptr %a store <4 x i16> %t1, ptr %b, align 16 ret void } -; CHECK-LABEL: .func test_ldst_v8i16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v8i16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v8i16_param_1]; -; CHECK-DAG: ld.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]]; -; CHECK-DAG: st.v4.b32 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: ret; +; COMMON-LABEL: .func test_ldst_v8i16( +; COMMON-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v8i16_param_0]; +; COMMON-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v8i16_param_1]; +; COMMON-DAG: ld.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]]; +; COMMON-DAG: 
st.v4.b32 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; COMMON: ret; define void @test_ldst_v8i16(ptr %a, ptr %b) { %t1 = load <8 x i16>, ptr %a store <8 x i16> %t1, ptr %b, align 16 @@ -291,117 +291,117 @@ define void @test_ldst_v8i16(ptr %a, ptr %b) { declare <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b) #0 -; CHECK-LABEL: test_call( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_call_param_0]; -; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_call_param_1]; -; CHECK: { -; CHECK-DAG: .param .align 4 .b8 param0[4]; -; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0+0], [[A]]; -; CHECK-DAG: st.param.b32 [param1+0], [[B]]; -; CHECK-DAG: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; +; COMMON-LABEL: test_call( +; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_call_param_0]; +; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_call_param_1]; +; COMMON: { +; COMMON-DAG: .param .align 4 .b8 param0[4]; +; COMMON-DAG: .param .align 4 .b8 param1[4]; +; COMMON-DAG: st.param.b32 [param0+0], [[A]]; +; COMMON-DAG: st.param.b32 [param1+0], [[B]]; +; COMMON-DAG: .param .align 4 .b8 retval0[4]; +; COMMON: call.uni (retval0), +; COMMON-NEXT: test_callee, +; COMMON: ); +; COMMON-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; +; COMMON-NEXT: } +; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: ret; define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 { %r = call <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b) ret <2 x i16> %r } -; CHECK-LABEL: test_call_flipped( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_call_flipped_param_0]; -; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_call_flipped_param_1]; -; CHECK: { -; CHECK-DAG: .param .align 4 .b8 param0[4]; -; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0+0], [[B]]; -; CHECK-DAG: st.param.b32 [param1+0], [[A]]; -; CHECK-DAG: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; +; COMMON-LABEL: test_call_flipped( +; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_call_flipped_param_0]; +; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_call_flipped_param_1]; +; COMMON: { +; COMMON-DAG: .param .align 4 .b8 param0[4]; +; COMMON-DAG: .param .align 4 .b8 param1[4]; +; COMMON-DAG: st.param.b32 [param0+0], [[B]]; +; COMMON-DAG: st.param.b32 [param1+0], [[A]]; +; COMMON-DAG: .param .align 4 .b8 retval0[4]; +; COMMON: call.uni (retval0), +; COMMON-NEXT: test_callee, +; COMMON: ); +; COMMON-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; +; COMMON-NEXT: } +; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: ret; define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 { %r = call <2 x i16> @test_callee(<2 x i16> %b, <2 x i16> %a) ret <2 x i16> %r } -; CHECK-LABEL: test_tailcall_flipped( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_tailcall_flipped_param_0]; -; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_tailcall_flipped_param_1]; -; CHECK: { -; CHECK-DAG: .param .align 4 .b8 param0[4]; -; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0+0], [[B]]; -; CHECK-DAG: st.param.b32 [param1+0], [[A]]; -; CHECK-DAG: .param .align 4 
.b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; +; COMMON-LABEL: test_tailcall_flipped( +; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_tailcall_flipped_param_0]; +; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_tailcall_flipped_param_1]; +; COMMON: { +; COMMON-DAG: .param .align 4 .b8 param0[4]; +; COMMON-DAG: .param .align 4 .b8 param1[4]; +; COMMON-DAG: st.param.b32 [param0+0], [[B]]; +; COMMON-DAG: st.param.b32 [param1+0], [[A]]; +; COMMON-DAG: .param .align 4 .b8 retval0[4]; +; COMMON: call.uni (retval0), +; COMMON-NEXT: test_callee, +; COMMON: ); +; COMMON-NEXT: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; +; COMMON-NEXT: } +; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: ret; define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 { %r = tail call <2 x i16> @test_callee(<2 x i16> %b, <2 x i16> %a) ret <2 x i16> %r } -; CHECK-LABEL: test_select( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_select_param_0]; -; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_select_param_1]; -; CHECK-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2] -; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; -; CHECK-NEXT: selp.b32 [[R:%r[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; +; COMMON-LABEL: test_select( +; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_select_param_0]; +; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_select_param_1]; +; COMMON-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2] +; COMMON-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; +; COMMON-NEXT: selp.b32 [[R:%r[0-9]+]], [[A]], [[B]], [[PRED]]; +; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: ret; define <2 x i16> @test_select(<2 x i16> %a, <2 x i16> %b, i1 zeroext %c) #0 { %r = select i1 %c, <2 x i16> %a, <2 x i16> %b ret <2 x i16> %r } -; CHECK-LABEL: test_select_cc( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_select_cc_param_0]; -; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_select_cc_param_1]; -; CHECK-DAG: ld.param.u32 [[C:%r[0-9]+]], [test_select_cc_param_2]; -; CHECK-DAG: ld.param.u32 [[D:%r[0-9]+]], [test_select_cc_param_3]; -; CHECK-DAG: mov.b32 {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]] -; CHECK-DAG: mov.b32 {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]] -; CHECK-DAG: setp.ne.s16 [[P0:%p[0-9]+]], [[C0]], [[D0]] -; CHECK-DAG: setp.ne.s16 [[P1:%p[0-9]+]], [[C1]], [[D1]] -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]]; -; CHECK-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; +; COMMON-LABEL: test_select_cc( +; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_select_cc_param_0]; +; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_select_cc_param_1]; +; COMMON-DAG: ld.param.u32 [[C:%r[0-9]+]], [test_select_cc_param_2]; +; COMMON-DAG: ld.param.u32 [[D:%r[0-9]+]], [test_select_cc_param_3]; +; COMMON-DAG: mov.b32 {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]] +; COMMON-DAG: mov.b32 {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]] +; COMMON-DAG: setp.ne.s16 [[P0:%p[0-9]+]], [[C0]], [[D0]] +; COMMON-DAG: setp.ne.s16 [[P1:%p[0-9]+]], [[C1]], 
[[D1]] +; COMMON-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] +; COMMON-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] +; COMMON-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]]; +; COMMON-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]]; +; COMMON: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} +; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: ret; define <2 x i16> @test_select_cc(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) #0 { %cc = icmp ne <2 x i16> %c, %d %r = select <2 x i1> %cc, <2 x i16> %a, <2 x i16> %b ret <2 x i16> %r } -; CHECK-LABEL: test_select_cc_i32_i16( -; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_select_cc_i32_i16_param_0]; -; CHECK-DAG: ld.param.v2.u32 {[[B0:%r[0-9]+]], [[B1:%r[0-9]+]]}, [test_select_cc_i32_i16_param_1]; -; CHECK-DAG: ld.param.u32 [[C:%r[0-9]+]], [test_select_cc_i32_i16_param_2]; -; CHECK-DAG: ld.param.u32 [[D:%r[0-9]+]], [test_select_cc_i32_i16_param_3]; -; CHECK-DAG: mov.b32 {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]] -; CHECK-DAG: mov.b32 {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]] -; CHECK-DAG: setp.ne.s16 [[P0:%p[0-9]+]], [[C0]], [[D0]] -; CHECK-DAG: setp.ne.s16 [[P1:%p[0-9]+]], [[C1]], [[D1]] -; CHECK-DAG: selp.b32 [[R0:%r[0-9]+]], [[A0]], [[B0]], [[P0]]; -; CHECK-DAG: selp.b32 [[R1:%r[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; +; COMMON-LABEL: test_select_cc_i32_i16( +; COMMON-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_select_cc_i32_i16_param_0]; +; COMMON-DAG: ld.param.v2.u32 {[[B0:%r[0-9]+]], [[B1:%r[0-9]+]]}, [test_select_cc_i32_i16_param_1]; +; COMMON-DAG: ld.param.u32 [[C:%r[0-9]+]], [test_select_cc_i32_i16_param_2]; +; COMMON-DAG: ld.param.u32 [[D:%r[0-9]+]], [test_select_cc_i32_i16_param_3]; +; COMMON-DAG: mov.b32 {[[C0:%rs[0-9]+]], [[C1:%rs[0-9]+]]}, [[C]] +; COMMON-DAG: mov.b32 {[[D0:%rs[0-9]+]], [[D1:%rs[0-9]+]]}, [[D]] +; COMMON-DAG: setp.ne.s16 [[P0:%p[0-9]+]], [[C0]], [[D0]] +; COMMON-DAG: setp.ne.s16 [[P1:%p[0-9]+]], [[C1]], [[D1]] +; COMMON-DAG: selp.b32 [[R0:%r[0-9]+]], [[A0]], [[B0]], [[P0]]; +; COMMON-DAG: selp.b32 [[R1:%r[0-9]+]], [[A1]], [[B1]], [[P1]]; +; COMMON-NEXT: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}; +; COMMON-NEXT: ret; define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b, <2 x i16> %c, <2 x i16> %d) #0 { %cc = icmp ne <2 x i16> %c, %d @@ -409,20 +409,20 @@ define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b, ret <2 x i32> %r } -; CHECK-LABEL: test_select_cc_i16_i32( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_select_cc_i16_i32_param_0]; -; CHECK-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_select_cc_i16_i32_param_1]; -; CHECK-DAG: ld.param.v2.u32 {[[C0:%r[0-9]+]], [[C1:%r[0-9]+]]}, [test_select_cc_i16_i32_param_2]; -; CHECK-DAG: ld.param.v2.u32 {[[D0:%r[0-9]+]], [[D1:%r[0-9]+]]}, [test_select_cc_i16_i32_param_3]; -; CHECK-DAG: setp.ne.s32 [[P0:%p[0-9]+]], [[C0]], [[D0]] -; CHECK-DAG: setp.ne.s32 [[P1:%p[0-9]+]], [[C1]], [[D1]] -; CHECK-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] -; CHECK-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]]; -; CHECK-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; +; COMMON-LABEL: test_select_cc_i16_i32( +; COMMON-DAG: ld.param.u32 [[A:%r[0-9]+]], 
[test_select_cc_i16_i32_param_0]; +; COMMON-DAG: ld.param.u32 [[B:%r[0-9]+]], [test_select_cc_i16_i32_param_1]; +; COMMON-DAG: ld.param.v2.u32 {[[C0:%r[0-9]+]], [[C1:%r[0-9]+]]}, [test_select_cc_i16_i32_param_2]; +; COMMON-DAG: ld.param.v2.u32 {[[D0:%r[0-9]+]], [[D1:%r[0-9]+]]}, [test_select_cc_i16_i32_param_3]; +; COMMON-DAG: setp.ne.s32 [[P0:%p[0-9]+]], [[C0]], [[D0]] +; COMMON-DAG: setp.ne.s32 [[P1:%p[0-9]+]], [[C1]], [[D1]] +; COMMON-DAG: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] +; COMMON-DAG: mov.b32 {[[B0:%rs[0-9]+]], [[B1:%rs[0-9]+]]}, [[B]] +; COMMON-DAG: selp.b16 [[R0:%rs[0-9]+]], [[A0]], [[B0]], [[P0]]; +; COMMON-DAG: selp.b16 [[R1:%rs[0-9]+]], [[A1]], [[B1]], [[P1]]; +; COMMON: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} +; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; COMMON-NEXT: ret; define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b, <2 x i32> %c, <2 x i32> %d) #0 { %cc = icmp ne <2 x i32> %c, %d @@ -431,90 +431,90 @@ define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b, } -; CHECK-LABEL: test_trunc_2xi32( -; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_trunc_2xi32_param_0]; -; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; +; COMMON-LABEL: test_trunc_2xi32( +; COMMON: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_trunc_2xi32_param_0]; +; COMMON-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A0]]; +; COMMON-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[A1]]; +; COMMON: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} +; COMMON: st.param.b32 [func_retval0+0], [[R]]; +; COMMON: ret; define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 { %r = trunc <2 x i32> %a to <2 x i16> ret <2 x i16> %r } -; CHECK-LABEL: test_trunc_2xi64( -; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_trunc_2xi64_param_0]; -; CHECK-DAG: cvt.u16.u64 [[R0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.u16.u64 [[R1:%rs[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; +; COMMON-LABEL: test_trunc_2xi64( +; COMMON: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_trunc_2xi64_param_0]; +; COMMON-DAG: cvt.u16.u64 [[R0:%rs[0-9]+]], [[A0]]; +; COMMON-DAG: cvt.u16.u64 [[R1:%rs[0-9]+]], [[A1]]; +; COMMON: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]} +; COMMON: st.param.b32 [func_retval0+0], [[R]]; +; COMMON: ret; define <2 x i16> @test_trunc_2xi64(<2 x i64> %a) #0 { %r = trunc <2 x i64> %a to <2 x i16> ret <2 x i16> %r } -; CHECK-LABEL: test_zext_2xi32( -; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_zext_2xi32_param_0]; -; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[A1]]; -; CHECK-NEXT: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK: ret; +; COMMON-LABEL: test_zext_2xi32( +; COMMON: ld.param.u32 [[A:%r[0-9]+]], [test_zext_2xi32_param_0]; +; COMMON: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]] +; COMMON-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[A0]]; +; COMMON-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[A1]]; +; COMMON-NEXT: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}; +; COMMON: ret; define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 { %r = zext <2 x i16> %a to <2 x i32> ret <2 x i32> %r } -; CHECK-LABEL: test_zext_2xi64( -; CHECK: ld.param.u32 [[A:%r[0-9]+]], 
[test_zext_2xi64_param_0];
-; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.u64.u16 [[R0:%rd[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.u64.u16 [[R1:%rd[0-9]+]], [[A1]];
-; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]};
-; CHECK: ret;
+; COMMON-LABEL: test_zext_2xi64(
+; COMMON: ld.param.u32 [[A:%r[0-9]+]], [test_zext_2xi64_param_0];
+; COMMON: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; COMMON-DAG: cvt.u64.u16 [[R0:%rd[0-9]+]], [[A0]];
+; COMMON-DAG: cvt.u64.u16 [[R1:%rd[0-9]+]], [[A1]];
+; COMMON-NEXT: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]};
+; COMMON: ret;
 define <2 x i64> @test_zext_2xi64(<2 x i16> %a) #0 {
   %r = zext <2 x i16> %a to <2 x i64>
   ret <2 x i64> %r
 }
 
-; CHECK-LABEL: test_bitcast_i32_to_2xi16(
-; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_bitcast_i32_to_2xi16_param_0];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
+; COMMON-LABEL: test_bitcast_i32_to_2xi16(
+; COMMON: ld.param.u32 [[R:%r[0-9]+]], [test_bitcast_i32_to_2xi16_param_0];
+; COMMON: st.param.b32 [func_retval0+0], [[R]];
+; COMMON: ret;
 define <2 x i16> @test_bitcast_i32_to_2xi16(i32 %a) #0 {
   %r = bitcast i32 %a to <2 x i16>
   ret <2 x i16> %r
 }
 
-; CHECK-LABEL: test_bitcast_2xi16_to_i32(
-; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_bitcast_2xi16_to_i32_param_0];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
+; COMMON-LABEL: test_bitcast_2xi16_to_i32(
+; COMMON: ld.param.u32 [[R:%r[0-9]+]], [test_bitcast_2xi16_to_i32_param_0];
+; COMMON: st.param.b32 [func_retval0+0], [[R]];
+; COMMON: ret;
 define i32 @test_bitcast_2xi16_to_i32(<2 x i16> %a) #0 {
   %r = bitcast <2 x i16> %a to i32
   ret i32 %r
 }
 
-; CHECK-LABEL: test_shufflevector(
-; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_shufflevector_param_0];
-; CHECK: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[R]];
-; CHECK: mov.b32 [[R1:%r[0-9]+]], {[[RS1]], [[RS0]]};
-; CHECK: st.param.b32 [func_retval0+0], [[R1]];
-; CHECK: ret;
+; COMMON-LABEL: test_shufflevector(
+; COMMON: ld.param.u32 [[R:%r[0-9]+]], [test_shufflevector_param_0];
+; COMMON: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[R]];
+; COMMON: mov.b32 [[R1:%r[0-9]+]], {[[RS1]], [[RS0]]};
+; COMMON: st.param.b32 [func_retval0+0], [[R1]];
+; COMMON: ret;
 define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 {
   %s = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   ret <2 x i16> %s
 }
 
-; CHECK-LABEL: test_insertelement(
-; CHECK: ld.param.u16 [[B:%rs[0-9]+]], [test_insertelement_param_1];
-; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_insertelement_param_0];
-; CHECK: { .reg .b16 tmp; mov.b32 {[[R0:%rs[0-9]+]], tmp}, [[A]]; }
-; CHECK: mov.b32 [[R1:%r[0-9]+]], {[[R0]], [[B]]};
-; CHECK: st.param.b32 [func_retval0+0], [[R1]];
-; CHECK: ret;
+; COMMON-LABEL: test_insertelement(
+; COMMON: ld.param.u16 [[B:%rs[0-9]+]], [test_insertelement_param_1];
+; COMMON: ld.param.u32 [[A:%r[0-9]+]], [test_insertelement_param_0];
+; COMMON: { .reg .b16 tmp; mov.b32 {[[R0:%rs[0-9]+]], tmp}, [[A]]; }
+; COMMON: mov.b32 [[R1:%r[0-9]+]], {[[R0]], [[B]]};
+; COMMON: st.param.b32 [func_retval0+0], [[R1]];
+; COMMON: ret;
 define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 {
   %i = insertelement <2 x i16> %a, i16 %x, i64 1
   ret <2 x i16> %i

From ceb29e3970550b0e9b9a324977522825c6bba7aa Mon Sep 17 00:00:00 2001
From: Thomas Raoux
Date: Wed, 6 Sep 2023 14:23:29 -0700
Subject: [PATCH 04/11] More cleanup

---
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff
--git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 72c1b82f0fe4e..ce3f2d116d454 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -612,10 +612,10 @@ bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) { bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) { SDValue Vector = N->getOperand(0); - // We only care about f16x2 as it's the only real vector type we + // We only care about 16x2 as it's the only real vector type we // need to deal with. MVT VT = Vector.getSimpleValueType(); - if (!(VT == MVT::v2f16 || VT == MVT::v2bf16 || VT == MVT::v2i16)) + if (!Isv2x16VT(VT)) return false; // Find and record all uses of this vector that extract element 0 or 1. SmallVector E0, E1; @@ -910,9 +910,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { // Vector Setting unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; if (SimpleVT.isVector()) { - assert((LoadedVT == MVT::v2f16 || LoadedVT == MVT::v2bf16 || - LoadedVT == MVT::v2i16) && - "Unexpected vector type"); + assert(Isv2x16VT(LoadedVT) && "Unexpected vector type"); // v2f16/v2bf16/v2i16 is loaded using ld.b32 fromTypeWidth = 32; } @@ -1063,10 +1061,10 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { EVT EltVT = N->getValueType(0); - // v8f16 is a special case. PTX doesn't have ld.v8.f16 - // instruction. Instead, we split the vector into v2f16 chunks and + // v8x16 is a special case. PTX doesn't have ld.v8.16 + // instruction. Instead, we split the vector into v2x16 chunks and // load them with ld.v4.b32. - if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16 || EltVT == MVT::v2i16) { + if (Isv2x16VT(EltVT)) { assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode."); EltVT = MVT::i32; FromType = NVPTX::PTXLdStInstCode::Untyped; @@ -1846,10 +1844,10 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { return false; } - // v8f16 is a special case. PTX doesn't have st.v8.f16 - // instruction. Instead, we split the vector into v2f16 chunks and + // v8x16 is a special case. PTX doesn't have st.v8.x16 + // instruction. Instead, we split the vector into v2x16 chunks and // store them with st.v4.b32. 
-  if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16 || EltVT == MVT::v2i16) {
+  if (Isv2x16VT(EltVT)) {
     assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
     EltVT = MVT::i32;
     ToType = NVPTX::PTXLdStInstCode::Untyped;

From 6b74af4e568ea8bc8d3ec4fd91164a80eb51be3b Mon Sep 17 00:00:00 2001
From: Thomas Raoux
Date: Wed, 6 Sep 2023 14:55:56 -0700
Subject: [PATCH 05/11] use a pattern instead of adding a new instruction

---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 3d5e84c2298ca..4b7e2a669eb96 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -747,19 +747,10 @@ defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;
 // def v2f16imm : Operand<v2f16>;
 // defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>;
 
-def SELP_f16x2rr :
-  NVPTXInst<(outs Int32Regs:$dst),
-            (ins Int32Regs:$a, Int32Regs:$b, Int1Regs:$p),
-            "selp.b32 \t$dst, $a, $b, $p;",
-            [(set Int32Regs:$dst,
-                  (select Int1Regs:$p, (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b)))]>;
-
-def SELP_i16x2rr :
-  NVPTXInst<(outs Int32Regs:$dst),
-            (ins Int32Regs:$a, Int32Regs:$b, Int1Regs:$p),
-            "selp.b32 \t$dst, $a, $b, $p;",
-            [(set Int32Regs:$dst,
-                  (select Int1Regs:$p, (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b)))]>;
+def : Pat<(v2f16 (select Int1Regs:$p, (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b))),
+          (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
+def : Pat<(v2i16 (select Int1Regs:$p, (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b))),
+          (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
 
 //-----------------------------------
 // Test Instructions

From 5ff5f8c8bd1e61d15724fb03fc7718d21204cd5b Mon Sep 17 00:00:00 2001
From: Thomas Raoux
Date: Wed, 6 Sep 2023 15:02:24 -0700
Subject: [PATCH 06/11] missing v2bf16 case

---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 4b7e2a669eb96..f36fd3e2ca424 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -749,6 +749,8 @@ defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;
 
 def : Pat<(v2f16 (select Int1Regs:$p, (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b))),
           (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
+def : Pat<(v2bf16 (select Int1Regs:$p, (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b))),
+          (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
 def : Pat<(v2i16 (select Int1Regs:$p, (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b))),
           (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;

From db22d565f8935882e28e4d203e4fc4a6ea95d031 Mon Sep 17 00:00:00 2001
From: Thomas Raoux
Date: Wed, 6 Sep 2023 15:03:42 -0700
Subject: [PATCH 07/11] Fix comment

---
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
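Note for reviewers: the comment corrected below describes how vectors of
16-bit elements are moved through memory as lane pairs (v2x16), e.g. an
<8 x i16> travels as four 32-bit registers via ld.v4.b32/st.v4.b32. The
standalone C++ sketch below models that regrouping on a little-endian host;
the names are invented for this note and it is not backend code.

    // Model of viewing 8 x i16 as four 32-bit lane pairs (illustration only).
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      uint16_t elems[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // logical <8 x i16>
      uint32_t pairs[4];                            // four v2i16 chunks
      static_assert(sizeof(pairs) == sizeof(elems), "same storage size");
      std::memcpy(pairs, elems, sizeof(elems)); // pure reinterpretation
      for (uint32_t p : pairs) // each 32-bit chunk holds lanes {lo, hi}
        printf("{%u, %u} ", unsigned(p & 0xFFFFu), unsigned(p >> 16));
      printf("\n"); // "{1, 2} {3, 4} {5, 6} {7, 8}" on a little-endian host
      return 0;
    }

The point of the fixed comment is that this grouping applies to f16, bf16 and
i16 alike, not just f16.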
+    // Vectors of 16-bit types are loaded/stored as multiples of v2x16 elements.
     if ((EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) ||
         (EltVT == MVT::bf16 && N->getValueType(0) == MVT::v2bf16) ||
         (EltVT == MVT::i16 && N->getValueType(0) == MVT::v2i16)) {

From 8bf200fa2861614dc46ec49c45949003fa227e77 Mon Sep 17 00:00:00 2001
From: Thomas Raoux
Date: Wed, 6 Sep 2023 15:14:28 -0700
Subject: [PATCH 08/11] addressing more review comments

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 28 +++++++++++++++------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 2be7aa2f2efbf..aebd60c5cc8b9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -206,12 +206,19 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
     // us as an array of v2f16/v2bf16 elements. We must match this so we
     // stay in sync with Ins/Outs.
     if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) {
-      if (EltVT == MVT::f16)
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      case MVT::f16:
         EltVT = MVT::v2f16;
-      else if (EltVT == MVT::bf16)
+        break;
+      case MVT::bf16:
         EltVT = MVT::v2bf16;
-      else if (EltVT == MVT::i16)
+        break;
+      case MVT::i16:
         EltVT = MVT::v2i16;
+        break;
+      default:
+        llvm_unreachable("Unexpected type");
+      }
       NumElts /= 2;
     }
     for (unsigned j = 0; j != NumElts; ++j) {
@@ -627,7 +634,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
 
   setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
   setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
-  setI16x2OperationAction(ISD::AND, MVT::v2i16, Legal, Custom);
   setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
   setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
   setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
@@ -2477,7 +2483,6 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::UMAX:
   case ISD::ADD:
   case ISD::SUB:
-  case ISD::AND:
   case ISD::MUL:
   case ISD::SHL:
   case ISD::SREM:
@@ -5353,12 +5358,19 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
     Load16x2 = true;
     Opcode = NVPTXISD::LoadV4;
     EVT VVT;
-    if (EltVT == MVT::f16)
+    switch (EltVT.getSimpleVT().SimpleTy) {
+    case MVT::f16:
       VVT = MVT::v2f16;
-    else if (EltVT == MVT::bf16)
+      break;
+    case MVT::bf16:
       VVT = MVT::v2bf16;
-    else if (EltVT == MVT::i16)
+      break;
+    case MVT::i16:
       VVT = MVT::v2i16;
+      break;
+    default:
+      llvm_unreachable("Unsupported v8 vector type.");
+    }
     EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
     LdResVTs = DAG.getVTList(ListVTs);
     break;

From d827317769a43b0c5d540ec2e8d1ca2b663fede9 Mon Sep 17 00:00:00 2001
From: Thomas Raoux
Date: Wed, 6 Sep 2023 15:27:09 -0700
Subject: [PATCH 09/11] add missing bf16 case and fix indentation

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp   |  5 +++--
 llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 10 +++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index aebd60c5cc8b9..3db9edea0dcbc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5300,8 +5300,9 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
   case MVT::v4i32:
   case MVT::v4f16:
   case MVT::v4f32:
-  case MVT::v8f16: // <4 x f16x2>
-  case MVT::v8i16: // <4 x i16x2>
+  case MVT::v8f16:  // <4 x f16x2>
+  case MVT::v8bf16: // <4 x bf16x2>
+  case MVT::v8i16:  // <4 x i16x2>
     // This is a "native" vector type
     break;
   }
diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
index 5f27594ef29d7..ccd645acf9d3e 100644
--- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
@@ -69,11 +69,11 @@ define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 {
 ;
 ; I16x2-NEXT: add.s16x2 [[R:%r[0-9]+]], [[A]], [[B]];
 ;
-; NO-I16x2-DAG: mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
-; NO-I16x2-DAG: mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
-; NO-I16x2-DAG: add.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
-; NO-I16x2-DAG: add.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]];
-; NO-I16x2-DAG: mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
+; NO-I16x2-DAG:  mov.b32 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [[A]];
+; NO-I16x2-DAG:  mov.b32 {[[RS2:%rs[0-9]+]], [[RS3:%rs[0-9]+]]}, [[B]];
+; NO-I16x2-DAG:  add.s16 [[RS4:%rs[0-9]+]], [[RS0]], [[RS2]];
+; NO-I16x2-DAG:  add.s16 [[RS5:%rs[0-9]+]], [[RS1]], [[RS3]];
+; NO-I16x2-DAG:  mov.b32 [[R:%r[0-9]+]], {[[RS4]], [[RS5]]};
 ;
 ; COMMON-NEXT: st.param.b32 [func_retval0+0], [[R]];
 ; COMMON-NEXT: ret;

From a9295ae502483dfabf2b96e8de4c2e67bd0a1752 Mon Sep 17 00:00:00 2001
From: Thomas Raoux
Date: Wed, 6 Sep 2023 15:34:22 -0700
Subject: [PATCH 10/11] use a loop for the sel pattern

---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index f36fd3e2ca424..d9cec25097fd7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -747,12 +747,10 @@ defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;
 // def v2f16imm : Operand<v2f16>;
 // defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Int32Regs, v2f16imm, imm>;
 
-def : Pat<(v2f16 (select Int1Regs:$p, (v2f16 Int32Regs:$a), (v2f16 Int32Regs:$b))),
-          (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
-def : Pat<(v2bf16 (select Int1Regs:$p, (v2bf16 Int32Regs:$a), (v2bf16 Int32Regs:$b))),
-          (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
-def : Pat<(v2i16 (select Int1Regs:$p, (v2i16 Int32Regs:$a), (v2i16 Int32Regs:$b))),
+foreach vt = [v2f16, v2bf16, v2i16] in {
+def : Pat<(vt (select Int1Regs:$p, (vt Int32Regs:$a), (vt Int32Regs:$b))),
           (SELP_b32rr Int32Regs:$a, Int32Regs:$b, Int1Regs:$p)>;
+}
 
 //-----------------------------------
 // Test Instructions

From 59d2c1eb7897fba580adf0ffe28f229154cf1c4e Mon Sep 17 00:00:00 2001
From: Thomas Raoux
Date: Wed, 6 Sep 2023 16:06:09 -0700
Subject: [PATCH 11/11] Use range-based transformation instead of a loop and
 coalesce extract patterns.
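A reduced IR example of the scalarization path rewritten here
(illustrative sketch only; the function name is a placeholder, not a
test in this series). On subtargets without native i16x2 instructions,
LowerVectorArith pulls each lane out with EXTRACT_VECTOR_ELT, applies
the scalar op, and repacks the lanes; this is the loop replaced by
llvm::transform below, and the expected PTX matches the NO-I16x2 CHECK
lines in i16x2-instructions.ll:

define <2 x i16> @example_add(<2 x i16> %a, <2 x i16> %b) {
  ; without i16x2 support: mov.b32 unpack, two add.s16, mov.b32 repack
  %r = add <2 x i16> %a, %b
  ret <2 x i16> %r
}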
---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 11 +++++------
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td     | 16 ++++------------
 2 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 3db9edea0dcbc..89668611ac9ce 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2427,12 +2427,11 @@ static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
   SmallVector<SDValue> VecElements;
   for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
     SmallVector<SDValue> ScalarArgs;
-    for (int J = 0, NumOp = Op.getNumOperands(); J < NumOp; J++) {
-      SDValue Ext =
-          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op->getOperand(J),
-                      DAG.getIntPtrConstant(I, DL));
-      ScalarArgs.push_back(Ext);
-    }
+    llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
+                    [&](const SDUse &O) {
+                      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
+                                         O.get(), DAG.getIntPtrConstant(I, DL));
+                    });
     VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
   }
   SDValue V =
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index d9cec25097fd7..daed977dc18ae 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3305,24 +3305,16 @@ def : Pat<(i32 (trunc (srl Int64Regs:$s, (i32 32)))),
           (I64toI32H Int64Regs:$s)>;
 def : Pat<(i32 (trunc (sra Int64Regs:$s, (i32 32)))),
           (I64toI32H Int64Regs:$s)>;
 
-def : Pat<(f16 (extractelt (v2f16 Int32Regs:$src), 0)),
+foreach vt = [v2f16, v2bf16, v2i16] in {
+def : Pat<(extractelt (vt Int32Regs:$src), 0),
           (I32toI16L Int32Regs:$src)>;
-def : Pat<(f16 (extractelt (v2f16 Int32Regs:$src), 1)),
+def : Pat<(extractelt (vt Int32Regs:$src), 1),
           (I32toI16H Int32Regs:$src)>;
+}
 def : Pat<(v2f16 (build_vector (f16 Int16Regs:$a), (f16 Int16Regs:$b))),
           (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
-
-def : Pat<(bf16 (extractelt (v2bf16 Int32Regs:$src), 0)),
-          (I32toI16L Int32Regs:$src)>;
-def : Pat<(bf16 (extractelt (v2bf16 Int32Regs:$src), 1)),
-          (I32toI16H Int32Regs:$src)>;
 def : Pat<(v2bf16 (build_vector (bf16 Int16Regs:$a), (bf16 Int16Regs:$b))),
           (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
-
-def : Pat<(i16 (extractelt (v2i16 Int32Regs:$src), 0)),
-          (I32toI16L Int32Regs:$src)>;
-def : Pat<(i16 (extractelt (v2i16 Int32Regs:$src), 1)),
-          (I32toI16H Int32Regs:$src)>;
 def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))),
           (V2I16toI32 Int16Regs:$a, Int16Regs:$b)>;
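As a usage sketch of the coalesced patterns above (illustrative only;
the function names are placeholders): extracting lane 0 or 1 of any
v2x16 value now selects I32toI16L/I32toI16H, that is, a mov.b32
{lo, hi} unpack, and a select of whole v2x16 values is matched by the
shared foreach SELP pattern and emitted as a single selp.b32:

define i16 @example_extract_lo(<2 x i16> %v) {
  ; lane 0 is the low 16 bits of the packed b32 register (I32toI16L)
  %e = extractelement <2 x i16> %v, i32 0
  ret i16 %e
}

define <2 x i16> @example_select(i1 %p, <2 x i16> %a, <2 x i16> %b) {
  ; matched by the foreach SELP pattern, expected to emit selp.b32
  %r = select i1 %p, <2 x i16> %a, <2 x i16> %b
  ret <2 x i16> %r
}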