diff --git a/llvm/docs/TableGen/BackGuide.rst b/llvm/docs/TableGen/BackGuide.rst index e1413c1c73a79..60677a6dcd627 100644 --- a/llvm/docs/TableGen/BackGuide.rst +++ b/llvm/docs/TableGen/BackGuide.rst @@ -761,7 +761,7 @@ over time. The output looks like this. -------------------- Global Variables (5) -------------------- - AMDGPUBufferIntrinsics = [int_amdgcn_buffer_load_format, ... + AMDGPUBufferIntrinsics = [int_amdgcn_s_buffer_load, ... AMDGPUImageDimAtomicIntrinsics = [int_amdgcn_image_atomic_swap_1d, ... ... -------------------- Classes (758) -------------------- diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index d4a8954a4cdac..be4a1b8b92767 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1072,18 +1072,6 @@ def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic < defset list AMDGPUBufferIntrinsics = { -class AMDGPUBufferLoad : DefaultAttrsIntrinsic < - [data_ty], - [llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // offset(SGPR/VGPR/imm) - llvm_i1_ty, // glc(imm) - llvm_i1_ty], // slc(imm) - [IntrReadMem, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, - AMDGPURsrcIntrinsic<0>; -def int_amdgcn_buffer_load_format : AMDGPUBufferLoad; -def int_amdgcn_buffer_load : AMDGPUBufferLoad; - // Generate a buffer_load instruction that may be optimized to s_buffer_load if // the offset argument is uniform. def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic < @@ -1100,25 +1088,12 @@ def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic < [IntrNoMem, ImmArg>]>, AMDGPURsrcIntrinsic<0>; -class AMDGPUBufferStore : DefaultAttrsIntrinsic < - [], - [data_ty, // vdata(VGPR) - llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // offset(SGPR/VGPR/imm) - llvm_i1_ty, // glc(imm) - llvm_i1_ty], // slc(imm) - [IntrWriteMem, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, - AMDGPURsrcIntrinsic<1>; -def int_amdgcn_buffer_store_format : AMDGPUBufferStore; -def int_amdgcn_buffer_store : AMDGPUBufferStore; - -// New buffer intrinsics with separate raw and struct variants. The raw +// Buffer intrinsics with separate raw and struct variants. The raw // variant never has an index. The struct variant always has an index, even if // it is const 0. A struct intrinsic with constant 0 index is different to the // corresponding raw intrinsic on gfx9+ because the behavior of bound checking // and swizzling changes depending on whether idxen is set in the instruction. -// These new instrinsics also keep the offset and soffset arguments separate as +// These instrinsics also keep the offset and soffset arguments separate as // they behave differently in bounds checking and swizzling. // The versions of these intrinsics that take <4 x i32> arguments are deprecated @@ -1478,41 +1453,7 @@ def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_fmax : AMDGPUStructPtrBufferAtomic; -// Obsolescent tbuffer intrinsics. -def int_amdgcn_tbuffer_load : DefaultAttrsIntrinsic < - [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 - [llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // voffset(VGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // offset(imm) - llvm_i32_ty, // dfmt(imm) - llvm_i32_ty, // nfmt(imm) - llvm_i1_ty, // glc(imm) - llvm_i1_ty], // slc(imm) - [IntrReadMem, - ImmArg>, ImmArg>, ImmArg>, - ImmArg>, ImmArg>], "", [SDNPMemOperand]>, - AMDGPURsrcIntrinsic<0>; - -def int_amdgcn_tbuffer_store : DefaultAttrsIntrinsic < - [], - [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 - llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // voffset(VGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // offset(imm) - llvm_i32_ty, // dfmt(imm) - llvm_i32_ty, // nfmt(imm) - llvm_i1_ty, // glc(imm) - llvm_i1_ty], // slc(imm) - [IntrWriteMem, ImmArg>, - ImmArg>, ImmArg>, - ImmArg>, ImmArg>], "", [SDNPMemOperand]>, - AMDGPURsrcIntrinsic<1>; - -// New tbuffer intrinsics, with: +// tbuffer intrinsics, with: // - raw and struct variants // - joint format field // - joint cachepolicy field @@ -1659,51 +1600,6 @@ def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic < ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; -class AMDGPUBufferAtomic : Intrinsic < - [llvm_anyint_ty], - [LLVMMatchType<0>, // vdata(VGPR) - llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // offset(SGPR/VGPR/imm) - llvm_i1_ty], // slc(imm) - [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, - AMDGPURsrcIntrinsic<1, 0>; -def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic; -def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic; -def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic; -def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic; -def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic; -def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic; -def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic; -def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic; -def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic; -def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic; -def int_amdgcn_buffer_atomic_cmpswap : Intrinsic< - [llvm_i32_ty], - [llvm_i32_ty, // src(VGPR) - llvm_i32_ty, // cmp(VGPR) - llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // offset(SGPR/VGPR/imm) - llvm_i1_ty], // slc(imm) - [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, - AMDGPURsrcIntrinsic<2, 0>; - -def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic; - -class AMDGPUBufferAtomicFP : Intrinsic < - [llvm_anyfloat_ty], - [LLVMMatchType<0>, // vdata(VGPR) - llvm_v4i32_ty, // rsrc(SGPR) - llvm_i32_ty, // vindex(VGPR) - llvm_i32_ty, // offset(SGPR/VGPR/imm) - llvm_i1_ty], // slc(imm) - [ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, - AMDGPURsrcIntrinsic<1, 0>; - -// Legacy form of the intrinsic. raw and struct forms should be preferred. -def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP; - class AMDGPURawBufferLoadLDS : Intrinsic < [], [llvm_v4i32_ty, // rsrc(SGPR) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 1d645002b1fe6..38cc5a9bef969 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -249,63 +249,54 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) { switch (I.getIntrinsicID()) { default: return; - case Intrinsic::amdgcn_buffer_atomic_add: case Intrinsic::amdgcn_struct_buffer_atomic_add: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: case Intrinsic::amdgcn_raw_buffer_atomic_add: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: Op = AtomicRMWInst::Add; break; - case Intrinsic::amdgcn_buffer_atomic_sub: case Intrinsic::amdgcn_struct_buffer_atomic_sub: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: case Intrinsic::amdgcn_raw_buffer_atomic_sub: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: Op = AtomicRMWInst::Sub; break; - case Intrinsic::amdgcn_buffer_atomic_and: case Intrinsic::amdgcn_struct_buffer_atomic_and: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: case Intrinsic::amdgcn_raw_buffer_atomic_and: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: Op = AtomicRMWInst::And; break; - case Intrinsic::amdgcn_buffer_atomic_or: case Intrinsic::amdgcn_struct_buffer_atomic_or: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: case Intrinsic::amdgcn_raw_buffer_atomic_or: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: Op = AtomicRMWInst::Or; break; - case Intrinsic::amdgcn_buffer_atomic_xor: case Intrinsic::amdgcn_struct_buffer_atomic_xor: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: case Intrinsic::amdgcn_raw_buffer_atomic_xor: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: Op = AtomicRMWInst::Xor; break; - case Intrinsic::amdgcn_buffer_atomic_smin: case Intrinsic::amdgcn_struct_buffer_atomic_smin: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: case Intrinsic::amdgcn_raw_buffer_atomic_smin: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: Op = AtomicRMWInst::Min; break; - case Intrinsic::amdgcn_buffer_atomic_umin: case Intrinsic::amdgcn_struct_buffer_atomic_umin: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: case Intrinsic::amdgcn_raw_buffer_atomic_umin: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: Op = AtomicRMWInst::UMin; break; - case Intrinsic::amdgcn_buffer_atomic_smax: case Intrinsic::amdgcn_struct_buffer_atomic_smax: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: case Intrinsic::amdgcn_raw_buffer_atomic_smax: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: Op = AtomicRMWInst::Max; break; - case Intrinsic::amdgcn_buffer_atomic_umax: case Intrinsic::amdgcn_struct_buffer_atomic_umax: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: case Intrinsic::amdgcn_raw_buffer_atomic_umax: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 160a17584ca3a..93bca4402ed23 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1158,12 +1158,10 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType())); break; } - case Intrinsic::amdgcn_buffer_store_format: case Intrinsic::amdgcn_raw_buffer_store_format: case Intrinsic::amdgcn_struct_buffer_store_format: case Intrinsic::amdgcn_raw_tbuffer_store: case Intrinsic::amdgcn_struct_tbuffer_store: - case Intrinsic::amdgcn_tbuffer_store: case Intrinsic::amdgcn_image_store_1d: case Intrinsic::amdgcn_image_store_1darray: case Intrinsic::amdgcn_image_store_2d: @@ -1376,8 +1374,6 @@ std::optional GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic( std::function SimplifyAndSetOp) const { switch (II.getIntrinsicID()) { - case Intrinsic::amdgcn_buffer_load: - case Intrinsic::amdgcn_buffer_load_format: case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_ptr_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: @@ -1391,7 +1387,6 @@ std::optional GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic( case Intrinsic::amdgcn_struct_ptr_buffer_load_format: case Intrinsic::amdgcn_struct_tbuffer_load: case Intrinsic::amdgcn_struct_ptr_tbuffer_load: - case Intrinsic::amdgcn_tbuffer_load: return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts); default: { if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 410dc83d45c57..e84d39a2895c8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -256,17 +256,6 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; @@ -339,7 +328,6 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; -def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f9948e92862f7..5bb29b1f018f6 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1280,19 +1280,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } - case Intrinsic::amdgcn_buffer_atomic_fadd: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); - Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE; - Info.align.reset(); - Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - - const ConstantInt *Vol = dyn_cast(CI.getOperand(4)); - if (!Vol || !Vol->isZero()) - Info.flags |= MachineMemOperand::MOVolatile; - - return true; - } case Intrinsic::amdgcn_ds_add_gs_reg_rtn: case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: { Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -8602,12 +8589,6 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, M->getMemOperand()); } -// Return a value to use for the idxen operand by examining the vindex operand. -static unsigned getIdxEn(SDValue VIndex) { - // No need to set idxen if vindex is known to be zero. - return isNullConstant(VIndex) ? 0 : 1; -} - SDValue SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const { @@ -8732,43 +8713,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } - case Intrinsic::amdgcn_buffer_load: - case Intrinsic::amdgcn_buffer_load_format: { - unsigned Glc = Op.getConstantOperandVal(5); - unsigned Slc = Op.getConstantOperandVal(6); - unsigned IdxEn = getIdxEn(Op.getOperand(3)); - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); - - unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? - AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; - - EVT VT = Op.getValueType(); - EVT IntVT = VT.changeTypeToInteger(); - auto *M = cast(Op); - EVT LoadVT = Op.getValueType(); - - if (LoadVT.getScalarType() == MVT::f16) - return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, - M, DAG, Ops); - - // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics - if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16) - return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, - M->getMemOperand()); - - return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand(), DAG); - } case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_ptr_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: @@ -8818,35 +8762,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return lowerIntrinsicLoad(cast(Op), IsFormat, DAG, Ops); } - case Intrinsic::amdgcn_tbuffer_load: { - MemSDNode *M = cast(Op); - EVT LoadVT = Op.getValueType(); - - auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget); - unsigned Dfmt = Op.getConstantOperandVal(7); - unsigned Nfmt = Op.getConstantOperandVal(8); - unsigned Glc = Op.getConstantOperandVal(9); - unsigned Slc = Op.getConstantOperandVal(10); - unsigned IdxEn = getIdxEn(Op.getOperand(3)); - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - Op.getOperand(4), // voffset - SOffset, // soffset - Op.getOperand(6), // offset - DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen - }; - - if (LoadVT.getScalarType() == MVT::f16) - return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, - M, DAG, Ops); - return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, LoadVT, M->getMemOperand(), - DAG); - } case Intrinsic::amdgcn_raw_tbuffer_load: case Intrinsic::amdgcn_raw_ptr_tbuffer_load: { MemSDNode *M = cast(Op); @@ -8901,82 +8816,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op->getVTList(), Ops, LoadVT, M->getMemOperand(), DAG); } - case Intrinsic::amdgcn_buffer_atomic_swap: - case Intrinsic::amdgcn_buffer_atomic_add: - case Intrinsic::amdgcn_buffer_atomic_sub: - case Intrinsic::amdgcn_buffer_atomic_csub: - case Intrinsic::amdgcn_buffer_atomic_smin: - case Intrinsic::amdgcn_buffer_atomic_umin: - case Intrinsic::amdgcn_buffer_atomic_smax: - case Intrinsic::amdgcn_buffer_atomic_umax: - case Intrinsic::amdgcn_buffer_atomic_and: - case Intrinsic::amdgcn_buffer_atomic_or: - case Intrinsic::amdgcn_buffer_atomic_xor: - case Intrinsic::amdgcn_buffer_atomic_fadd: { - unsigned Slc = Op.getConstantOperandVal(6); - unsigned IdxEn = getIdxEn(Op.getOperand(4)); - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); - - EVT VT = Op.getValueType(); - - auto *M = cast(Op); - unsigned Opcode = 0; - - switch (IntrID) { - case Intrinsic::amdgcn_buffer_atomic_swap: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP; - break; - case Intrinsic::amdgcn_buffer_atomic_add: - Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD; - break; - case Intrinsic::amdgcn_buffer_atomic_sub: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB; - break; - case Intrinsic::amdgcn_buffer_atomic_csub: - Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB; - break; - case Intrinsic::amdgcn_buffer_atomic_smin: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN; - break; - case Intrinsic::amdgcn_buffer_atomic_umin: - Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN; - break; - case Intrinsic::amdgcn_buffer_atomic_smax: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX; - break; - case Intrinsic::amdgcn_buffer_atomic_umax: - Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX; - break; - case Intrinsic::amdgcn_buffer_atomic_and: - Opcode = AMDGPUISD::BUFFER_ATOMIC_AND; - break; - case Intrinsic::amdgcn_buffer_atomic_or: - Opcode = AMDGPUISD::BUFFER_ATOMIC_OR; - break; - case Intrinsic::amdgcn_buffer_atomic_xor: - Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; - break; - case Intrinsic::amdgcn_buffer_atomic_fadd: - Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD; - break; - default: - llvm_unreachable("unhandled atomic opcode"); - } - - return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, - M->getMemOperand()); - } case Intrinsic::amdgcn_raw_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); @@ -9085,29 +8924,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); - case Intrinsic::amdgcn_buffer_atomic_cmpswap: { - unsigned Slc = Op.getConstantOperandVal(7); - unsigned IdxEn = getIdxEn(Op.getOperand(5)); - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // src - Op.getOperand(3), // cmp - Op.getOperand(4), // rsrc - Op.getOperand(5), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); - - EVT VT = Op.getValueType(); - auto *M = cast(Op); - - return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, - Op->getVTList(), Ops, VT, M->getMemOperand()); - } case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: { SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG); @@ -9550,34 +9366,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(); }; - case Intrinsic::amdgcn_tbuffer_store: { - SDValue VData = Op.getOperand(2); - bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); - if (IsD16) - VData = handleD16VData(VData, DAG); - unsigned Dfmt = Op.getConstantOperandVal(8); - unsigned Nfmt = Op.getConstantOperandVal(9); - unsigned Glc = Op.getConstantOperandVal(10); - unsigned Slc = Op.getConstantOperandVal(11); - unsigned IdxEn = getIdxEn(Op.getOperand(4)); - SDValue Ops[] = { - Chain, - VData, // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - Op.getOperand(5), // voffset - Op.getOperand(6), // soffset - Op.getOperand(7), // offset - DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : - AMDGPUISD::TBUFFER_STORE_FORMAT; - MemSDNode *M = cast(Op); - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, - M->getMemoryVT(), M->getMemOperand()); - } case Intrinsic::amdgcn_struct_tbuffer_store: case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { @@ -9635,42 +9423,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, M->getMemoryVT(), M->getMemOperand()); } - case Intrinsic::amdgcn_buffer_store: - case Intrinsic::amdgcn_buffer_store_format: { - SDValue VData = Op.getOperand(2); - bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); - if (IsD16) - VData = handleD16VData(VData, DAG); - unsigned Glc = Op.getConstantOperandVal(6); - unsigned Slc = Op.getConstantOperandVal(7); - unsigned IdxEn = getIdxEn(Op.getOperand(4)); - SDValue Ops[] = { - Chain, - VData, - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); - - unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? - AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; - Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; - MemSDNode *M = cast(Op); - - // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics - EVT VDataType = VData.getValueType().getScalarType(); - if (VDataType == MVT::i8 || VDataType == MVT::i16) - return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); - - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, - M->getMemoryVT(), M->getMemOperand()); - } - case Intrinsic::amdgcn_raw_buffer_store: case Intrinsic::amdgcn_raw_ptr_buffer_store: case Intrinsic::amdgcn_raw_buffer_store_format: @@ -10076,8 +9828,8 @@ std::pair SITargetLowering::splitBufferOffsets( return {N0, SDValue(C1, 0)}; } -// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the -// three offsets (voffset, soffset and instoffset) into the SDValue[3] array +// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store +// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array // pointed to by Offsets. void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, SDValue *Offsets, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 292b17da93583..61bffe9aa80e5 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -255,9 +255,9 @@ class SITargetLowering final : public AMDGPUTargetLowering { bool shouldExpandVectorDynExt(SDNode *N) const; private: - // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the - // three offsets (voffset, soffset and instoffset) into the SDValue[3] array - // pointed to by Offsets. + // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store + // the three offsets (voffset, soffset and instoffset) into the SDValue[3] + // array pointed to by Offsets. void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, SDValue *Offsets, Align Alignment = Align(4)) const; diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll index 3db2183845ff2..6d0ca701e47a9 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -1,93 +1,5 @@ ; RUN: opt -mtriple amdgcn-mesa-mesa3d -passes='print' -disable-output %s 2>&1 | FileCheck %s -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap.i32( -define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.add.i32( -define float @buffer_atomic_add(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.sub.i32( -define float @buffer_atomic_sub(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smin.i32( -define float @buffer_atomic_smin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umin.i32( -define float @buffer_atomic_umin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smax.i32( -define float @buffer_atomic_smax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umax.i32( -define float @buffer_atomic_umax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.and.i32( -define float @buffer_atomic_and(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.and.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.or.i32( -define float @buffer_atomic_or(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.or.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.xor.i32( -define float @buffer_atomic_xor(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - -;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap( -define float @buffer_atomic_cmpswap(<4 x i32> inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 { -main_body: - %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %r = bitcast i32 %orig to float - ret float %r -} - ;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32( define float @raw_buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { main_body: @@ -440,18 +352,6 @@ main_body: ret float %r } -declare i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0 - declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32) #0 declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) #0 declare i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32) #0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll index f4cd19a2ffa80..87461ec77446d 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -21,75 +21,37 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[COPY5]], %subreg.sub2, killed [[COPY4]], %subreg.sub3 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE3]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec - ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) - ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 64 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 128 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[COPY7]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[COPY7]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 72 - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 144 - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[COPY9]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[COPY9]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 80 @@ -107,6 +69,7 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE2]], [[COPY11]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ + ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 88 ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) @@ -123,94 +86,95 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 96 - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 192 - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY14]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[COPY14]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET5]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET6]], [[REG_SEQUENCE2]], [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET7]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN2]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET5]], [[REG_SEQUENCE2]], [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET6]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET8]], [[REG_SEQUENCE2]], [[COPY15]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET7]], [[REG_SEQUENCE2]], [[COPY15]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 104 - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 208 - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6]], [[REG_SEQUENCE2]], [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN2]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5]], [[REG_SEQUENCE2]], [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET8]], [[REG_SEQUENCE2]], [[COPY17]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7]], [[REG_SEQUENCE2]], [[COPY17]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 112 ; GCN-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 224 ; GCN-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY22:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY21]], [[S_LOAD_DWORDX4_IMM]], [[COPY22]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY21]], [[S_LOAD_DWORDX4_IMM]], [[COPY22]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY23]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY23]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY24]], [[REG_SEQUENCE2]], [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY24]], [[REG_SEQUENCE2]], [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY25]], [[REG_SEQUENCE2]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY25]], [[REG_SEQUENCE2]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY26]], [[REG_SEQUENCE2]], [[COPY27]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN12:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN13:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY26]], [[REG_SEQUENCE2]], [[COPY27]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 120 ; GCN-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 240 ; GCN-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY32:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY31]], [[S_LOAD_DWORDX4_IMM]], [[COPY32]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY31]], [[S_LOAD_DWORDX4_IMM]], [[COPY32]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY33]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY33]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY34]], [[REG_SEQUENCE2]], [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY34]], [[REG_SEQUENCE2]], [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY35]], [[REG_SEQUENCE2]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY35]], [[REG_SEQUENCE2]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY37:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY36]], [[REG_SEQUENCE2]], [[COPY37]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN12:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN13:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY36]], [[REG_SEQUENCE2]], [[COPY37]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY38]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) @@ -268,113 +232,63 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY58:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY58]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[COPY58]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY59:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY59]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY59]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 288 ; GCN-NEXT: [[COPY60:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY60]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY60]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY61:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY62:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY61]], [[S_LOAD_DWORDX4_IMM]], [[COPY62]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY61]], [[S_LOAD_DWORDX4_IMM]], [[COPY62]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY63:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN8]], [[COPY63]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[COPY63]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY64:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN9]], [[COPY64]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY64]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY65:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN10]], [[COPY65]], [[REG_SEQUENCE2]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN8]], [[COPY65]], [[REG_SEQUENCE2]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN1]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY66:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY67:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN11]], [[COPY66]], [[REG_SEQUENCE2]], [[COPY67]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN12]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN13]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN9]], [[COPY66]], [[REG_SEQUENCE2]], [[COPY67]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN10]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN11]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[COPY68:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY68]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[COPY68]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 152 ; GCN-NEXT: [[COPY69:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY69]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY69]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sreg_32 = S_MOV_B32 304 ; GCN-NEXT: [[COPY70:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY70]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY70]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY71:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY72:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY71]], [[S_LOAD_DWORDX4_IMM]], [[COPY72]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY71]], [[S_LOAD_DWORDX4_IMM]], [[COPY72]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8) ; GCN-NEXT: [[COPY73:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8]], [[COPY73]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[COPY73]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY74:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9]], [[COPY74]], [[REG_SEQUENCE2]], [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY74]], [[REG_SEQUENCE2]], [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY75:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10]], [[COPY75]], [[REG_SEQUENCE2]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8]], [[COPY75]], [[REG_SEQUENCE2]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN1]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: [[COPY76:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN-NEXT: [[COPY77:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11]], [[COPY76]], [[REG_SEQUENCE2]], [[COPY77]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN12]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN13]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9]], [[COPY76]], [[REG_SEQUENCE2]], [[COPY77]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8) ; GCN-NEXT: S_ENDPGM 0 bb.0: %tmp0 = load <4 x i32>, ptr addrspace(6) %arg0, align 16, !invariant.load !0 %tmp1 = load ptr addrspace(8), ptr addrspace(6) %arg0, align 16, !invariant.load !0 - %buffer0 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 16, i1 false, i1 false) #0 - %buffer1 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #0 - %buffer2 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 1, i32 16, i1 false, i1 false) #0 - %buffer3 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 16, i1 false, i1 false) #0 - - ; Insert inline asm to keep the different instruction types from being mixed. This makes the output easier to read. - call void asm sideeffect "", "" () - - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer0, <4 x i32> %tmp0, i32 0, i32 32, i1 false, i1 false) #1 - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #1 - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer2, <4 x i32> %tmp0, i32 1, i32 32, i1 false, i1 false) #1 - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer3, <4 x i32> %tmp0, i32 %arg1, i32 32, i1 false, i1 false) #1 - - call void asm sideeffect "", "" () - - %buffer_format0 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 48, i1 false, i1 false) #0 - %buffer_format1 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #0 - %buffer_format2 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 1, i32 48, i1 false, i1 false) #0 - %buffer_format3 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 48, i1 false, i1 false) #0 - - call void asm sideeffect "", "" () - - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format0, <4 x i32> %tmp0, i32 0, i32 64, i1 false, i1 false) #1 - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #1 - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format2, <4 x i32> %tmp0, i32 1, i32 64, i1 false, i1 false) #1 - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format3, <4 x i32> %tmp0, i32 %arg1, i32 64, i1 false, i1 false) #1 - - call void asm sideeffect "", "" () - - %atomic_add0 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 80, i1 false) #2 - %atomic_add1 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2 - %atomic_add2 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 1, i32 80, i1 false) #2 - %atomic_add3 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 80, i1 false) #2 - - call void asm sideeffect "", "" () - - %atomic_cmpswap0 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 96, i1 false) #2 - %atomic_cmpswap1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2 - %atomic_cmpswap2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 1, i32 96, i1 false) #2 - %atomic_cmpswap3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 96, i1 false) #2 - - call void asm sideeffect "", "" () - - %fadd1 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 112, i1 false) #2 - %fadd2 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2 - %fadd3 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 1, i32 112, i1 false) #2 - %fadd4 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 %arg1, i32 112, i1 false) #2 - - call void asm sideeffect "", "" () - ; rsrc, offset, soffset, cachepolicy %raw_buffer0 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 128, i32 0, i32 0) #0 %raw_buffer1 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 64, i32 64, i32 0) #0 %raw_buffer2 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 128, i32 0) #0 @@ -383,7 +297,6 @@ bb.0: call void asm sideeffect "", "" () - ; rsrc, offset, soffset, cachepolicy %raw_ptr_buffer0 = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %tmp1, i32 128, i32 0, i32 0) #3 %raw_ptr_buffer1 = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %tmp1, i32 64, i32 64, i32 0) #3 %raw_ptr_buffer2 = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %tmp1, i32 0, i32 128, i32 0) #3 @@ -472,7 +385,6 @@ bb.0: call void asm sideeffect "", "" () - ; rsrc, vindex, offset, soffset, cachepolicy %struct_buffer0 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 224, i32 0, i32 0) #0 %struct_buffer1 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 112, i32 112, i32 0) #0 %struct_buffer2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 0, i32 224, i32 0) #0 @@ -592,13 +504,6 @@ bb.0: ret void } -declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 -declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #2 -declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #2 -declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) #2 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0 declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #0 declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) #2 @@ -629,7 +534,6 @@ declare void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float>, ptr attributes #0 = { nounwind readonly } -attributes #1 = { nounwind writeonly } attributes #2 = { nounwind } attributes #3 = { nounwind memory(argmem: read) } attributes #4 = { nounwind memory(argmem: write) } diff --git a/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll b/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll index bdc73e5b99720..727863936096a 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll @@ -4,46 +4,6 @@ ; The buffer_loads and buffer_stores all access the same location. Check they do ; not get reordered by the scheduler. -; GCN-LABEL: {{^}}_amdgpu_cs_main: -; GCN: buffer_load_dword -; GCN: buffer_store_dword -; GCN: buffer_load_dword -; GCN: buffer_store_dword -; GCN: buffer_load_dword -; GCN: buffer_store_dword -; GCN: buffer_load_dword -; GCN: buffer_store_dword - -; Function Attrs: nounwind -define amdgpu_cs void @_amdgpu_cs_main(<3 x i32> inreg %arg3, <3 x i32> %arg5) { -.entry: - %tmp9 = add <3 x i32> %arg3, %arg5 - %tmp10 = extractelement <3 x i32> %tmp9, i32 0 - %tmp11 = shl i32 %tmp10, 2 - %tmp12 = inttoptr i64 undef to ptr addrspace(4) - %tmp13 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16 - %tmp14 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp13, i32 0, i32 %tmp11, i1 false, i1 false) #0 - %tmp17 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16 - call void @llvm.amdgcn.buffer.store.f32(float %tmp14, <4 x i32> %tmp17, i32 0, i32 %tmp11, i1 false, i1 false) #0 - %tmp20 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16 - %tmp21 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp20, i32 0, i32 %tmp11, i1 false, i1 false) #0 - %tmp22 = fadd reassoc nnan arcp contract float %tmp21, 1.000000e+00 - call void @llvm.amdgcn.buffer.store.f32(float %tmp22, <4 x i32> %tmp20, i32 0, i32 %tmp11, i1 false, i1 false) #0 - %tmp25 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16 - %tmp26 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp25, i32 0, i32 %tmp11, i1 false, i1 false) #0 - %tmp27 = fadd reassoc nnan arcp contract float %tmp26, 1.000000e+00 - call void @llvm.amdgcn.buffer.store.f32(float %tmp27, <4 x i32> %tmp25, i32 0, i32 %tmp11, i1 false, i1 false) #0 - %tmp30 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16 - %tmp31 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp30, i32 0, i32 %tmp11, i1 false, i1 false) #0 - %tmp32 = fadd reassoc nnan arcp contract float %tmp31, 1.000000e+00 - call void @llvm.amdgcn.buffer.store.f32(float %tmp32, <4 x i32> %tmp30, i32 0, i32 %tmp11, i1 false, i1 false) #0 - %tmp35 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16 - %tmp36 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp35, i32 0, i32 %tmp11, i1 false, i1 false) #0 - %tmp37 = fadd reassoc nnan arcp contract float %tmp36, 1.000000e+00 - call void @llvm.amdgcn.buffer.store.f32(float %tmp37, <4 x i32> %tmp35, i32 0, i32 %tmp11, i1 false, i1 false) #0 - ret void -} - ; GCN-LABEL: {{^}}test1: ; GCN: buffer_store_dword ; GCN: buffer_load_dword @@ -84,10 +44,6 @@ define amdgpu_cs void @test1_ptrs_reorderable(ptr addrspace(8) inreg %buf, i32 % } -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2 - -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #3 - declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #2 declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) #3 diff --git a/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll b/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll deleted file mode 100644 index 8ff78aaccf5a3..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: not --crash llc -mtriple=amdgcn -mcpu=tahiti -o /dev/null %s 2>&1 | FileCheck -check-prefix=FAIL %s -; RUN: not --crash llc -mtriple=amdgcn -mcpu=hawaii -o /dev/null %s 2>&1 | FileCheck -check-prefix=FAIL %s -; RUN: not --crash llc -mtriple=amdgcn -mcpu=fiji -o /dev/null %s 2>&1 | FileCheck -check-prefix=FAIL %s -; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 -o /dev/null %s 2>&1 | FileCheck -check-prefix=FAIL %s -; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -o /dev/null %s 2>&1 | FileCheck -check-prefix=FAIL %s - -; Make sure selection of these intrinsics fails on targets that do not -; have the instruction available. -; FIXME: Should also really make sure the v2f16 version fails. - -; FAIL: LLVM ERROR: Cannot select: {{.+}}: f32,ch = BUFFER_ATOMIC_FADD -define amdgpu_cs void @atomic_fadd(<4 x i32> inreg %arg0) { - %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %arg0, i32 0, i32 112, i1 false) - ret void -} - -declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1 immarg) #0 - -attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll b/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll deleted file mode 100644 index b35de03203004..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll +++ /dev/null @@ -1,141 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,FORCESC0SC1 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx941 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,FORCESC0SC1 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -mattr=-forcestoresc1 < %s | FileCheck --check-prefixes=GCN,NOSC0SC1 %s - -define amdgpu_kernel void @store_global(ptr addrspace(1) %ptr) { -; FORCESC0SC1-LABEL: store_global: -; FORCESC0SC1: ; %bb.0: ; %entry -; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 0 -; FORCESC0SC1-NEXT: v_mov_b32_e32 v1, 1.0 -; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0) -; FORCESC0SC1-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 -; FORCESC0SC1-NEXT: s_endpgm -; -; NOSC0SC1-LABEL: store_global: -; NOSC0SC1: ; %bb.0: ; %entry -; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 0 -; NOSC0SC1-NEXT: v_mov_b32_e32 v1, 1.0 -; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0) -; NOSC0SC1-NEXT: global_store_dword v0, v1, s[0:1] -; NOSC0SC1-NEXT: s_endpgm -entry: - store float 1.000000e+00, ptr addrspace(1) %ptr, align 4 - ret void -} - -define amdgpu_kernel void @store_flat(ptr addrspace(0) %ptr) { -; FORCESC0SC1-LABEL: store_flat: -; FORCESC0SC1: ; %bb.0: ; %entry -; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; FORCESC0SC1-NEXT: v_mov_b32_e32 v2, 1.0 -; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0) -; FORCESC0SC1-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; FORCESC0SC1-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 -; FORCESC0SC1-NEXT: s_endpgm -; -; NOSC0SC1-LABEL: store_flat: -; NOSC0SC1: ; %bb.0: ; %entry -; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; NOSC0SC1-NEXT: v_mov_b32_e32 v2, 1.0 -; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0) -; NOSC0SC1-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; NOSC0SC1-NEXT: flat_store_dword v[0:1], v2 -; NOSC0SC1-NEXT: s_endpgm -entry: - store float 1.000000e+00, ptr addrspace(0) %ptr, align 4 - ret void -} - -define amdgpu_kernel void @store_lds(ptr addrspace(3) %ptr) { -; GCN-LABEL: store_lds: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, 1.0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: ds_write_b32 v1, v0 -; GCN-NEXT: s_endpgm -entry: - store float 1.000000e+00, ptr addrspace(3) %ptr, align 4 - ret void -} - -define amdgpu_kernel void @store_scratch(ptr addrspace(5) %ptr) { -; FORCESC0SC1-LABEL: store_scratch: -; FORCESC0SC1: ; %bb.0: ; %entry -; FORCESC0SC1-NEXT: s_load_dword s0, s[0:1], 0x24 -; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 1.0 -; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0) -; FORCESC0SC1-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 -; FORCESC0SC1-NEXT: s_endpgm -; -; NOSC0SC1-LABEL: store_scratch: -; NOSC0SC1: ; %bb.0: ; %entry -; NOSC0SC1-NEXT: s_load_dword s0, s[0:1], 0x24 -; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 1.0 -; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0) -; NOSC0SC1-NEXT: scratch_store_dword off, v0, s0 -; NOSC0SC1-NEXT: s_endpgm -entry: - store float 1.000000e+00, ptr addrspace(5) %ptr, align 4 - ret void -} - -define amdgpu_ps void @store_buffer(<4 x i32> inreg %rsrc, float %data, i32 %index) { -; FORCESC0SC1-LABEL: store_buffer: -; FORCESC0SC1: ; %bb.0: ; %main_body -; FORCESC0SC1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen sc0 sc1 -; FORCESC0SC1-NEXT: s_endpgm -; -; NOSC0SC1-LABEL: store_buffer: -; NOSC0SC1: ; %bb.0: ; %main_body -; NOSC0SC1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen -; NOSC0SC1-NEXT: s_endpgm -main_body: - call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) - ret void -} - -define amdgpu_kernel void @store_global_atomic(ptr addrspace(1) %ptr) { -; FORCESC0SC1-LABEL: store_global_atomic: -; FORCESC0SC1: ; %bb.0: ; %entry -; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 0 -; FORCESC0SC1-NEXT: v_mov_b32_e32 v1, 1.0 -; FORCESC0SC1-NEXT: buffer_wbl2 sc1 -; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0) -; FORCESC0SC1-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 -; FORCESC0SC1-NEXT: s_endpgm -; -; NOSC0SC1-LABEL: store_global_atomic: -; NOSC0SC1: ; %bb.0: ; %entry -; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 0 -; NOSC0SC1-NEXT: v_mov_b32_e32 v1, 1.0 -; NOSC0SC1-NEXT: buffer_wbl2 sc1 -; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0) -; NOSC0SC1-NEXT: global_store_dword v0, v1, s[0:1] sc1 -; NOSC0SC1-NEXT: s_endpgm -entry: - store atomic float 1.000000e+00, ptr addrspace(1) %ptr syncscope("agent-one-as") seq_cst, align 4 - ret void -} - -define amdgpu_kernel void @store_global_atomic_system(ptr addrspace(1) %ptr) { -; GCN-LABEL: store_global_atomic_system: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 -; GCN-NEXT: s_endpgm - store atomic float 1.000000e+00, ptr addrspace(1) %ptr monotonic, align 4 - ret void -} - - -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 121fab51024fd..0835b43e96a67 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -2,7 +2,6 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 -declare double @llvm.amdgcn.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i1) declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg) declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32 immarg) @@ -24,89 +23,6 @@ declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1) -define amdgpu_kernel void @buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { -; GFX90A-LABEL: buffer_atomic_add_noret_f64: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: buffer_atomic_add_noret_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v2, s8 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - ret void -} - -define amdgpu_ps void @buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { -; GFX90A-LABEL: buffer_atomic_add_rtn_f64: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: buffer_atomic_add_rtn_f64: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1 -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - store double %ret, ptr undef - ret void -} - -define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { -; GFX90A-LABEL: buffer_atomic_add_rtn_f64_off4_slc: -; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] -; GFX90A-NEXT: s_endpgm -; -; GFX940-LABEL: buffer_atomic_add_rtn_f64_off4_slc: -; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c -; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v2, s10 -; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt -; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1 -; GFX940-NEXT: s_endpgm -main_body: - %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) - store double %ret, ptr addrspace(1) %out, align 8 - ret void -} - define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body @@ -1186,7 +1102,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB42_3 +; GFX90A-NEXT: s_cbranch_execz .LBB39_3 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] @@ -1198,7 +1114,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB42_2: ; %atomicrmw.start +; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX90A-NEXT: buffer_wbl2 @@ -1210,8 +1126,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_2 -; GFX90A-NEXT: .LBB42_3: +; GFX90A-NEXT: s_cbranch_execnz .LBB39_2 +; GFX90A-NEXT: .LBB39_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: @@ -1221,7 +1137,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB42_2 +; GFX940-NEXT: s_cbranch_execz .LBB39_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1233,7 +1149,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: .LBB42_2: +; GFX940-NEXT: .LBB39_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst @@ -1248,7 +1164,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB43_2 +; GFX90A-NEXT: s_cbranch_execz .LBB40_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1259,7 +1175,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB43_2: +; GFX90A-NEXT: .LBB40_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: @@ -1269,7 +1185,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB43_2 +; GFX940-NEXT: s_cbranch_execz .LBB40_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1281,7 +1197,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB43_2: +; GFX940-NEXT: .LBB40_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -1296,7 +1212,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB44_3 +; GFX90A-NEXT: s_cbranch_execz .LBB41_3 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] @@ -1308,7 +1224,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB44_2: ; %atomicrmw.start +; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX90A-NEXT: buffer_wbl2 @@ -1320,8 +1236,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_2 -; GFX90A-NEXT: .LBB44_3: +; GFX90A-NEXT: s_cbranch_execnz .LBB41_2 +; GFX90A-NEXT: .LBB41_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: @@ -1331,7 +1247,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB44_2 +; GFX940-NEXT: s_cbranch_execz .LBB41_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1343,7 +1259,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: .LBB44_2: +; GFX940-NEXT: .LBB41_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst @@ -1358,7 +1274,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB45_2 +; GFX90A-NEXT: s_cbranch_execz .LBB42_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1369,7 +1285,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: .LBB45_2: +; GFX90A-NEXT: .LBB42_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: @@ -1379,7 +1295,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB45_2 +; GFX940-NEXT: s_cbranch_execz .LBB42_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1391,7 +1307,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB45_2: +; GFX940-NEXT: .LBB42_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -1423,7 +1339,7 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] @@ -1436,7 +1352,7 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 @@ -1488,7 +1404,7 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] @@ -1501,7 +1417,7 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 @@ -1568,7 +1484,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB52_3 +; GFX90A-NEXT: s_cbranch_execz .LBB49_3 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] @@ -1580,7 +1496,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB52_2: ; %atomicrmw.start +; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc @@ -1590,8 +1506,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_2 -; GFX90A-NEXT: .LBB52_3: +; GFX90A-NEXT: s_cbranch_execnz .LBB49_2 +; GFX90A-NEXT: .LBB49_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: @@ -1601,7 +1517,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB52_2 +; GFX940-NEXT: s_cbranch_execz .LBB49_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] @@ -1613,7 +1529,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: .LBB52_2: +; GFX940-NEXT: .LBB49_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -1629,7 +1545,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 @@ -1642,7 +1558,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; @@ -1700,7 +1616,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 @@ -1714,7 +1630,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; @@ -1740,7 +1656,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] @@ -1753,7 +1669,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 @@ -1805,7 +1721,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] @@ -1819,7 +1735,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 { ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 @@ -1896,7 +1812,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 @@ -1907,7 +1823,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; @@ -2123,7 +2039,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB70_2 +; GFX90A-NEXT: s_cbranch_execz .LBB67_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] @@ -2133,7 +2049,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB70_2: +; GFX90A-NEXT: .LBB67_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: @@ -2143,7 +2059,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB70_2 +; GFX940-NEXT: s_cbranch_execz .LBB67_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] @@ -2153,7 +2069,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB70_2: +; GFX940-NEXT: .LBB67_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst @@ -2168,7 +2084,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB71_2 +; GFX90A-NEXT: s_cbranch_execz .LBB68_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] @@ -2178,7 +2094,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB71_2: +; GFX90A-NEXT: .LBB68_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: @@ -2188,7 +2104,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB71_2 +; GFX940-NEXT: s_cbranch_execz .LBB68_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] @@ -2198,7 +2114,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3 ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB71_2: +; GFX940-NEXT: .LBB68_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst @@ -2213,7 +2129,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB72_2 +; GFX90A-NEXT: s_cbranch_execz .LBB69_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] @@ -2223,7 +2139,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: .LBB72_2: +; GFX90A-NEXT: .LBB69_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: @@ -2233,7 +2149,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX940-NEXT: s_cbranch_execz .LBB72_2 +; GFX940-NEXT: s_cbranch_execz .LBB69_2 ; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] @@ -2243,7 +2159,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: .LBB72_2: +; GFX940-NEXT: .LBB69_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll deleted file mode 100644 index 2b0584d39a3be..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll +++ /dev/null @@ -1,82 +0,0 @@ -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,PREGFX12 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,PREGFX12 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX12PLUS - -declare i32 @llvm.amdgcn.buffer.atomic.csub(i32, <4 x i32>, i32, i32, i1) -declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32) - -; GCN-LABEL: {{^}}buffer_atomic_csub_rtn: -; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen glc -; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN -define amdgpu_ps void @buffer_atomic_csub_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { -main_body: - %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - ret void -} - -; GCN-LABEL: {{^}}buffer_atomic_csub_no_rtn: -; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen -; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen -define amdgpu_ps void @buffer_atomic_csub_no_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) #0 { -main_body: - %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - ret void -} - -; GCN-LABEL: {{^}}buffer_atomic_csub_off4_slc_rtn: -; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 glc slc -; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN -define amdgpu_ps void @buffer_atomic_csub_off4_slc_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { -main_body: - %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) - ret void -} - -; GCN-LABEL: {{^}}buffer_atomic_csub_off4_slc_no_rtn: -; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 slc -; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT -define amdgpu_ps void @buffer_atomic_csub_off4_slc_no_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) #0 { -main_body: - %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_csub_rtn: -; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc -; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN -define amdgpu_kernel void @global_atomic_csub_rtn(ptr addrspace(1) %ptr, i32 %data) { -main_body: - %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_csub_no_rtn: -; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} -; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1] -define amdgpu_kernel void @global_atomic_csub_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 { -main_body: - %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_csub_off4_rtn: -; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 glc -; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4 th:TH_ATOMIC_RETURN -define amdgpu_kernel void @global_atomic_csub_off4_rtn(ptr addrspace(1) %ptr, i32 %data) { -main_body: - %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1 - %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %p, i32 %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_csub_off4_no_rtn: -; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 -; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1] offset:4 -define amdgpu_kernel void @global_atomic_csub_off4_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 { -main_body: - %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1 - %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %p, i32 %data) - ret void -} - -attributes #0 = { "target-features"="+atomic-csub-no-rtn-insts" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll deleted file mode 100644 index 1e1bc2bbbd26a..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll +++ /dev/null @@ -1,93 +0,0 @@ -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A -; RUN: not --crash llc < %s -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 - -declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) -declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1) -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>) - -; GFX908: LLVM ERROR: Cannot select: {{.+}}: f32,ch = BUFFER_ATOMIC_FADD - -; GFX90A-LABEL: {{^}}buffer_atomic_add_f32: -; GFX90A: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen glc -define amdgpu_ps float @buffer_atomic_add_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { -main_body: - %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - ret float %ret -} - -; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_off4_slc: -; GFX90A: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen offset:4 glc slc -define amdgpu_ps float @buffer_atomic_add_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { -main_body: - %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) - ret float %ret -} - -; GFX90A-LABEL: {{^}}buffer_atomic_pk_add_v2f16: -; GFX90A: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen glc -define amdgpu_ps <2 x half> @buffer_atomic_pk_add_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) { -main_body: - %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - ret <2 x half> %ret -} - -; GFX90A-LABEL: {{^}}buffer_atomic_pk_add_v2f16_off4_slc: -; GFX90A: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen offset:4 glc slc -define amdgpu_ps <2 x half> @buffer_atomic_pk_add_v2f16_off4_slc(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) { -main_body: - %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) - ret <2 x half> %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_add_f32: -; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc -define amdgpu_ps float @global_atomic_add_f32(ptr addrspace(1) %ptr, float %data) { -main_body: - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret float %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_add_f32_off4: -; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off offset:4 glc -define amdgpu_ps float @global_atomic_add_f32_off4(ptr addrspace(1) %ptr, float %data) { -main_body: - %p = getelementptr float, ptr addrspace(1) %ptr, i64 1 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data) - ret float %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_add_f32_offneg4: -; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off offset:-4 glc -define amdgpu_ps float @global_atomic_add_f32_offneg4(ptr addrspace(1) %ptr, float %data) { -main_body: - %p = getelementptr float, ptr addrspace(1) %ptr, i64 -1 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data) - ret float %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16: -; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc -define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret <2 x half> %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16_off4: -; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:4 glc -define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_off4(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 1 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data) - ret <2 x half> %ret -} - -; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4: -; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-4 glc -define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_offneg4(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -1 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data) - ret <2 x half> %ret -} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll deleted file mode 100644 index bd07dd137ac49..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll +++ /dev/null @@ -1,111 +0,0 @@ -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP | FileCheck %s -check-prefix=GCN -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP | FileCheck %s -check-prefix=GCN - -declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) -declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1) -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float) -declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>) -declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr, float) - -; GCN-LABEL: {{^}}buffer_atomic_add_f32: -; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen -define amdgpu_ps void @buffer_atomic_add_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { -main_body: - %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - ret void -} - -; GCN-LABEL: {{^}}buffer_atomic_add_f32_off4_slc: -; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen offset:4 slc -define amdgpu_ps void @buffer_atomic_add_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { -main_body: - %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) - ret void -} - -; GCN-LABEL: {{^}}buffer_atomic_pk_add_v2f16: -; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen -define amdgpu_ps void @buffer_atomic_pk_add_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) { -main_body: - %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - ret void -} - -; GCN-LABEL: {{^}}buffer_atomic_pk_add_v2f16_off4_slc: -; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen offset:4 slc -define amdgpu_ps void @buffer_atomic_pk_add_v2f16_off4_slc(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) { -main_body: - %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_add_f32: -; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} -define amdgpu_kernel void @global_atomic_add_f32(ptr addrspace(1) %ptr, float %data) { -main_body: - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_add_f32_off4: -; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 -define amdgpu_kernel void @global_atomic_add_f32_off4(ptr addrspace(1) %ptr, float %data) { -main_body: - %p = getelementptr float, ptr addrspace(1) %ptr, i64 1 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_add_f32_offneg4: -; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4 -define amdgpu_kernel void @global_atomic_add_f32_offneg4(ptr addrspace(1) %ptr, float %data) { -main_body: - %p = getelementptr float, ptr addrspace(1) %ptr, i64 -1 - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16: -; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_off4: -; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 -define amdgpu_kernel void @global_atomic_pk_add_v2f16_off4(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 1 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data) - ret void -} - -; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4: -; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4{{$}} -define amdgpu_kernel void @global_atomic_pk_add_v2f16_offneg4(ptr addrspace(1) %ptr, <2 x half> %data) { -main_body: - %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -1 - %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data) - ret void -} - -; Make sure this artificially selects with an incorrect subtarget, but -; the feature set. -; GCN-LABEL: {{^}}global_atomic_fadd_f32_wrong_subtarget: -; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}} -define amdgpu_kernel void @global_atomic_fadd_f32_wrong_subtarget(ptr addrspace(1) %ptr, float %data) #0 { - %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data) - ret void -} - -; GCN-LABEL: {{^}}flat_atomic_fadd_f32_wrong_subtarget: -; GCN: flat_atomic_add_f32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} -define amdgpu_kernel void @flat_atomic_fadd_f32_wrong_subtarget(ptr %ptr, float %data) #1 { - %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) - ret void -} - -attributes #0 = { "target-cpu"="gfx803" "target-features"="+atomic-fadd-no-rtn-insts"} -attributes #1 = { "target-cpu"="gfx803" "target-features"="+flat-atomic-fadd-f32-inst"} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll deleted file mode 100644 index eed648f167f39..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ /dev/null @@ -1,209 +0,0 @@ -;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI -;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI - -;CHECK-LABEL: {{^}}test1: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc -;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_swap v0, v[1:2], s[0:3], 0 idxen offen glc -;SICI: v_mov_b32_e32 v1, 0x2000 -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc -;CHECK-DAG: s_waitcnt vmcnt(0) -;SICI: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc -;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:4 glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}} -define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) { -main_body: - %o1 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %o2 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %o3 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0) - %o4 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0) - %ofs.5 = add i32 %voffset, 42 - %o5 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0) - %o6 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o5, <4 x i32> %rsrc, i32 0, i32 8192, i1 0) - %unused = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %out = bitcast i32 %o6 to float - ret float %out -} - -;CHECK-LABEL: {{^}}test11: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_atomic_swap_x2 v[3:4], off, s[0:3], 0 glc -;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_swap_x2 v[3:4], v1, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_swap_x2 v[3:4], v2, s[0:3], 0 offen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_swap_x2 v[3:4], v[1:2], s[0:3], 0 idxen offen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_swap_x2 v[3:4], v2, s[0:3], 0 offen offset:42 glc -;CHECK-DAG: s_waitcnt vmcnt(0) -;SICI: buffer_atomic_swap_x2 v[3:4], v0, s[0:3], 0 offen glc -;VI: buffer_atomic_swap_x2 v[3:4], off, s[0:3], [[SOFS]] offset:4 glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_swap_x2 v[3:4], off, s[0:3], 0{{$}} -define amdgpu_ps float @test11(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) { -main_body: - %o0 = sext i32 %data to i64 - %o1 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o0, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %o2 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %o3 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0) - %o4 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0) - %ofs.5 = add i32 %voffset, 42 - %o5 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0) - %o6 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o5, <4 x i32> %rsrc, i32 0, i32 8192, i1 0) - %unused = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o6, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %o7 = trunc i64 %o6 to i32 - %out = bitcast i32 %o7 to float - ret float %out -} - -;CHECK-LABEL: {{^}}test2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_smin v0, v1, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_umin v0, v1, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_smax v0, v1, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_umax v0, v1, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_and v0, v1, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc -define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { -main_body: - %t1 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t2 = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t3 = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t4 = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t5 = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t6 = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t7 = call i32 @llvm.amdgcn.buffer.atomic.and.i32(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t8 = call i32 @llvm.amdgcn.buffer.atomic.or.i32(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t9 = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %out = bitcast i32 %t9 to float - ret float %out -} - -;CHECK-LABEL: {{^}}test3: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_atomic_add_x2 v[0:1], v2, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_sub_x2 v[0:1], v2, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_smin_x2 v[0:1], v2, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_umin_x2 v[0:1], v2, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_smax_x2 v[0:1], v2, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_umax_x2 v[0:1], v2, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_and_x2 v[0:1], v2, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_or_x2 v[0:1], v2, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_xor_x2 v[0:1], v2, s[0:3], 0 idxen glc -define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { -main_body: - %t0 = sext i32 %data to i64 - %t1 = call i64 @llvm.amdgcn.buffer.atomic.add.i64(i64 %t0, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t2 = call i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t3 = call i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t4 = call i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t5 = call i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t6 = call i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t7 = call i64 @llvm.amdgcn.buffer.atomic.and.i64(i64 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t8 = call i64 @llvm.amdgcn.buffer.atomic.or.i64(i64 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t9 = call i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %t10 = trunc i64 %t9 to i32 - %out = bitcast i32 %t10 to float - ret float %out -} - -; Ideally, we would teach tablegen & friends that cmpswap only modifies the -; first vgpr. Since we don't do that yet, the register allocator will have to -; create copies which we don't bother to track here. -; -;CHECK-LABEL: {{^}}test4: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc -;CHECK: s_waitcnt vmcnt(0) -;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc -;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v[2:3], s[0:3], 0 idxen offen glc -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:44 glc -;CHECK-DAG: s_waitcnt vmcnt(0) -;SICI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen glc -;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:4 glc -define amdgpu_ps float @test4(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) { -main_body: - %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %o2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) - %o3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0) - %o4 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0) - %ofs.5 = add i32 %voffset, 44 - %o5 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0) - %o6 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 8192, i1 0) - -; Detecting the no-return variant doesn't work right now because of how the -; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG. -; Since there probably isn't a reasonable use-case of cmpswap that discards -; the return value, that seems okay. -; -; %unused = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) - %out = bitcast i32 %o6 to float - ret float %out -} - -;CHECK-LABEL: {{^}}test7: -;CHECK: buffer_atomic_add v0, -define amdgpu_ps float @test7() { -main_body: - %v = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 1, <4 x i32> undef, i32 0, i32 4, i1 false) - %v.float = bitcast i32 %v to float - ret float %v.float -} - -declare i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i1) #0 -declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.add.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.and.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.or.i64(i64, <4 x i32>, i32, i32, i1) #0 -declare i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64, <4 x i32>, i32, i32, i1) #0 - -attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll index fdbe6db8da14e..659842afad485 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll @@ -1,26 +1,6 @@ ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx600 -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,SI ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,GCNX3 -;CHECK-LABEL: {{^}}buffer_load_format_immoffs_x3: -;SI: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42 -;GCNX3: buffer_load_format_xyz v[0:2], off, s[0:3], 0 offset:42 -;CHECK: s_waitcnt -define amdgpu_ps <3 x float> @buffer_load_format_immoffs_x3(<4 x i32> inreg) { -main_body: - %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0) - ret <3 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_immoffs_x3: -;SI: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 -;GCNX3: buffer_load_dwordx3 v[0:2], off, s[0:3], 0 offset:40 -;CHECK: s_waitcnt -define amdgpu_ps <3 x float> @buffer_load_immoffs_x3(<4 x i32> inreg) { -main_body: - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0) - ret <3 x float> %data -} - ;CHECK-LABEL: {{^}}buffer_raw_load_immoffs_x3: ;SI: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 ;GCNX3: buffer_load_dwordx3 v[0:2], off, s[0:3], 0 offset:40 @@ -81,8 +61,6 @@ main_body: ret <3 x float> %data } -declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #0 -declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #0 declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32) #0 declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32>, i32, i32, i32) #0 declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32, i32) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll deleted file mode 100644 index 3d67dfdd674d4..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s - -; GCN-LABEL: {{^}}buffer_load_format_d16_x: -; GCN: buffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 -define amdgpu_ps half @buffer_load_format_d16_x(<4 x i32> inreg %rsrc) { -main_body: - %data = call half @llvm.amdgcn.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) - ret half %data -} - -; GCN-LABEL: {{^}}buffer_load_format_d16_xy: -; UNPACKED: buffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 -; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] -define amdgpu_ps half @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) { -main_body: - %data = call <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) - %elt = extractelement <2 x half> %data, i32 1 - ret half %elt -} - -; GCN-LABEL: {{^}}buffer_load_format_d16_xyz: -; UNPACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -define amdgpu_ps half @buffer_load_format_d16_xyz(<4 x i32> inreg %rsrc) { -main_body: - %data = call <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) - %elt = extractelement <3 x half> %data, i32 2 - ret half %elt -} - -; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: -; UNPACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] -define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) { -main_body: - %data = call <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) - %elt = extractelement <4 x half> %data, i32 3 - ret half %elt -} - -declare half @llvm.amdgcn.buffer.load.format.f16(<4 x i32>, i32, i32, i1, i1) -declare <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32>, i32, i32, i1, i1) -declare <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(<4 x i32>, i32, i32, i1, i1) -declare <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32>, i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll deleted file mode 100644 index 6851302fdcda3..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll +++ /dev/null @@ -1,133 +0,0 @@ -;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI -;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI - -;CHECK-LABEL: {{^}}buffer_load: -;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 -;CHECK: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc -;CHECK: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc -;CHECK: s_waitcnt -define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { -main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0) - %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0) - %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1) - %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 - %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 - %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 - ret {<4 x float>, <4 x float>, <4 x float>} %r2 -} - -;CHECK-LABEL: {{^}}buffer_load_immoffs: -;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42 -;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { -main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0) - ret <4 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_immoffs_large: -;SICI: v_mov_b32_e32 [[VOFS:v[0-9]+]], 0x1038 -;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[VOFS]], s[0:3], 0 offen -;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen -;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 60 offset:4092 -;VI-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc -;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4092 -;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen -;VI-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc -;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:4 -;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { -main_body: - %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4152, i1 0, i1 0) - %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36856, i1 0, i1 0) - %d.2 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36864, i1 0, i1 0) - %d.3 = fadd <4 x float> %d.0, %d.1 - %data = fadd <4 x float> %d.2, %d.3 - ret <4 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_immoffs_reuse: -;VI: s_movk_i32 [[OFS:s[0-9]+]], 0xffc -;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:68 -;VI-NOT: s_mov -;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:84 -;VI: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) { -main_body: - %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0) - %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4176, i1 0, i1 0) - %data = fadd <4 x float> %d.0, %d.1 - ret <4 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_idx: -;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen -;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { -main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0) - ret <4 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_ofs: -;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen -;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { -main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0) - ret <4 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_ofs_imm: -;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60 -;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { -main_body: - %ofs = add i32 %1, 60 - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0) - ret <4 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_both: -;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { -main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0) - ret <4 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_both_reversed: -;CHECK: v_mov_b32_e32 v2, v0 -;CHECK: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen -;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { -main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0) - ret <4 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_x: -;CHECK: buffer_load_format_x v0, off, s[0:3], 0 -;CHECK: s_waitcnt -define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) { -main_body: - %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) - ret float %data -} - -;CHECK-LABEL: {{^}}buffer_load_xy: -;CHECK: buffer_load_format_xy v[0:1], off, s[0:3], 0 -;CHECK: s_waitcnt -define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { -main_body: - %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) - ret <2 x float> %data -} - -declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #0 -declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #0 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #0 - -attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll deleted file mode 100644 index a209dcfe3a7a0..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ /dev/null @@ -1,476 +0,0 @@ -;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI -;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI - -;CHECK-LABEL: {{^}}buffer_load: -;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc -;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc -;CHECK: s_waitcnt -define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { -main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0) - %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0) - %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1) - %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 - %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 - %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 - ret {<4 x float>, <4 x float>, <4 x float>} %r2 -} - -;CHECK-LABEL: {{^}}buffer_load_immoffs: -;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 -;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { -main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0) - ret <4 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_immoffs_large: -;SICI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 offen -;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc -;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4 -;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { -main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 0, i1 0) - ret <4 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_idx: -;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen -;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { -main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0) - ret <4 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_ofs: -;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen -;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { -main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0) - ret <4 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_ofs_imm: -;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60 -;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { -main_body: - %ofs = add i32 %1, 60 - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0) - ret <4 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_both: -;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { -main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0) - ret <4 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_both_reversed: -;CHECK: v_mov_b32_e32 v2, v0 -;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen -;CHECK: s_waitcnt -define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { -main_body: - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0) - ret <4 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_x1: -;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt -define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { -main_body: - %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0) - ret float %data -} - -;CHECK-LABEL: {{^}}buffer_load_x2: -;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt -define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { -main_body: - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0) - ret <2 x float> %data -} - -;CHECK-LABEL: {{^}}buffer_load_negative_offset: -;CHECK: v_add_{{[iu]}}32_e32 [[VOFS:v[0-9]+]], vcc, -16, v0 -;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen -define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) { -main_body: - %ofs.1 = add i32 %ofs, -16 - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i1 0, i1 0) - ret <4 x float> %data -} - -; SI won't merge ds memory operations, because of the signed offset bug, so -; we only have check lines for VI. -; CHECK-LABEL: buffer_load_mmo: -; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 -define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, ptr addrspace(3) %lds) { -entry: - store float 0.0, ptr addrspace(3) %lds - %val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) - %tmp2 = getelementptr float, ptr addrspace(3) %lds, i32 4 - store float 0.0, ptr addrspace(3) %tmp2 - ret float %val -} - -;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 -;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { -main_body: - %a1 = add i32 %a, 4 - %a2 = add i32 %a, 8 - %a3 = add i32 %a, 12 - %a4 = add i32 %a, 16 - %a5 = add i32 %a, 28 - %a6 = add i32 %a, 32 - %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) - %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0) - %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a4, i1 0, i1 0) - %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a5, i1 0, i1 0) - %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a6, i1 0, i1 0) - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) - ret void -} - -;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_glc_slc: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} -;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} -;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} -;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) { -main_body: - %a1 = add i32 %a, 4 - %a2 = add i32 %a, 8 - %a3 = add i32 %a, 12 - %a4 = add i32 %a, 16 - %a5 = add i32 %a, 28 - %a6 = add i32 %a, 32 - %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) - %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a3, i1 1, i1 0) - %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a4, i1 1, i1 0) - %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a5, i1 1, i1 1) - %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a6, i1 1, i1 1) - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) - ret void -} - -;CHECK-LABEL: {{^}}buffer_load_x2_offen_merged: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { -main_body: - %a1 = add i32 %a, 4 - %a2 = add i32 %a, 12 - %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) - %r1 = extractelement <2 x float> %vr1, i32 0 - %r2 = extractelement <2 x float> %vr1, i32 1 - %r3 = extractelement <2 x float> %vr2, i32 0 - %r4 = extractelement <2 x float> %vr2, i32 1 - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) - ret void -} - -;CHECK-LABEL: {{^}}buffer_load_x3_offen_merged: -;CHECK-NEXT: %bb. -;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { -main_body: - %a1 = add i32 %a, 4 - %a2 = add i32 %a, 12 - %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) - %r1 = extractelement <2 x float> %vr1, i32 0 - %r2 = extractelement <2 x float> %vr1, i32 1 - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true) - ret void -} - -;CHECK-LABEL: {{^}}buffer_load_x1_offset_merged: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 -;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { -main_body: - %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) - %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) - %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) - %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 28, i1 0, i1 0) - %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 32, i1 0, i1 0) - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) - ret void -} - -;CHECK-LABEL: {{^}}buffer_load_x2_offset_merged: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) { -main_body: - %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) - %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) - %r1 = extractelement <2 x float> %vr1, i32 0 - %r2 = extractelement <2 x float> %vr1, i32 1 - %r3 = extractelement <2 x float> %vr2, i32 0 - %r4 = extractelement <2 x float> %vr2, i32 1 - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) - ret void -} - -;CHECK-LABEL: {{^}}buffer_load_x3_offset_merged: -;CHECK-NEXT: %bb. -;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK: s_waitcnt -define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) { -main_body: - %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) - %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) - %r1 = extractelement <2 x float> %vr1, i32 0 - %r2 = extractelement <2 x float> %vr1, i32 1 - call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true) - ret void -} - -;CHECK-LABEL: {{^}}buffer_load_ubyte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8 -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ubyte(<4 x i32> inreg %rsrc) { -main_body: - %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - %val = uitofp i8 %tmp to float - ret float %val -} - -;CHECK-LABEL: {{^}}buffer_load_ushort: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:16 -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 -;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ushort(<4 x i32> inreg %rsrc) { -main_body: - %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) - %tmp2 = zext i16 %tmp to i32 - %val = uitofp i32 %tmp2 to float - ret float %val -} - -;CHECK-LABEL: {{^}}buffer_load_sbyte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8 -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 -;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sbyte(<4 x i32> inreg %rsrc) { -main_body: - %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - %tmp2 = sext i8 %tmp to i32 - %val = sitofp i32 %tmp2 to float - ret float %val -} - -;CHECK-LABEL: {{^}}buffer_load_sshort: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:16 -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 -;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sshort(<4 x i32> inreg %rsrc) { -main_body: - %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) - %tmp2 = sext i16 %tmp to i32 - %val = sitofp i32 %tmp2 to float - ret float %val -} - -;CHECK-LABEL: {{^}}buffer_load_ubyte_bitcast: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8 -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ubyte_bitcast(<4 x i32> inreg %rsrc) { -main_body: - %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - %tmp2 = zext i8 %tmp to i32 - %val = bitcast i32 %tmp2 to float - ret float %val -} - -;CHECK-LABEL: {{^}}buffer_load_ushort_bitcast: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:8 -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ushort_bitcast(<4 x i32> inreg %rsrc) { -main_body: - %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - %tmp2 = zext i16 %tmp to i32 - %val = bitcast i32 %tmp2 to float - ret float %val -} - -;CHECK-LABEL: {{^}}buffer_load_sbyte_bitcast: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8 -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sbyte_bitcast(<4 x i32> inreg %rsrc) { -main_body: - %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - %tmp2 = sext i8 %tmp to i32 - %val = bitcast i32 %tmp2 to float - ret float %val -} - -;CHECK-LABEL: {{^}}buffer_load_sshort_bitcast: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:8 -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sshort_bitcast(<4 x i32> inreg %rsrc) { -main_body: - %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - %tmp2 = sext i16 %tmp to i32 - %val = bitcast i32 %tmp2 to float - ret float %val -} - -;CHECK-LABEL: {{^}}buffer_load_ubyte_mul_bitcast: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, v0, s[0:3], 0 idxen offset:8 -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} -;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ubyte_mul_bitcast(<4 x i32> inreg %rsrc, i32 %idx) { -main_body: - %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 8, i1 0, i1 0) - %tmp2 = zext i8 %tmp to i32 - %tmp3 = mul i32 %tmp2, 255 - %val = bitcast i32 %tmp3 to float - ret float %val -} - -;CHECK-LABEL: {{^}}buffer_load_ushort_mul_bitcast: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, v0, s[0:3], 0 idxen offset:8 -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} -;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_ushort_mul_bitcast(<4 x i32> inreg %rsrc, i32 %idx) { -main_body: - %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 8, i1 0, i1 0) - %tmp2 = zext i16 %tmp to i32 - %tmp3 = mul i32 %tmp2, 255 - %val = bitcast i32 %tmp3 to float - ret float %val -} - -;CHECK-LABEL: {{^}}buffer_load_sbyte_mul_bitcast: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, v0, s[0:3], 0 idxen offset:8 -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} -;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sbyte_mul_bitcast(<4 x i32> inreg %rsrc, i32 %idx) { -main_body: - %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 8, i1 0, i1 0) - %tmp2 = sext i8 %tmp to i32 - %tmp3 = mul i32 %tmp2, 255 - %val = bitcast i32 %tmp3 to float - ret float %val -} - -;CHECK-LABEL: {{^}}buffer_load_sshort_mul_bitcast: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, v0, s[0:3], 0 idxen offset:8 -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}} -;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sshort_mul_bitcast(<4 x i32> inreg %rsrc, i32 %idx) { -main_body: - %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 8, i1 0, i1 0) - %tmp2 = sext i16 %tmp to i32 - %tmp3 = mul i32 %tmp2, 255 - %val = bitcast i32 %tmp3 to float - ret float %val -} - -;CHECK-LABEL: {{^}}buffer_load_sbyte_type_check: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8 -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_bfe_i32 v{{[0-9]}}, v{{[0-9]}}, 0, 5 -;CHECK-NEXT: ; return to shader part epilog -define amdgpu_ps float @buffer_load_sbyte_type_check(<4 x i32> inreg %rsrc) { -main_body: - %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - %tmp2 = zext i8 %tmp to i32 - %tmp3 = shl i32 %tmp2, 27 - %tmp4 = ashr i32 %tmp3, 27 - %val = bitcast i32 %tmp4 to float - ret float %val -} - -; Make sure a frame index folding doesn't crash on a MUBUF not used -; for stack access. - -; CHECK-LABEL: {{^}}no_fold_fi_imm_soffset: -; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}} -; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen -define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) { - %alloca = alloca i32, addrspace(5) - %alloca.cast = ptrtoint ptr addrspace(5) %alloca to i32 - - %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 0, i1 false, i1 false) - ret float %ret.val -} - -; CHECK-LABEL: {{^}}no_fold_fi_reg_soffset: -; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s -; CHECK: buffer_load_dword v0, v[[[FI]]:[[HI]] -define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) { - %alloca = alloca i32, addrspace(5) - %alloca.cast = ptrtoint ptr addrspace(5) %alloca to i32 - - %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 %soffset, i1 false, i1 false) - ret float %ret.val -} - -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0 -declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0 -declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0 -declare i8 @llvm.amdgcn.buffer.load.i8(<4 x i32>, i32, i32, i1, i1) #0 -declare i16 @llvm.amdgcn.buffer.load.i16(<4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 - -attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll index 269956f948f50..7723b565d962b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll @@ -1,23 +1,5 @@ ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -;CHECK-LABEL: {{^}}buffer_store_format_immoffs_x3: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyz v[0:2], off, s[0:3], 0 offset:42 -define amdgpu_ps void @buffer_store_format_immoffs_x3(<4 x i32> inreg, <3 x float>) { -main_body: - call void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_immoffs_x3: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 offset:42 -define amdgpu_ps void @buffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { -main_body: - call void @llvm.amdgcn.buffer.store.v3f32(<3 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) - ret void -} - ;CHECK-LABEL: {{^}}raw_buffer_store_format_immoffs_x3: ;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyz v[0:2], off, s[0:3], 0 offset:42 @@ -72,8 +54,6 @@ main_body: ret void } -declare void @llvm.amdgcn.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i1, i1) #0 declare void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i32, i32) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll deleted file mode 100644 index a8cabdc9b0304..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s - -; GCN-LABEL: {{^}}buffer_store_format_d16_x: -; GCN: s_load_dword s[[LO:[0-9]+]] -; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] -; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) { -main_body: - call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) - ret void -} - -; GCN-LABEL: {{^}}buffer_store_format_d16_xy: - -; UNPACKED: s_load_dwordx2 s[[[S_DATA:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], s[[S_DATA]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], s[[S_DATA]], 0xffff{{$}} -; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] -; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] -; UNPACKED: buffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen - -; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) { -main_body: - call void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) - ret void -} - -define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <3 x half> %data, i32 %index) { -main_body: - call void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) - ret void -} - -; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: -; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 - -; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} -; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} - -; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] -; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] - -; UNPACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen - -; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] -; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] - -; PACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) { -main_body: - call void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) - ret void -} - -declare void @llvm.amdgcn.buffer.store.format.f16(half, <4 x i32>, i32, i32, i1, i1) -declare void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i1, i1) -declare void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half>, <4 x i32>, i32, i32, i1, i1) -declare void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll deleted file mode 100644 index 41e2b4d0e5512..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll +++ /dev/null @@ -1,104 +0,0 @@ -;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s -;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}buffer_store: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 -;CHECK: buffer_store_format_xyzw v[4:7], off, s[0:3], 0 glc -;CHECK: buffer_store_format_xyzw v[8:11], off, s[0:3], 0 slc -define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { -main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0) - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_immoffs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:42 -define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { -main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_idx: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { -main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_ofs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { -main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_both: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { -main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_both_reversed: -;CHECK: v_mov_b32_e32 v6, v4 -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { -main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0) - ret void -} - -; Ideally, the register allocator would avoid the wait here -; -;CHECK-LABEL: {{^}}buffer_store_wait: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen -;VERDE: s_waitcnt expcnt(0) -;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { -main_body: - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_x1: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { -main_body: - call void @llvm.amdgcn.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_x2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) { -main_body: - call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) - ret void -} - -declare void @llvm.amdgcn.buffer.store.format.f32(float, <4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 - -attributes #0 = { nounwind } -attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll deleted file mode 100644 index 8b18848a62792..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll +++ /dev/null @@ -1,268 +0,0 @@ -;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s -;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}buffer_store: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc -;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc -define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { -main_body: - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0) - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_immoffs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 -define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { -main_body: - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_idx: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { -main_body: - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_ofs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen -define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { -main_body: - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_both: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { -main_body: - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_both_reversed: -;CHECK: v_mov_b32_e32 v6, v4 -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen -define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { -main_body: - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0) - ret void -} - -; Ideally, the register allocator would avoid the wait here -; -;CHECK-LABEL: {{^}}buffer_store_wait: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen -;VERDE: s_waitcnt expcnt(0) -;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { -main_body: - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_x1: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { -main_body: - call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_x2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen -define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 { -main_body: - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 -define amdgpu_ps void @buffer_store_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { - %a1 = add i32 %a, 4 - %a2 = add i32 %a, 8 - %a3 = add i32 %a, 12 - %a4 = add i32 %a, 16 - %a5 = add i32 %a, 28 - %a6 = add i32 %a, 32 - call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 %a4, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 %a5, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 %a6, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_glc_slc: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} -define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { - %a1 = add i32 %a, 4 - %a2 = add i32 %a, 8 - %a3 = add i32 %a, 12 - %a4 = add i32 %a, 16 - %a5 = add i32 %a, 28 - %a6 = add i32 %a, 32 - call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 1, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 %a4, i1 1, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 %a5, i1 1, i1 1) - call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 %a6, i1 1, i1 1) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -define amdgpu_ps void @buffer_store_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) { - %a1 = add i32 %a, 4 - %a2 = add i32 %a, 12 - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 -define amdgpu_ps void @buffer_store_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3) { - %a1 = add i32 %a, 28 - %a2 = add i32 %a, 32 - %a3 = add i32 %a, 36 - call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -define amdgpu_ps void @buffer_store_x3_offen_merged2(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, float %v2) { - %a1 = add i32 %a, 4 - %a2 = add i32 %a, 12 - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged3: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -define amdgpu_ps void @buffer_store_x3_offen_merged3(<4 x i32> inreg %rsrc, i32 %a, float %v1, <2 x float> %v2) { - %a1 = add i32 %a, 4 - %a2 = add i32 %a, 8 - call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 -define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { - call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 28, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 32, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1, <2 x float> %v2) { - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -define amdgpu_ps void @buffer_store_x3_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3) { - call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged2: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -define amdgpu_ps void @buffer_store_x3_offset_merged2(<4 x i32> inreg %rsrc, float %v1, <2 x float> %v2) { - call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged3: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:8 -define amdgpu_ps void @buffer_store_x3_offset_merged3(<4 x i32> inreg %rsrc, <2 x float> %v1, float %v2) { - call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_byte: -;CHECK-NOT: s_waitcnt -;CHECK-NEXT: %bb. -;CHECK: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 offset:8 -define amdgpu_ps void @buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) { -main_body: - %v2 = fptoui float %v1 to i32 - %v3 = trunc i32 %v2 to i8 - call void @llvm.amdgcn.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0) - ret void -} - -;CHECK-LABEL: {{^}}buffer_store_short: -;CHECK-NOT: s_waitcnt -;CHECK-NEXT: %bb. -;CHECK: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 offset:16 -define amdgpu_ps void @buffer_store_short(<4 x i32> inreg %rsrc, float %v1) { -main_body: - %v2 = fptoui float %v1 to i32 - %v3 = trunc i32 %v2 to i16 - call void @llvm.amdgcn.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0) - ret void -} - -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.i8(i8, <4 x i32>, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.buffer.store.i16(i16, <4 x i32>, i32, i32, i1, i1) #0 -declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1 - -attributes #0 = { nounwind } -attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll deleted file mode 100644 index bd1888b6965a1..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s - -declare void @llvm.amdgcn.buffer.wbinvl1() #0 - -; GCN-LABEL: {{^}}test_buffer_wbinvl1: -; GCN-NEXT: ; %bb.0: -; SI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xc4,0xe1,0x00,0x00,0x00,0x00] -; VI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xf8,0xe0,0x00,0x00,0x00,0x00] -; GCN-NEXT: s_endpgm -define amdgpu_kernel void @test_buffer_wbinvl1() #0 { - call void @llvm.amdgcn.buffer.wbinvl1() - ret void -} - -attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll deleted file mode 100644 index b937c42e14ed5..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc -mtriple=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck -check-prefix=SI %s - -declare void @llvm.amdgcn.buffer.wbinvl1.sc() #0 - -; SI-LABEL: {{^}}test_buffer_wbinvl1_sc: -; SI-NEXT: ; %bb.0: -; SI-NEXT: buffer_wbinvl1_sc ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00] -; SI-NEXT: s_endpgm -define amdgpu_kernel void @test_buffer_wbinvl1_sc() #0 { - call void @llvm.amdgcn.buffer.wbinvl1.sc() - ret void -} - -attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll deleted file mode 100644 index 64ab8ecefd490..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc -mtriple=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s - -declare void @llvm.amdgcn.buffer.wbinvl1.vol() #0 - -; GCN-LABEL: {{^}}test_buffer_wbinvl1_vol: -; GCN-NEXT: ; %bb.0: -; CI: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00] -; VI: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xfc,0xe0,0x00,0x00,0x00,0x00] -; GCN: _store_byte -; GCN-NEXT: s_endpgm -define amdgpu_kernel void @test_buffer_wbinvl1_vol(ptr addrspace(1) %ptr) #0 { - call void @llvm.amdgcn.buffer.wbinvl1.vol() -; This used to crash in hazard recognizer - store i8 0, ptr addrspace(1) %ptr, align 1 - ret void -} - -attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll index 98ed437581e1a..47b7658f50cc5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll @@ -313,7 +313,7 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i3 main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 63, i32 0) - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i32 0, i32 0) %data.i = bitcast <4 x float> %data to <4 x i32> call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 46, i32 0) ret void @@ -620,7 +620,7 @@ declare void @llvm.amdgcn.struct.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32 declare void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll deleted file mode 100644 index 06d66ce9db37d..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s -; RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s -; RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s - -; GCN-LABEL: {{^}}tbuffer_load_d16_x: -; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] -define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) { -main_body: - %data = call half @llvm.amdgcn.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) - ret half %data -} - -; GCN-LABEL: {{^}}tbuffer_load_d16_xy: -; UNPACKED: tbuffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] -; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] -define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) { -main_body: - %data = call <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) - %elt = extractelement <2 x half> %data, i32 1 - ret half %elt -} - -; GCN-LABEL: {{^}}tbuffer_load_d16_xyz: -; UNPACKED: tbuffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: tbuffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] -; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] -define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) { -main_body: - %data = call <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) - %elt = extractelement <3 x half> %data, i32 2 - ret half %elt -} - -; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: -; UNPACKED: tbuffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: tbuffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] -; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] -define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { -main_body: - %data = call <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) - %elt = extractelement <4 x half> %data, i32 3 - ret half %elt -} - -declare half @llvm.amdgcn.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll index de929c2893baf..c89c5c5599334 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll @@ -23,18 +23,5 @@ main_body: ret <3 x float> %vdata.f } - -; GCN-LABEL: {{^}}tbuffer_load_format_immoffs_x3: -; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42 -; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42 -define amdgpu_vs <3 x float> @tbuffer_load_format_immoffs_x3(<4 x i32> inreg) { -main_body: - %vdata = call <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0) - %vdata.f = bitcast <3 x i32> %vdata to <3 x float> - ret <3 x float> %vdata.f -} - declare <3 x i32> @llvm.amdgcn.raw.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32) declare <3 x i32> @llvm.amdgcn.struct.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32, i32) -declare <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) - diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll deleted file mode 100644 index eb0dc37849b0f..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll +++ /dev/null @@ -1,109 +0,0 @@ -;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s -;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s - -; GCN-LABEL: {{^}}tbuffer_load: -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] glc -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] slc -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] -; GCN: s_waitcnt -define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { -main_body: - %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) - %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 3, i1 1, i1 0) - %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 1) - %vdata_f32 = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0) - %vdata.f = bitcast <4 x i32> %vdata to <4 x float> - %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> - %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> - %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0 - %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1 - %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2 - %r3 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r2, <4 x float> %vdata_f32, 3 - ret {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r3 -} - -; GCN-LABEL: {{^}}tbuffer_load_immoffs: -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42 -define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { -main_body: - %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0) - %vdata.f = bitcast <4 x i32> %vdata to <4 x float> - ret <4 x float> %vdata.f -} - -; GCN-LABEL: {{^}}tbuffer_load_immoffs_large -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] offset:4095 -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] offset:73 -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] offset:1 -; GCN: s_waitcnt -define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { - %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 61, i32 4095, i32 15, i32 2, i1 0, i1 0) - %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 73, i32 14, i32 3, i1 0, i1 0) - %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 1, i32 13, i32 4, i1 0, i1 0) - %vdata.f = bitcast <4 x i32> %vdata to <4 x float> - %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> - %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> - %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0 - %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1 - %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2 - ret {<4 x float>, <4 x float>, <4 x float>} %r2 -} - -; GCN-LABEL: {{^}}tbuffer_load_idx: -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen -define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) { -main_body: - %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) - %vdata.f = bitcast <4 x i32> %vdata to <4 x float> - ret <4 x float> %vdata.f -} - -; GCN-LABEL: {{^}}tbuffer_load_ofs: -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen -define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { -main_body: - %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) - %vdata.f = bitcast <4 x i32> %vdata to <4 x float> - ret <4 x float> %vdata.f -} - -; GCN-LABEL: {{^}}tbuffer_load_ofs_imm: -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen offset:52 -define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) { -main_body: - %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 52, i32 14, i32 4, i1 0, i1 0) - %vdata.f = bitcast <4 x i32> %vdata to <4 x float> - ret <4 x float> %vdata.f -} - -; GCN-LABEL: {{^}}tbuffer_load_both: -; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offen -define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) { -main_body: - %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) - %vdata.f = bitcast <4 x i32> %vdata to <4 x float> - ret <4 x float> %vdata.f -} - - -; GCN-LABEL: {{^}}buffer_load_xy: -; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] -define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { - %vdata = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0) - %vdata.f = bitcast <2 x i32> %vdata to <2 x float> - ret <2 x float> %vdata.f -} - -; GCN-LABEL: {{^}}buffer_load_x: -; GCN: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] -define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { - %vdata = call i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0) - %vdata.f = bitcast i32 %vdata to float - ret float %vdata.f -} - -declare i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll deleted file mode 100644 index 16d7de698aa03..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,UNPACKED %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,PACKED %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,PACKED %s - - -; GCN-LABEL: {{^}}tbuffer_store_d16_x: -; GCN: s_load_dword s[[S_LO:[0-9]+]] -; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] -; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %vindex) { -main_body: - call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) - ret void -} - -; GCN-LABEL: {{^}}tbuffer_store_d16_xy: -; GCN: s_load_dwordx2 s[[[S_DATA:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], s[[S_DATA]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], s[[S_DATA]], 0xffff{{$}} -; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] -; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] -; UNPACKED: tbuffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen - -; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { -main_body: - call void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) - ret void -} - -; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: -; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 - -; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} -; UNPACKED-DAG: s_and_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} - -; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] -; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] -; UNPACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen - -; PACKED-DAG: s_and_b32 [[SHR0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} -; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] -; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR0]] -; PACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <3 x half> %data, i32 %vindex) { -main_body: - call void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) - ret void -} - -; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: -; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 - -; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} -; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} - -; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] -; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] -; UNPACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen - -; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] -; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] -; PACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { -main_body: - call void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) - ret void -} - -declare void @llvm.amdgcn.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) -declare void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll index 83b5b0c65c47e..d5cbaddff020b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll @@ -20,16 +20,6 @@ main_body: ret void } -; GCN-LABEL: {{^}}tbuffer_store_immoffs_x3: -; GCN: tbuffer_store_format_xyz v[0:2], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 -define amdgpu_ps void @tbuffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) { -main_body: - %in1 = bitcast <3 x float> %1 to <3 x i32> - call void @llvm.amdgcn.tbuffer.store.v3i32(<3 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0) - ret void -} - declare void @llvm.amdgcn.raw.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.struct.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll deleted file mode 100644 index b3a135a35a652..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll +++ /dev/null @@ -1,110 +0,0 @@ -;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GCN,VERDE %s -;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s - -; GCN-LABEL: {{^}}tbuffer_store: -; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16_16_16,BUF_NUM_FORMAT_USCALED] -; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_SSCALED] glc -; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] slc -; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] -define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { -main_body: - %in1 = bitcast <4 x float> %1 to <4 x i32> - %in2 = bitcast <4 x float> %2 to <4 x i32> - %in3 = bitcast <4 x float> %3 to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 2, i1 0, i1 0) - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 3, i1 1, i1 0) - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 1) - call void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) - ret void -} - -; GCN-LABEL: {{^}}tbuffer_store_immoffs: -; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 -define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { -main_body: - %in1 = bitcast <4 x float> %1 to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0) - ret void -} - -; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs: -; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], {{s[0-9]+}} format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 -define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) { -main_body: - %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 %soffset, i32 42, i32 5, i32 7, i1 0, i1 0) - ret void -} - -; GCN-LABEL: {{^}}buffer_store_idx: -; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] idxen -define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) { -main_body: - %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 15, i32 2, i1 0, i1 0) - ret void -} - -; GCN-LABEL: {{^}}buffer_store_ofs: -; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_8_8,BUF_NUM_FORMAT_FLOAT] offen -define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) { -main_body: - %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 %voffset, i32 0, i32 0, i32 3, i32 7, i1 0, i1 0) - ret void -} - -; GCN-LABEL: {{^}}buffer_store_both: -; GCN: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_UINT] idxen offen -define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) { -main_body: - %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 6, i32 4, i1 0, i1 0) - ret void -} - -; Ideally, the register allocator would avoid the wait here -; -; GCN-LABEL: {{^}}buffer_store_wait: -; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen -; VERDE: s_waitcnt expcnt(0) -; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen -; GCN: s_waitcnt vmcnt(0) -; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_USCALED] idxen -define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) { -main_body: - %in1 = bitcast <4 x float> %vdata to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 0, i32 15, i32 3, i1 0, i1 0) - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0) - %data.i = bitcast <4 x float> %data to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 0, i32 14, i32 2, i1 0, i1 0) - ret void -} - -; GCN-LABEL: {{^}}buffer_store_x1: -; GCN: tbuffer_store_format_x v0, v1, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen -define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { -main_body: - %data.i = bitcast float %data to i32 - call void @llvm.amdgcn.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 13, i32 7, i1 0, i1 0) - ret void -} - -; GCN-LABEL: {{^}}buffer_store_x2: -; GCN: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen -define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) { -main_body: - %data.i = bitcast <2 x float> %data to <2 x i32> - call void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) - ret void -} - -declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 -declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 - -attributes #0 = { nounwind } -attributes #1 = { nounwind readonly } - diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll index 4566865bc7c67..7e9d12241ee36 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll @@ -1,423 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck %s -; -------------------------------------------------------------------- -; llvm.amdgcn.buffer.load -; -------------------------------------------------------------------- - -define amdgpu_ps float @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @buffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - ret float %data -} - -define amdgpu_ps <1 x float> @buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @buffer_load_v1f32( -; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret <1 x float> [[DATA]] -; - %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - ret <1 x float> %data -} - -define amdgpu_ps <2 x float> @buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> [[DATA]] -; - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - ret <2 x float> %data -} - -define amdgpu_ps <4 x float> @buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret <4 x float> [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - ret <4 x float> %data -} - -define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt0 = extractelement <2 x float> %data, i32 0 - ret float %elt0 -} - -define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt1_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt1 = extractelement <2 x float> %data, i32 1 - ret float %elt1 -} - -define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt0 = extractelement <4 x float> %data, i32 0 - ret float %elt0 -} - -define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt1_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt1 = extractelement <4 x float> %data, i32 1 - ret float %elt1 -} - -define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt2_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt1 = extractelement <4 x float> %data, i32 2 - ret float %elt1 -} - -define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt3_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt1 = extractelement <4 x float> %data, i32 3 - ret float %elt1 -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> [[SHUF]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> [[SHUF]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret <3 x float> [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> - ret <3 x float> %shuf -} - -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> [[SHUF]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> - ret <3 x float> %shuf -} - -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> [[SHUF]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> - ret <3 x float> %shuf -} - -define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[DATA]], i64 0 -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 -; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float } undef, float [[ELT0]], 0 -; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float } [[INS0]], float [[ELT1]], 1 -; CHECK-NEXT: ret { float, float } [[INS1]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt0 = extractelement <4 x float> %data, i32 0 - %elt1 = extractelement <4 x float> %data, i32 1 - %ins0 = insertvalue { float, float } undef, float %elt0, 0 - %ins1 = insertvalue { float, float } %ins0, float %elt1, 1 - ret { float, float } %ins1 -} - -define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[ELT0:%.*]] = extractelement <3 x float> [[DATA]], i64 0 -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 1 -; CHECK-NEXT: [[ELT2:%.*]] = extractelement <3 x float> [[DATA]], i64 2 -; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float, float } undef, float [[ELT0]], 0 -; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float, float } [[INS0]], float [[ELT1]], 1 -; CHECK-NEXT: [[INS2:%.*]] = insertvalue { float, float, float } [[INS1]], float [[ELT2]], 2 -; CHECK-NEXT: ret { float, float, float } [[INS2]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt0 = extractelement <4 x float> %data, i32 0 - %elt1 = extractelement <4 x float> %data, i32 1 - %elt2 = extractelement <4 x float> %data, i32 2 - %ins0 = insertvalue { float, float, float } undef, float %elt0, 0 - %ins1 = insertvalue { float, float, float } %ins0, float %elt1, 1 - %ins2 = insertvalue { float, float, float } %ins1, float %elt2, 2 - ret { float, float, float } %ins2 -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] -; CHECK-NEXT: ret <2 x float> [[RET]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt0 = extractelement <4 x float> %data, i32 0 - %elt2 = extractelement <4 x float> %data, i32 2 - %ins0 = insertelement <2 x float> poison, float %elt0, i32 0 - %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1 - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> - %ret = fadd <2 x float> %ins1, %shuf - ret <2 x float> %ret -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] -; CHECK-NEXT: ret <2 x float> [[RET]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt0 = extractelement <4 x float> %data, i32 0 - %elt2 = extractelement <4 x float> %data, i32 2 - %ins0 = insertelement <2 x float> poison, float %elt0, i32 0 - %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1 - %shuf = shufflevector <4 x float> poison, <4 x float> %data, <2 x i32> - %ret = fadd <2 x float> %ins1, %shuf - ret <2 x float> %ret -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] -; CHECK-NEXT: ret <2 x float> [[RET]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt2 = extractelement <4 x float> %data, i32 2 - %ins0 = insertelement <2 x float> poison, float %elt2, i32 0 - %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1 - %shuf = shufflevector <4 x float> %data, <4 x float> %data, <2 x i32> - %ret = fadd <2 x float> %ins1, %shuf - ret <2 x float> %ret -} - -define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt0 = extractelement <3 x float> %data, i32 0 - ret float %elt0 -} - -define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt1_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt1 = extractelement <3 x float> %data, i32 1 - ret float %elt1 -} - -define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt2_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt1 = extractelement <3 x float> %data, i32 2 - ret float %elt1 -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> [[DATA]] -; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> [[SHUF]] -; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false), !fpmath [[META0:![0-9]+]] -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0 - %elt0 = extractelement <2 x float> %data, i32 0 - ret float %elt0 -} - -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1 - -; -------------------------------------------------------------------- -; llvm.amdgcn.buffer.load.format -; -------------------------------------------------------------------- - -define amdgpu_ps <1 x float> @buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @buffer_load_format_v1f32( -; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 true) -; CHECK-NEXT: ret <1 x float> [[DATA]] -; - %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true) - ret <1 x float> %data -} - -define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 true, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false) - %elt0 = extractelement <2 x float> %data, i32 0 - ret float %elt0 -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> [[DATA]] -; - %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -; The initial insertion point is at the extractelement -define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { -; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[VAR]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[VAR1:%.*]] = bitcast <4 x float> [[TMP1]] to <2 x double> -; CHECK-NEXT: [[VAR2:%.*]] = extractelement <2 x double> [[VAR1]], i64 0 -; CHECK-NEXT: ret double [[VAR2]] -; - %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 - %var1 = bitcast <4 x float> %var to <2 x double> - %var2 = extractelement <2 x double> %var1, i32 0 - ret double %var2 -} - -define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { -; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) -; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 -; CHECK-NEXT: ret i32 [[VAR2]] -; - %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 - %var1 = bitcast <4 x float> %var to <4 x i32> - %var2 = extractelement <4 x i32> %var1, i32 0 - ret i32 %var2 -} - -define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { -; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[VAR]] to i32 -; CHECK-NEXT: [[VAR2:%.*]] = trunc i32 [[TMP1]] to i16 -; CHECK-NEXT: ret i16 [[VAR2]] -; - %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 - %var1 = bitcast <4 x float> %var to <8 x i16> - %var2 = extractelement <8 x i16> %var1, i32 0 - ret i16 %var2 -} - -declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 - ; -------------------------------------------------------------------- ; llvm.amdgcn.raw.buffer.load ; -------------------------------------------------------------------- @@ -665,7 +248,7 @@ define float @extract0_bitcast_raw_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath [[META0]] +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath [[META0:![0-9]+]] ; CHECK-NEXT: ret float [[DATA]] ; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 @@ -4496,248 +4079,6 @@ declare <4 x float> @llvm.amdgcn.struct.ptr.tbuffer.load.v4f32(ptr addrspace(8), declare <4 x i32> @llvm.amdgcn.struct.ptr.tbuffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32, i32) #1 -; -------------------------------------------------------------------- -; llvm.amdgcn.tbuffer.load -; -------------------------------------------------------------------- - -define amdgpu_ps float @tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @tbuffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - ret float %data -} - -define amdgpu_ps <2 x float> @tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> [[DATA]] -; - %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - ret <2 x float> %data -} - -define amdgpu_ps <4 x float> @tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <4 x float> [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - ret <4 x float> %data -} - -define amdgpu_ps float @extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt0_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt0 = extractelement <2 x float> %data, i32 0 - ret float %elt0 -} - -define amdgpu_ps float @extract_elt1_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt1_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt1 = extractelement <2 x float> %data, i32 1 - ret float %elt1 -} - -define amdgpu_ps float @extract_elt0_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt0_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt0 = extractelement <4 x float> %data, i32 0 - ret float %elt0 -} - -define amdgpu_ps float @extract_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt1_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt1 = extractelement <4 x float> %data, i32 1 - ret float %elt1 -} - -define amdgpu_ps float @extract_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt1 = extractelement <4 x float> %data, i32 2 - ret float %elt1 -} - -define amdgpu_ps float @extract_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt1 = extractelement <4 x float> %data, i32 3 - ret float %elt1 -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> [[SHUF]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <2 x float> @extract_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> [[SHUF]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt0_elt1_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <3 x float> [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> - ret <3 x float> %shuf -} - -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> [[SHUF]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> - ret <3 x float> %shuf -} - -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> [[SHUF]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> - ret <3 x float> %shuf -} - -define amdgpu_ps float @extract_elt0_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt0_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt0 = extractelement <3 x float> %data, i32 0 - ret float %elt0 -} - -define amdgpu_ps float @extract_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt1_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt1 = extractelement <3 x float> %data, i32 1 - ret float %elt1 -} - -define amdgpu_ps float @extract_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt2_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt1 = extractelement <3 x float> %data, i32 2 - ret float %elt1 -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> [[DATA]] -; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> [[SHUF]] -; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define i32 @extract0_bitcast_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract0_bitcast_tbuffer_load_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 -; CHECK-NEXT: ret i32 [[VAR2]] -; - %var = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %var1 = bitcast <4 x float> %var to <4 x i32> - %var2 = extractelement <4 x i32> %var1, i32 0 - ret i32 %var2 -} - -define amdgpu_ps float @preserve_metadata_extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @preserve_metadata_extract_elt0_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath [[META0]] -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0 - %elt0 = extractelement <2 x float> %data, i32 0 - ret float %elt0 -} - -declare float @llvm.amdgcn.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <1 x float> @llvm.amdgcn.tbuffer.load.v1f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 - -declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 - ; -------------------------------------------------------------------- ; llvm.amdgcn.image.sample ; -------------------------------------------------------------------- diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll index 598175b08315f..af12367b9759c 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll @@ -1,423 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck %s -; -------------------------------------------------------------------- -; llvm.amdgcn.buffer.load -; -------------------------------------------------------------------- - -define amdgpu_ps float @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @buffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - ret float %data -} - -define amdgpu_ps <1 x float> @buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @buffer_load_v1f32( -; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret <1 x float> [[DATA]] -; - %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - ret <1 x float> %data -} - -define amdgpu_ps <2 x float> @buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> [[DATA]] -; - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - ret <2 x float> %data -} - -define amdgpu_ps <4 x float> @buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret <4 x float> [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - ret <4 x float> %data -} - -define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt0 = extractelement <2 x float> %data, i32 0 - ret float %elt0 -} - -define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt1_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt1 = extractelement <2 x float> %data, i32 1 - ret float %elt1 -} - -define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt0 = extractelement <4 x float> %data, i32 0 - ret float %elt0 -} - -define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt1_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt1 = extractelement <4 x float> %data, i32 1 - ret float %elt1 -} - -define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt2_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt1 = extractelement <4 x float> %data, i32 2 - ret float %elt1 -} - -define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt3_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt1 = extractelement <4 x float> %data, i32 3 - ret float %elt1 -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> [[SHUF]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> [[SHUF]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret <3 x float> [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> - ret <3 x float> %shuf -} - -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> [[SHUF]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> - ret <3 x float> %shuf -} - -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> [[SHUF]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> - ret <3 x float> %shuf -} - -define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[DATA]], i64 0 -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 -; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float } undef, float [[ELT0]], 0 -; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float } [[INS0]], float [[ELT1]], 1 -; CHECK-NEXT: ret { float, float } [[INS1]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt0 = extractelement <4 x float> %data, i32 0 - %elt1 = extractelement <4 x float> %data, i32 1 - %ins0 = insertvalue { float, float } undef, float %elt0, 0 - %ins1 = insertvalue { float, float } %ins0, float %elt1, 1 - ret { float, float } %ins1 -} - -define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[ELT0:%.*]] = extractelement <3 x float> [[DATA]], i64 0 -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 1 -; CHECK-NEXT: [[ELT2:%.*]] = extractelement <3 x float> [[DATA]], i64 2 -; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float, float } undef, float [[ELT0]], 0 -; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float, float } [[INS0]], float [[ELT1]], 1 -; CHECK-NEXT: [[INS2:%.*]] = insertvalue { float, float, float } [[INS1]], float [[ELT2]], 2 -; CHECK-NEXT: ret { float, float, float } [[INS2]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt0 = extractelement <4 x float> %data, i32 0 - %elt1 = extractelement <4 x float> %data, i32 1 - %elt2 = extractelement <4 x float> %data, i32 2 - %ins0 = insertvalue { float, float, float } undef, float %elt0, 0 - %ins1 = insertvalue { float, float, float } %ins0, float %elt1, 1 - %ins2 = insertvalue { float, float, float } %ins1, float %elt2, 2 - ret { float, float, float } %ins2 -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] -; CHECK-NEXT: ret <2 x float> [[RET]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt0 = extractelement <4 x float> %data, i32 0 - %elt2 = extractelement <4 x float> %data, i32 2 - %ins0 = insertelement <2 x float> undef, float %elt0, i32 0 - %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1 - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> - %ret = fadd <2 x float> %ins1, %shuf - ret <2 x float> %ret -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] -; CHECK-NEXT: ret <2 x float> [[RET]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt0 = extractelement <4 x float> %data, i32 0 - %elt2 = extractelement <4 x float> %data, i32 2 - %ins0 = insertelement <2 x float> undef, float %elt0, i32 0 - %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1 - %shuf = shufflevector <4 x float> poison, <4 x float> %data, <2 x i32> - %ret = fadd <2 x float> %ins1, %shuf - ret <2 x float> %ret -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]] -; CHECK-NEXT: ret <2 x float> [[RET]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt2 = extractelement <4 x float> %data, i32 2 - %ins0 = insertelement <2 x float> undef, float %elt2, i32 0 - %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1 - %shuf = shufflevector <4 x float> %data, <4 x float> %data, <2 x i32> - %ret = fadd <2 x float> %ins1, %shuf - ret <2 x float> %ret -} - -define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt0 = extractelement <3 x float> %data, i32 0 - ret float %elt0 -} - -define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt1_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt1 = extractelement <3 x float> %data, i32 1 - ret float %elt1 -} - -define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt2_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %elt1 = extractelement <3 x float> %data, i32 2 - ret float %elt1 -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> [[DATA]] -; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> [[SHUF]] -; - %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false), !fpmath [[META0:![0-9]+]] -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0 - %elt0 = extractelement <2 x float> %data, i32 0 - ret float %elt0 -} - -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1 - -; -------------------------------------------------------------------- -; llvm.amdgcn.buffer.load.format -; -------------------------------------------------------------------- - -define amdgpu_ps <1 x float> @buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @buffer_load_format_v1f32( -; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 true) -; CHECK-NEXT: ret <1 x float> [[DATA]] -; - %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true) - ret <1 x float> %data -} - -define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 true, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false) - %elt0 = extractelement <2 x float> %data, i32 0 - ret float %elt0 -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> [[DATA]] -; - %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { -; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -; The initial insertion point is at the extractelement -define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { -; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[VAR]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[VAR1:%.*]] = bitcast <4 x float> [[TMP1]] to <2 x double> -; CHECK-NEXT: [[VAR2:%.*]] = extractelement <2 x double> [[VAR1]], i64 0 -; CHECK-NEXT: ret double [[VAR2]] -; - %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 - %var1 = bitcast <4 x float> %var to <2 x double> - %var2 = extractelement <2 x double> %var1, i32 0 - ret double %var2 -} - -define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { -; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) -; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 -; CHECK-NEXT: ret i32 [[VAR2]] -; - %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 - %var1 = bitcast <4 x float> %var to <4 x i32> - %var2 = extractelement <4 x i32> %var1, i32 0 - ret i32 %var2 -} - -define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { -; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false) -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[VAR]] to i32 -; CHECK-NEXT: [[VAR2:%.*]] = trunc i32 [[TMP1]] to i16 -; CHECK-NEXT: ret i16 [[VAR2]] -; - %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 - %var1 = bitcast <4 x float> %var to <8 x i16> - %var2 = extractelement <8 x i16> %var1, i32 0 - ret i16 %var2 -} - -declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 - ; -------------------------------------------------------------------- ; llvm.amdgcn.raw.buffer.load ; -------------------------------------------------------------------- @@ -665,7 +248,7 @@ define float @extract0_bitcast_raw_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 { ; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath [[META0]] +; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath [[META0:![0-9]+]] ; CHECK-NEXT: ret float [[DATA]] ; %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0 @@ -4495,248 +4078,6 @@ declare <4 x float> @llvm.amdgcn.struct.ptr.tbuffer.load.v4f32(ptr addrspace(8), declare <4 x i32> @llvm.amdgcn.struct.ptr.tbuffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32, i32) #1 -; -------------------------------------------------------------------- -; llvm.amdgcn.tbuffer.load -; -------------------------------------------------------------------- - -define amdgpu_ps float @tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @tbuffer_load_f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - ret float %data -} - -define amdgpu_ps <2 x float> @tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> [[DATA]] -; - %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - ret <2 x float> %data -} - -define amdgpu_ps <4 x float> @tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <4 x float> [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - ret <4 x float> %data -} - -define amdgpu_ps float @extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt0_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt0 = extractelement <2 x float> %data, i32 0 - ret float %elt0 -} - -define amdgpu_ps float @extract_elt1_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt1_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt1 = extractelement <2 x float> %data, i32 1 - ret float %elt1 -} - -define amdgpu_ps float @extract_elt0_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt0_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt0 = extractelement <4 x float> %data, i32 0 - ret float %elt0 -} - -define amdgpu_ps float @extract_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt1_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt1 = extractelement <4 x float> %data, i32 1 - ret float %elt1 -} - -define amdgpu_ps float @extract_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt1 = extractelement <4 x float> %data, i32 2 - ret float %elt1 -} - -define amdgpu_ps float @extract_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt1 = extractelement <4 x float> %data, i32 3 - ret float %elt1 -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> [[SHUF]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <2 x float> @extract_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> [[SHUF]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt0_elt1_elt2_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <3 x float> [[DATA]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> - ret <3 x float> %shuf -} - -define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> [[SHUF]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> - ret <3 x float> %shuf -} - -define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32( -; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> -; CHECK-NEXT: ret <3 x float> [[SHUF]] -; - %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> - ret <3 x float> %shuf -} - -define amdgpu_ps float @extract_elt0_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt0_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt0 = extractelement <3 x float> %data, i32 0 - ret float %elt0 -} - -define amdgpu_ps float @extract_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt1_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt1 = extractelement <3 x float> %data, i32 1 - ret float %elt1 -} - -define amdgpu_ps float @extract_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt2_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2 -; CHECK-NEXT: ret float [[ELT1]] -; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %elt1 = extractelement <3 x float> %data, i32 2 - ret float %elt1 -} - -define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: ret <2 x float> [[DATA]] -; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32( -; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: ret <2 x float> [[SHUF]] -; - %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> - ret <2 x float> %shuf -} - -define i32 @extract0_bitcast_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @extract0_bitcast_tbuffer_load_v4f32( -; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) -; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32 -; CHECK-NEXT: ret i32 [[VAR2]] -; - %var = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false) - %var1 = bitcast <4 x float> %var to <4 x i32> - %var2 = extractelement <4 x i32> %var1, i32 0 - ret i32 %var2 -} - -define amdgpu_ps float @preserve_metadata_extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: @preserve_metadata_extract_elt0_tbuffer_load_v2f32( -; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath [[META0]] -; CHECK-NEXT: ret float [[DATA]] -; - %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0 - %elt0 = extractelement <2 x float> %data, i32 0 - ret float %elt0 -} - -declare float @llvm.amdgcn.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <1 x float> @llvm.amdgcn.tbuffer.load.v1f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 - -declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1 - ; -------------------------------------------------------------------- ; llvm.amdgcn.image.sample ; -------------------------------------------------------------------- diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll index 9cef4a3c7cc0f..65c89618e20ba 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll @@ -72,33 +72,6 @@ define amdgpu_ps void @image_store_mip_1d_store_insert_zeros_at_end(<8 x i32> in ret void } -define amdgpu_ps void @buffer_store_format_insert_zeros_at_end(<4 x i32> inreg %a, float %vdata1, i32 %b) { -; GCN-LABEL: @buffer_store_format_insert_zeros_at_end( -; GCN-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[VDATA1:%.*]], i64 0 -; GCN-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer -; GCN-NEXT: call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false) -; GCN-NEXT: ret void -; -; GFX12-LABEL: @buffer_store_format_insert_zeros_at_end( -; GFX12-NEXT: [[TMP1:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0 -; GFX12-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 1 -; GFX12-NEXT: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false) -; GFX12-NEXT: ret void -; -; GFXUNKNOWN-LABEL: @buffer_store_format_insert_zeros_at_end( -; GFXUNKNOWN-NEXT: [[TMP1:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0 -; GFXUNKNOWN-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 1 -; GFXUNKNOWN-NEXT: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false) -; GFXUNKNOWN-NEXT: ret void -; - %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0 - %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1 - %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2 - %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3 - call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i1 0, i1 0) - ret void -} - define amdgpu_ps void @struct_buffer_store_format_insert_zeros(<4 x i32> inreg %a, float %vdata1, i32 %b) { ; GCN-LABEL: @struct_buffer_store_format_insert_zeros( ; GCN-NEXT: [[TMP1:%.*]] = insertelement <3 x float> , float [[VDATA1:%.*]], i64 0 @@ -304,11 +277,9 @@ define amdgpu_ps void @struct_tbuffer_store_argument_insert_first(<4 x i32> inre } declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #2 -declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #2 declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0 -declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0 declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #0 declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0 declare void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll index 7f7790cecb0eb..9ed62d3545f07 100644 --- a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll +++ b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll @@ -1,19 +1,5 @@ ; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) -define void @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i1 %bool) { - ; CHECK: immarg operand has non-immediate parameter - ; CHECK-NEXT: i1 %bool - ; CHECK-NEXT: %data0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 %bool, i1 false) - %data0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 %bool, i1 false) - - ; CHECK: immarg operand has non-immediate parameter - ; CHECK-NEXT: i1 %bool - ; CHECK-NEXT: %data1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 %bool) - %data1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 %bool) - ret void -} - declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) define void @raw_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs, i32 %arg) { ; CHECK: immarg operand has non-immediate parameter @@ -665,12 +651,3 @@ define void @test_mfma_f32_32x32x1f32(float %arg0, float %arg1, <32 x i32> %arg2 ret void } - -declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) -define amdgpu_cs void @test_buffer_atomic_fadd(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %offset, i1 %slc) { - ; CHECK: immarg operand has non-immediate parameter - ; CHECK-NEXT: i1 %slc - ; CHECK-NEXT: call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc) - call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc) - ret void -}