diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 16021eead0c9f..6f2b385e77c4c 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6552,6 +6552,18 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(LoadVal); return; } + case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_writelane: + case Intrinsic::amdgcn_permlane16: + case Intrinsic::amdgcn_permlanex16: + case Intrinsic::amdgcn_permlane64: + case Intrinsic::amdgcn_set_inactive: + case Intrinsic::amdgcn_set_inactive_chain_arg: + case Intrinsic::amdgcn_mov_dpp8: + case Intrinsic::amdgcn_update_dpp: + Results.push_back(lowerLaneOp(*this, N, DAG)); + return; } break; } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll index 481e721e3c21d..f6ea0b4c36448 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll @@ -186,6 +186,25 @@ define amdgpu_ps void @dpp8_double(double %in, ptr addrspace(1) %out) { ret void } +; GFX10PLUS-LABEL: {{^}}dpp8_i8: +; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS: global_store_{{byte|b8}} v[1:2], v0, off +define amdgpu_ps void @dpp8_i8(i8 %in, ptr addrspace(1) %out) { + %tmp0 = call i8 @llvm.amdgcn.mov.dpp8.i8(i8 %in, i32 1) + store i8 %tmp0, ptr addrspace(1) %out + ret void +} + +; GFX10PLUS-LABEL: {{^}}dpp8_i1: +; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0] +; GFX10PLUS: v_and_b32_e32 v0, 1, v0 +; GFX10PLUS: global_store_{{byte|b8}} v[1:2], v0, off +define amdgpu_ps void @dpp8_i1(i1 %in, ptr addrspace(1) %out) { + %tmp0 = call i1 @llvm.amdgcn.mov.dpp8.i1(i1 %in, i32 1) + store i1 %tmp0, ptr addrspace(1) %out + ret void +} + declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0 attributes #0 = { nounwind readnone convergent } diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 076cf09678b57..794eb85ad1207 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -8932,6 +8932,87 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s ret void } +define void @v_permlane16_i8(ptr addrspace(1) %out, i8 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_byte v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlane16_i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b8 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlane16_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b8 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call i8 @llvm.amdgcn.permlane16.i8(i8 %src0, i8 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store i8 %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlane16_i1(ptr addrspace(1) %out, i1 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlane16_i1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: global_store_byte v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlane16_i1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: global_store_b8 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlane16_i1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX12-NEXT: global_store_b8 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call i1 @llvm.amdgcn.permlane16.i1(i1 %src0, i1 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store i1 %v, ptr addrspace(1) %out + ret void +} + define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %src1, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlanex16_v2f32: ; GFX10-SDAG: ; %bb.0: @@ -9430,3 +9511,84 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr store <8 x i16> %v, ptr addrspace(1) %out ret void } + +define void @v_permlanex16_i8(ptr addrspace(1) %out, i8 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-NEXT: global_store_byte v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlanex16_i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-NEXT: global_store_b8 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlanex16_i8: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-NEXT: global_store_b8 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call i8 @llvm.amdgcn.permlanex16.i8(i8 %src0, i8 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store i8 %v, ptr addrspace(1) %out + ret void +} + +define void @v_permlanex16_i1(ptr addrspace(1) %out, i1 %src0, i32 %src1, i32 %src2) { +; GFX10-LABEL: v_permlanex16_i1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_readfirstlane_b32 s4, v3 +; GFX10-NEXT: v_readfirstlane_b32 s5, v4 +; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: global_store_byte v[0:1], v2, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_permlanex16_i1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v3 +; GFX11-NEXT: v_readfirstlane_b32 s1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: global_store_b8 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_permlanex16_i1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v3 +; GFX12-NEXT: v_readfirstlane_b32 s1, v4 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX12-NEXT: global_store_b8 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] + %v = call i1 @llvm.amdgcn.permlanex16.i1(i1 %src0, i1 %src0, i32 %src1, i32 %src2, i1 false, i1 false) + store i1 %v, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll index f23f9595446eb..49023a126ddf7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll @@ -52,6 +52,33 @@ define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 { store i32 %v, ptr addrspace(1) %out ret void } + +define void @test_i8(ptr addrspace(1) %out, i8 %src0) #1 { +; GFX11-LABEL: test_i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_permlane64_b32 v2, v2 +; GFX11-NEXT: global_store_b8 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call i8 @llvm.amdgcn.permlane64.i8(i8 %src0) + store i8 %v, ptr addrspace(1) %out + ret void +} + +define void @test_i1(ptr addrspace(1) %out, i1 %src0) #1 { +; GFX11-LABEL: test_i1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_permlane64_b32 v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: global_store_b8 v[0:1], v2, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %v = call i1 @llvm.amdgcn.permlane64.i1(i1 %src0) + store i1 %v, ptr addrspace(1) %out + ret void +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX11-GISEL: {{.*}} ; GFX11-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index a3bd0aabd5c3f..5feaa57687fdf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -1729,3 +1729,50 @@ define void @test_readfirstlane_v32f16(ptr addrspace(1) %out, <32 x half> %src) ret void } +define void @test_readfirstlane_i8(i8 %in, ptr addrspace(1) %out) { +; CHECK-SDAG-LABEL: test_readfirstlane_i8: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-SDAG-NEXT: flat_store_byte v[1:2], v0 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_i8: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-GISEL-NEXT: flat_store_byte v[1:2], v0 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %tmp0 = call i8 @llvm.amdgcn.readfirstlane.i8(i8 %in) + store i8 %tmp0, ptr addrspace(1) %out + ret void +} + +define void @test_readfirstlane_i1(i1 %in, ptr addrspace(1) %out) { +; CHECK-SDAG-LABEL: test_readfirstlane_i1: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-SDAG-NEXT: flat_store_byte v[1:2], v0 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; 
CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_i1: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; CHECK-GISEL-NEXT: s_and_b32 s4, s4, 1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-GISEL-NEXT: flat_store_byte v[1:2], v0 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %tmp0 = call i1 @llvm.amdgcn.readfirstlane.i1(i1 %in) + store i1 %tmp0, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index edb6ebcee1325..ececfd831c400 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -894,6 +894,54 @@ define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src ret void } +define void @test_readlane_i8(i8 %in, ptr addrspace(1) %out) { +; CHECK-SDAG-LABEL: test_readlane_i8: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v0, 1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-SDAG-NEXT: flat_store_byte v[1:2], v0 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_i8: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v0, 1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-GISEL-NEXT: flat_store_byte v[1:2], v0 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %tmp0 = call i8 @llvm.amdgcn.readlane.i8(i8 %in, i32 1) + store i8 %tmp0, ptr addrspace(1) %out + ret void +} + +define void @test_readlane_i1(i1 %in, ptr addrspace(1) %out) { +; CHECK-SDAG-LABEL: test_readlane_i1: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readlane_b32 s4, v0, 1 +; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 1 +; 
CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-SDAG-NEXT: flat_store_byte v[1:2], v0 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readlane_i1: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readlane_b32 s4, v0, 1 +; CHECK-GISEL-NEXT: s_and_b32 s4, s4, 1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-GISEL-NEXT: flat_store_byte v[1:2], v0 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %tmp0 = call i1 @llvm.amdgcn.readlane.i1(i1 %in, i32 1) + store i1 %tmp0, ptr addrspace(1) %out + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind readnone convergent } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll index fbf8c203dcb39..a9a03d3eefc2a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll @@ -861,6 +861,68 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) % ret void } +define amdgpu_cs_chain void @set_inactive_chain_arg_i16(ptr addrspace(1) %out, i16 %inactive, i16 %active) { +; GFX11-LABEL: set_inactive_chain_arg_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: v_mov_b32_e32 v0, v10 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: global_store_b16 v[8:9], v1, off +; GFX11-NEXT: s_endpgm +; +; GFX10-LABEL: set_inactive_chain_arg_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX10-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-NEXT: v_mov_b32_e32 v0, v10 +; GFX10-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-NEXT: s_or_saveexec_b32 s0, -1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0 +; GFX10-NEXT: s_mov_b32 exec_lo, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: global_store_short v[8:9], v1, off +; GFX10-NEXT: s_endpgm +; +; GFX11_W64-LABEL: set_inactive_chain_arg_i16: +; GFX11_W64: ; %bb.0: +; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10 +; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] +; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11_W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11_W64-NEXT: global_store_b16 v[8:9], v1, off +; GFX11_W64-NEXT: s_endpgm +; +; GFX10_W64-LABEL: set_inactive_chain_arg_i16: +; GFX10_W64: ; %bb.0: +; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10 +; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] +; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1] +; GFX10_W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX10_W64-NEXT: global_store_short v[8:9], v1, off +; GFX10_W64-NEXT: s_endpgm + %tmp = call i16 @llvm.amdgcn.set.inactive.chain.arg.i16(i16 %active, i16 %inactive) #0 + %wwm = call i16 @llvm.amdgcn.strict.wwm.i16(i16 %tmp) + store i16 %wwm, ptr addrspace(1) %out + ret void +} + declare i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32, i32) #0 declare i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64, i64) #0 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll index 6cb2d6d55ea32..27da57289238e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -504,6 +504,29 @@ define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace( ret void } +define void @set_inactive_i16(ptr addrspace(1) %out, i16 %in) { +; GCN-LABEL: set_inactive_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v3, 3, v2, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v3 +; GCN-NEXT: flat_store_short v[0:1], v2 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + %tmp.0 = call i16 @llvm.amdgcn.set.inactive.i16(i16 %in, i16 3) #0 + %tmp = call i16 @llvm.amdgcn.strict.wwm.i16(i16 %tmp.0) + store i16 %tmp, ptr addrspace(1) %out + ret void +} + declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0 declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0 declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll index a7424831ae5db..a7a2141b4d569 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -576,6 +576,24 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb8(ptr addrspace(1) %out, <2 x ret void } +; GCN-LABEL: {{^}}dpp_i8: +; GCN: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} +; GCN: store_{{byte|b8}} v[0:1], v2 
+define void @dpp_i8(ptr addrspace(1) %out, i8 %in) { + %tmp0 = call i8 @llvm.amdgcn.update.dpp.i8(i8 %in, i8 %in, i32 1, i32 1, i32 1, i1 false) #0 + store i8 %tmp0, ptr addrspace(1) %out + ret void +} + +; GCN-LABEL: {{^}}dpp_i1: +; GCN: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} +; GCN: store_{{byte|b8}} v[0:1], v2 +define void @dpp_i1(ptr addrspace(1) %out, i1 %in) { + %tmp0 = call i1 @llvm.amdgcn.update.dpp.i1(i1 %in, i1 %in, i32 1, i32 1, i32 1, i1 false) #0 + store i1 %tmp0, ptr addrspace(1) %out + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() declare void @llvm.amdgcn.s.barrier() declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 6646818b7b36f..3020da9f6d3fc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -2700,6 +2700,168 @@ define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %sr ret void } +define void @test_writelane_i8(ptr addrspace(1) %out, i8 %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_i8: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_ubyte v4, v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s5, m0 +; GFX802-SDAG-NEXT: flat_store_byte v[0:1], v4 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_i8: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_ubyte v4, v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; 
GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5 +; GFX1010-SDAG-NEXT: global_store_byte v[0:1], v4, off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_i8: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_u8 v4, v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-SDAG-NEXT: global_store_b8 v[0:1], v4, off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_i8: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: flat_load_ubyte v4, v[0:1] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-GISEL-NEXT: flat_store_byte v[0:1], v4 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_i8: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: global_load_ubyte v4, v[0:1], off +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5 +; GFX1010-GISEL-NEXT: global_store_byte v[0:1], v4, off +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_i8: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_load_u8 v4, v[0:1], off +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; 
GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-GISEL-NEXT: global_store_b8 v[0:1], v4, off +; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] + %oldval = load i8, ptr addrspace(1) %out + %writelane = call i8 @llvm.amdgcn.writelane.i8(i8 %src, i32 %src1, i8 %oldval) + store i8 %writelane, ptr addrspace(1) %out + ret void +} + +define void @test_writelane_i1(ptr addrspace(1) %out, i1 %src, i32 %src1) { +; GFX802-SDAG-LABEL: test_writelane_i1: +; GFX802-SDAG: ; %bb.0: +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-SDAG-NEXT: flat_load_ubyte v4, v[0:1] +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; GFX802-SDAG-NEXT: v_readfirstlane_b32 s5, v2 +; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: v_writelane_b32 v4, s5, m0 +; GFX802-SDAG-NEXT: v_and_b32_e32 v2, 1, v4 +; GFX802-SDAG-NEXT: flat_store_byte v[0:1], v2 +; GFX802-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX802-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-SDAG-LABEL: test_writelane_i1: +; GFX1010-SDAG: ; %bb.0: +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-SDAG-NEXT: global_load_ubyte v4, v[0:1], off +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1010-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1010-SDAG-NEXT: v_writelane_b32 v4, s4, s5 +; GFX1010-SDAG-NEXT: v_and_b32_e32 v2, 1, v4 +; GFX1010-SDAG-NEXT: global_store_byte v[0:1], v2, off +; GFX1010-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-LABEL: test_writelane_i1: +; GFX1100-SDAG: ; %bb.0: +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-NEXT: global_load_u8 v4, v[0:1], off +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-SDAG-NEXT: v_and_b32_e32 v2, 1, v4 +; GFX1100-SDAG-NEXT: global_store_b8 v[0:1], v2, off +; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX802-GISEL-LABEL: test_writelane_i1: +; GFX802-GISEL: ; %bb.0: +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX802-GISEL-NEXT: flat_load_ubyte v4, v[0:1] +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0 +; GFX802-GISEL-NEXT: v_and_b32_e32 v2, 1, v4 +; GFX802-GISEL-NEXT: flat_store_byte v[0:1], v2 +; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-GISEL-LABEL: test_writelane_i1: +; GFX1010-GISEL: ; %bb.0: +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-GISEL-NEXT: global_load_ubyte v4, v[0:1], off +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1010-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1010-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-GISEL-NEXT: v_writelane_b32 v4, s4, s5 +; GFX1010-GISEL-NEXT: v_and_b32_e32 v2, 1, v4 +; GFX1010-GISEL-NEXT: global_store_byte v[0:1], v2, off +; GFX1010-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-LABEL: test_writelane_i1: +; GFX1100-GISEL: ; %bb.0: +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-NEXT: global_load_u8 v4, v[0:1], off +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 +; GFX1100-GISEL-NEXT: v_and_b32_e32 v2, 1, v4 +; GFX1100-GISEL-NEXT: global_store_b8 v[0:1], v2, off +; GFX1100-GISEL-NEXT: s_setpc_b64 
s[30:31] + %oldval = load i1, ptr addrspace(1) %out + %writelane = call i1 @llvm.amdgcn.writelane.i1(i1 %src, i32 %src1, i1 %oldval) + store i1 %writelane, ptr addrspace(1) %out + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind readnone convergent }