[AMDGPU] Allow lane-op lowering for some illegal types #114887
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Stanislav Mekhanoshin (rampitec)

Changes: Currently overloaded lane-op intrinsics only work for legal types. (A minimal reproducer sketch follows the truncated diff below.)

Patch is 27.61 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/114887.diff

9 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 68ea6f622feca5..09ceb2502e913f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6439,6 +6439,17 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(LoadVal);
return;
}
+ case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_writelane:
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16:
+ case Intrinsic::amdgcn_permlane64:
+ case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_set_inactive_chain_arg:
+ case Intrinsic::amdgcn_mov_dpp8:
+ Results.push_back(lowerLaneOp(*this, N, DAG));
+ return;
}
break;
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
index 049cc455ab01cb..1d6d2b315bccc8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
@@ -184,6 +184,25 @@ define amdgpu_ps void @dpp8_double(double %in, ptr addrspace(1) %out) {
ret void
}
+; GFX10PLUS-LABEL: {{^}}dpp8_i8:
+; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: global_store_{{byte|b8}} v[1:2], v0, off
+define amdgpu_ps void @dpp8_i8(i8 %in, ptr addrspace(1) %out) {
+ %tmp0 = call i8 @llvm.amdgcn.mov.dpp8.i8(i8 %in, i32 1)
+ store i8 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_i1:
+; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_and_b32_e32 v0, 1, v0
+; GFX10PLUS: global_store_{{byte|b8}} v[1:2], v0, off
+define amdgpu_ps void @dpp8_i1(i1 %in, ptr addrspace(1) %out) {
+ %tmp0 = call i1 @llvm.amdgcn.mov.dpp8.i1(i1 %in, i32 1)
+ store i1 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0
attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index aa6069c67f62ee..b1cf33a530b538 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -8770,6 +8770,85 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s
ret void
}
+define void @v_permlane16_i8(ptr addrspace(1) %out, i8 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlane16_i8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlane16_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-NEXT: global_store_b8 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %v = call i8 @llvm.amdgcn.permlane16.i8(i8 %src0, i8 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i8 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlane16_i1(ptr addrspace(1) %out, i1 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_i1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlane16_b32 v2, v2, s4, s5
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlane16_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlane16_i1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1
+; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX12-NEXT: global_store_b8 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %v = call i1 @llvm.amdgcn.permlane16.i1(i1 %src0, i1 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i1 %v, ptr addrspace(1) %out
+ ret void
+}
+
define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlanex16_v2f32:
; GFX10-SDAG: ; %bb.0:
@@ -9258,3 +9337,82 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr
store <8 x i16> %v, ptr addrspace(1) %out
ret void
}
+
+define void @v_permlanex16_i8(ptr addrspace(1) %out, i8 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlanex16_i8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlanex16_i8:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-NEXT: global_store_b8 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %v = call i8 @llvm.amdgcn.permlanex16.i8(i8 %src0, i8 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i8 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_permlanex16_i1(ptr addrspace(1) %out, i1 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_i1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s4, v3
+; GFX10-NEXT: v_readfirstlane_b32 s5, v4
+; GFX10-NEXT: v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlanex16_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v3
+; GFX11-NEXT: v_readfirstlane_b32 s1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlanex16_i1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v3
+; GFX12-NEXT: v_readfirstlane_b32 s1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX12-NEXT: global_store_b8 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %v = call i1 @llvm.amdgcn.permlanex16.i1(i1 %src0, i1 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+ store i1 %v, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
index 216731519731a0..14e34d7fca8bc8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -52,6 +52,33 @@ define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 {
store i32 %v, ptr addrspace(1) %out
ret void
}
+
+define void @test_i16(ptr addrspace(1) %out, i16 %src0) #1 {
+; GFX11-LABEL: test_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_permlane64_b32 v2, v2
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %v = call i16 @llvm.amdgcn.permlane64.i16(i16 %src0)
+ store i16 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define void @test_i1(ptr addrspace(1) %out, i1 %src0) #1 {
+; GFX11-LABEL: test_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_permlane64_b32 v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT: global_store_b8 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %v = call i1 @llvm.amdgcn.permlane64.i1(i1 %src0)
+ store i1 %v, ptr addrspace(1) %out
+ ret void
+}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11-GISEL: {{.*}}
; GFX11-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 39a3b1c8adc9f1..bf66d9dbaf0565 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -700,3 +700,51 @@ define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) {
call void asm sideeffect "; use $0", "s"(<8 x i16> %x)
ret void
}
+
+define void @dpp8_i8(i8 %in, ptr addrspace(1) %out) {
+; CHECK-SDAG-LABEL: dpp8_i8:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-SDAG-NEXT: flat_store_byte v[1:2], v0
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: dpp8_i8:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-GISEL-NEXT: flat_store_byte v[1:2], v0
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %tmp0 = call i8 @llvm.amdgcn.readfirstlane.i8(i8 %in)
+ store i8 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+define void @dpp8_i1(i1 %in, ptr addrspace(1) %out) {
+; CHECK-SDAG-LABEL: dpp8_i1:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v0
+; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-SDAG-NEXT: flat_store_byte v[1:2], v0
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: dpp8_i1:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; CHECK-GISEL-NEXT: s_and_b32 s4, s4, 1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-GISEL-NEXT: flat_store_byte v[1:2], v0
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %tmp0 = call i1 @llvm.amdgcn.readfirstlane.i1(i1 %in)
+ store i1 %tmp0, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 24a332fa211c15..b33929720ae1ba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -894,6 +894,54 @@ define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src
ret void
}
+define void @dpp8_i8(i8 %in, ptr addrspace(1) %out) {
+; CHECK-SDAG-LABEL: dpp8_i8:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v0, 1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-SDAG-NEXT: flat_store_byte v[1:2], v0
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: dpp8_i8:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v0, 1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-GISEL-NEXT: flat_store_byte v[1:2], v0
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %tmp0 = call i8 @llvm.amdgcn.readlane.i8(i8 %in, i32 1)
+ store i8 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+define void @dpp8_i1(i1 %in, ptr addrspace(1) %out) {
+; CHECK-SDAG-LABEL: dpp8_i1:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readlane_b32 s4, v0, 1
+; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-SDAG-NEXT: flat_store_byte v[1:2], v0
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: dpp8_i1:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readlane_b32 s4, v0, 1
+; CHECK-GISEL-NEXT: s_and_b32 s4, s4, 1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-GISEL-NEXT: flat_store_byte v[1:2], v0
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %tmp0 = call i1 @llvm.amdgcn.readlane.i1(i1 %in, i32 1)
+ store i1 %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index fbf8c203dcb390..a9a03d3eefc2ac 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -861,6 +861,68 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
ret void
}
+define amdgpu_cs_chain void @set_inactive_chain_arg_i16(ptr addrspace(1) %out, i16 %inactive, i16 %active) {
+; GFX11-LABEL: set_inactive_chain_arg_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_or_saveexec_b32 s0, -1
+; GFX11-NEXT: v_mov_b32_e32 v0, v10
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_or_saveexec_b32 s0, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0
+; GFX11-NEXT: s_mov_b32 exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: global_store_b16 v[8:9], v1, off
+; GFX11-NEXT: s_endpgm
+;
+; GFX10-LABEL: set_inactive_chain_arg_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_or_saveexec_b32 s0, -1
+; GFX10-NEXT: v_mov_b32_e32 v0, v10
+; GFX10-NEXT: s_mov_b32 exec_lo, s0
+; GFX10-NEXT: s_or_saveexec_b32 s0, -1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s0
+; GFX10-NEXT: s_mov_b32 exec_lo, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: global_store_short v[8:9], v1, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11_W64-LABEL: set_inactive_chain_arg_i16:
+; GFX11_W64: ; %bb.0:
+; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX11_W64-NEXT: v_mov_b32_e32 v0, v10
+; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1]
+; GFX11_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11_W64-NEXT: v_mov_b32_e32 v1, v0
+; GFX11_W64-NEXT: global_store_b16 v[8:9], v1, off
+; GFX11_W64-NEXT: s_endpgm
+;
+; GFX10_W64-LABEL: set_inactive_chain_arg_i16:
+; GFX10_W64: ; %bb.0:
+; GFX10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX10_W64-NEXT: v_mov_b32_e32 v0, v10
+; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1]
+; GFX10_W64-NEXT: s_mov_b64 exec, s[0:1]
+; GFX10_W64-NEXT: v_mov_b32_e32 v1, v0
+; GFX10_W64-NEXT: global_store_short v[8:9], v1, off
+; GFX10_W64-NEXT: s_endpgm
+ %tmp = call i16 @llvm.amdgcn.set.inactive.chain.arg.i16(i16 %active, i16 %inactive) #0
+ %wwm = call i16 @llvm.amdgcn.strict.wwm.i16(i16 %tmp)
+ store i16 %wwm, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64, i64) #0
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 6fb5a9ce47a843..b51011ec29699f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -504,6 +504,29 @@ define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(
ret void
}
+define void @set_inactive_i16(ptr addrspace(1) %out, i16 %in) {
+; GCN-LABEL: set_inactive_i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v3, 3, v2, s[4:5]
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v3
+; GCN-NEXT: flat_store_short v[0:1], v2
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %tmp.0 = call i16 @llvm.amdgcn.set.inactive.i16(i16 %in, i16 3) #0
+ %tmp = call i16 @llvm.amdgcn.strict.wwm.i16(i16 %tmp.0)
+ store i16 %tmp, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 837d484583d53f..0cd8e2c60589bf 100644
-...
[truncated]
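For reference, a minimal reproducer of the failure mode this patch fixes, distilled from the new tests above (a sketch; the RUN line mirrors the style of the updated test files and its exact flags are an assumption):

; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s
; Before this patch, SelectionDAG type legalization failed here with
; "Do not know how to promote this operator", because i8 is not a
; legal type and the lane-op result had no promotion path.
define amdgpu_ps void @dpp8_i8(i8 %in, ptr addrspace(1) %out) {
  %tmp0 = call i8 @llvm.amdgcn.mov.dpp8.i8(i8 %in, i32 1)
  store i8 %tmp0, ptr addrspace(1) %out
  ret void
}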
Ping
Ping
Ping
I think this won't work for arbitrary illegal types like i999, right? It only works for i8 because we have an explicit […]. Otherwise seems fine to me.
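For illustration, a sketch of the kind of still-unsupported case this comment refers to (hypothetical IR, not from the PR):

; An arbitrary width such as i999 has no promotion path to a 32-bit
; lane operation, so it is still expected to fail in the backend.
define i999 @readfirstlane_i999(i999 %src) {
  %v = call i999 @llvm.amdgcn.readfirstlane.i999(i999 %src)
  ret i999 %v
}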
Yes, you are right. I have updated the description and title.
What's the point of handling i1/i8?
Clang can produce an intrinsic with a char argument from a builtin. It will be able to after #115090.
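A sketch of the IR shape such a builtin call could produce once #115090 lands (the mapping of the char argument to an i8 intrinsic overload is an assumption):

; Assumed shape: clang lowers a char builtin argument to an i8
; overload of the lane-op intrinsic, which this patch makes lowerable.
define i8 @readfirstlane_char(i8 %v) {
  %r = call i8 @llvm.amdgcn.readfirstlane.i8(i8 %v)
  ret i8 %r
}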
It makes sense to overload and accept the types that will naturally be passed in registers, but I don't think we want to permit cases where there's going to be an implicit cast. Maybe it's OK for i8, but i1 is special, and we probably shouldn't treat it like an arbitrary VGPR value.
It's really just tests added for i1. There is nothing in the code specifically checking for data types.
Actually, I do not see a problem with the generated code. The result for i1 is masked as expected. I am not sure what is really required for this patch. Having a correct result here is, IMO, better than an assert in the backend.
Ping. Would you mind explaining what exactly you want me to change here? There is no special i1 checking, just tests added.
Currently overloaded lane-op intrinsics only work for legal types. Notably, it fails with 'Do not know how to promote this operator' in SDag for the i8 type. The patch fixes that.
Ping
Currently overloaded lane-op intrinsics only work for legal types. Notably, it fails with 'Do not know how to promote this operator' in SDag for the i8 type. The patch fixes that for types handled with INTRINSIC_WO_CHAIN.