
[AMDGPU] Allow lane-op lowering for some illegal types #114887


Open

rampitec wants to merge 2 commits into main from users/rampitec/11-04-_amdgpu_allow_lane-op_lowering_for_illegal_types
Conversation

rampitec
Collaborator

@rampitec rampitec commented Nov 4, 2024

Currently, overloaded lane-op intrinsics only work for legal types. With SelectionDAG they fail with 'Do not know how to promote this operator', notably on the i8 type. The patch fixes that for types handled via INTRINSIC_WO_CHAIN.
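
For reference, a minimal IR reproducer in the spirit of the tests added by this patch (the function name is illustrative; any lane-op intrinsic overloaded on an illegal type fails the same way):

```llvm
; Before this patch, SelectionDAG aborts with "Do not know how to promote
; this operator" while legalizing the illegal i8 result of the lane op.
define void @readfirstlane_i8(i8 %in, ptr addrspace(1) %out) {
  %v = call i8 @llvm.amdgcn.readfirstlane.i8(i8 %in)
  store i8 %v, ptr addrspace(1) %out
  ret void
}
```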

@rampitec rampitec requested a review from arsenm November 4, 2024 23:02
@rampitec rampitec marked this pull request as ready for review November 4, 2024 23:03
@llvmbot
Member

llvmbot commented Nov 4, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Stanislav Mekhanoshin (rampitec)

Changes

Currently, overloaded lane-op intrinsics only work for legal types. With SelectionDAG they fail with 'Do not know how to promote this operator', notably on the i8 type. The patch fixes that.


Patch is 27.61 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114887.diff

9 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+11)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll (+19)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll (+158)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll (+27)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll (+48)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll (+48)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll (+62)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll (+23)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll (+162)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 68ea6f622feca5..09ceb2502e913f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6439,6 +6439,17 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
       Results.push_back(LoadVal);
       return;
     }
+    case Intrinsic::amdgcn_readlane:
+    case Intrinsic::amdgcn_readfirstlane:
+    case Intrinsic::amdgcn_writelane:
+    case Intrinsic::amdgcn_permlane16:
+    case Intrinsic::amdgcn_permlanex16:
+    case Intrinsic::amdgcn_permlane64:
+    case Intrinsic::amdgcn_set_inactive:
+    case Intrinsic::amdgcn_set_inactive_chain_arg:
+    case Intrinsic::amdgcn_mov_dpp8:
+      Results.push_back(lowerLaneOp(*this, N, DAG));
+      return;
     }
     break;
   }
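
The new cases route illegal-typed results through lowerLaneOp, which widens the value to 32 bits, performs the legal 32-bit lane op, and truncates the result back. In IR terms the effect on an i8 readfirstlane is roughly the following (a sketch; the actual code builds the equivalent DAG nodes, and the extension's high bits are don't-care):

```llvm
; Widening pattern for a sub-32-bit lane op: extend the operand to i32,
; run the legal 32-bit intrinsic, then truncate the result.
define i8 @widened_readfirstlane_i8(i8 %in) {
  %ext = zext i8 %in to i32          ; an any-extend in the DAG
  %wide = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %ext)
  %res = trunc i32 %wide to i8
  ret i8 %res
}
```

Note that the and-with-1 visible in the i1 tests below is not part of the lane op itself; it comes from legalizing the i1 store, which must write a well-defined 0 or 1.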
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
index 049cc455ab01cb..1d6d2b315bccc8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll
@@ -184,6 +184,25 @@ define amdgpu_ps void @dpp8_double(double %in, ptr addrspace(1) %out) {
   ret void
 }
 
+; GFX10PLUS-LABEL: {{^}}dpp8_i8:
+; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: global_store_{{byte|b8}} v[1:2], v0, off
+define amdgpu_ps void @dpp8_i8(i8 %in, ptr addrspace(1) %out) {
+  %tmp0 = call i8 @llvm.amdgcn.mov.dpp8.i8(i8 %in, i32 1)
+  store i8 %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX10PLUS-LABEL: {{^}}dpp8_i1:
+; GFX10PLUS: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
+; GFX10PLUS: v_and_b32_e32 v0, 1, v0
+; GFX10PLUS: global_store_{{byte|b8}} v[1:2], v0, off
+define amdgpu_ps void @dpp8_i1(i1 %in, ptr addrspace(1) %out) {
+  %tmp0 = call i1 @llvm.amdgcn.mov.dpp8.i1(i1 %in, i32 1)
+  store i1 %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #0
 
 attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index aa6069c67f62ee..b1cf33a530b538 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -8770,6 +8770,85 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s
   ret void
 }
 
+define void @v_permlane16_i8(ptr addrspace(1) %out, i8 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX10-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX10-NEXT:    v_permlane16_b32 v2, v2, s4, s5
+; GFX10-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlane16_i8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_permlane16_b32 v2, v2, s0, s1
+; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlane16_i8:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX12-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_permlane16_b32 v2, v2, s0, s1
+; GFX12-NEXT:    global_store_b8 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %v = call i8 @llvm.amdgcn.permlane16.i8(i8 %src0, i8 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+  store i8 %v, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_permlane16_i1(ptr addrspace(1) %out, i1 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlane16_i1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX10-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX10-NEXT:    v_permlane16_b32 v2, v2, s4, s5
+; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlane16_i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_permlane16_b32 v2, v2, s0, s1
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlane16_i1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX12-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_permlane16_b32 v2, v2, s0, s1
+; GFX12-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX12-NEXT:    global_store_b8 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %v = call i1 @llvm.amdgcn.permlane16.i1(i1 %src0, i1 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+  store i1 %v, ptr addrspace(1) %out
+  ret void
+}
+
 define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %src1, i32 %src2) {
 ; GFX10-SDAG-LABEL: v_permlanex16_v2f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -9258,3 +9337,82 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr
   store <8 x i16> %v, ptr addrspace(1) %out
   ret void
 }
+
+define void @v_permlanex16_i8(ptr addrspace(1) %out, i8 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX10-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX10-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlanex16_i8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlanex16_i8:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX12-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-NEXT:    global_store_b8 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %v = call i8 @llvm.amdgcn.permlanex16.i8(i8 %src0, i8 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+  store i8 %v, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_permlanex16_i1(ptr addrspace(1) %out, i1 %src0, i32 %src1, i32 %src2) {
+; GFX10-LABEL: v_permlanex16_i1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX10-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX10-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
+; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT:    global_store_byte v[0:1], v2, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_permlanex16_i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_permlanex16_i1:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX12-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
+; GFX12-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX12-NEXT:    global_store_b8 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %v = call i1 @llvm.amdgcn.permlanex16.i1(i1 %src0, i1 %src0, i32 %src1, i32 %src2, i1 false, i1 false)
+  store i1 %v, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
index 216731519731a0..14e34d7fca8bc8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -52,6 +52,33 @@ define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 {
   store i32 %v, ptr addrspace(1) %out
   ret void
 }
+
+define void @test_i16(ptr addrspace(1) %out, i16 %src0) #1 {
+; GFX11-LABEL: test_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_permlane64_b32 v2, v2
+; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %v = call i16 @llvm.amdgcn.permlane64.i16(i16 %src0)
+  store i16 %v, ptr addrspace(1) %out
+  ret void
+}
+
+define void @test_i1(ptr addrspace(1) %out, i1 %src0) #1 {
+; GFX11-LABEL: test_i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_permlane64_b32 v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %v = call i1 @llvm.amdgcn.permlane64.i1(i1 %src0)
+  store i1 %v, ptr addrspace(1) %out
+  ret void
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11-GISEL: {{.*}}
 ; GFX11-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 39a3b1c8adc9f1..bf66d9dbaf0565 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -700,3 +700,51 @@ define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) {
   call void asm sideeffect "; use $0", "s"(<8 x i16> %x)
   ret void
 }
+
+define void @dpp8_i8(i8 %in, ptr addrspace(1) %out) {
+; CHECK-SDAG-LABEL: dpp8_i8:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-SDAG-NEXT:    flat_store_byte v[1:2], v0
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: dpp8_i8:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-GISEL-NEXT:    flat_store_byte v[1:2], v0
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %tmp0 = call i8 @llvm.amdgcn.readfirstlane.i8(i8 %in)
+  store i8 %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+define void @dpp8_i1(i1 %in, ptr addrspace(1) %out) {
+; CHECK-SDAG-LABEL: dpp8_i1:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; CHECK-SDAG-NEXT:    s_and_b32 s4, s4, 1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-SDAG-NEXT:    flat_store_byte v[1:2], v0
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: dpp8_i1:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; CHECK-GISEL-NEXT:    s_and_b32 s4, s4, 1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-GISEL-NEXT:    flat_store_byte v[1:2], v0
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %tmp0 = call i1 @llvm.amdgcn.readfirstlane.i1(i1 %in)
+  store i1 %tmp0, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 24a332fa211c15..b33929720ae1ba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -894,6 +894,54 @@ define void @test_readlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src
   ret void
 }
 
+define void @dpp8_i8(i8 %in, ptr addrspace(1) %out) {
+; CHECK-SDAG-LABEL: dpp8_i8:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v0, 1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-SDAG-NEXT:    flat_store_byte v[1:2], v0
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: dpp8_i8:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v0, 1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-GISEL-NEXT:    flat_store_byte v[1:2], v0
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %tmp0 = call i8 @llvm.amdgcn.readlane.i8(i8 %in, i32 1)
+  store i8 %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+define void @dpp8_i1(i1 %in, ptr addrspace(1) %out) {
+; CHECK-SDAG-LABEL: dpp8_i1:
+; CHECK-SDAG:       ; %bb.0:
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_readlane_b32 s4, v0, 1
+; CHECK-SDAG-NEXT:    s_and_b32 s4, s4, 1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-SDAG-NEXT:    flat_store_byte v[1:2], v0
+; CHECK-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: dpp8_i1:
+; CHECK-GISEL:       ; %bb.0:
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT:    v_readlane_b32 s4, v0, 1
+; CHECK-GISEL-NEXT:    s_and_b32 s4, s4, 1
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-GISEL-NEXT:    flat_store_byte v[1:2], v0
+; CHECK-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %tmp0 = call i1 @llvm.amdgcn.readlane.i1(i1 %in, i32 1)
+  store i1 %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 
 attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index fbf8c203dcb390..a9a03d3eefc2ac 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -861,6 +861,68 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
   ret void
 }
 
+define amdgpu_cs_chain void @set_inactive_chain_arg_i16(ptr addrspace(1) %out, i16 %inactive, i16 %active) {
+; GFX11-LABEL: set_inactive_chain_arg_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v10
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s0
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    global_store_b16 v[8:9], v1, off
+; GFX11-NEXT:    s_endpgm
+;
+; GFX10-LABEL: set_inactive_chain_arg_i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v10
+; GFX10-NEXT:    s_mov_b32 exec_lo, s0
+; GFX10-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s0
+; GFX10-NEXT:    s_mov_b32 exec_lo, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    global_store_short v[8:9], v1, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11_W64-LABEL: set_inactive_chain_arg_i16:
+; GFX11_W64:       ; %bb.0:
+; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v10
+; GFX11_W64-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11_W64-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s[0:1]
+; GFX11_W64-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11_W64-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11_W64-NEXT:    global_store_b16 v[8:9], v1, off
+; GFX11_W64-NEXT:    s_endpgm
+;
+; GFX10_W64-LABEL: set_inactive_chain_arg_i16:
+; GFX10_W64:       ; %bb.0:
+; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v10
+; GFX10_W64-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX10_W64-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s[0:1]
+; GFX10_W64-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX10_W64-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10_W64-NEXT:    global_store_short v[8:9], v1, off
+; GFX10_W64-NEXT:    s_endpgm
+  %tmp = call i16 @llvm.amdgcn.set.inactive.chain.arg.i16(i16 %active, i16 %inactive) #0
+  %wwm = call i16 @llvm.amdgcn.strict.wwm.i16(i16 %tmp)
+  store i16 %wwm, ptr addrspace(1) %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32, i32) #0
 declare i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64, i64) #0
 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 6fb5a9ce47a843..b51011ec29699f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -504,6 +504,29 @@ define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(
   ret void
 }
 
+define void @set_inactive_i16(ptr addrspace(1) %out, i16 %in) {
+; GCN-LABEL: set_inactive_i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    v_cndmask_b32_e64 v3, 3, v2, s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v2, v3
+; GCN-NEXT:    flat_store_short v[0:1], v2
+; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %tmp.0 = call i16 @llvm.amdgcn.set.inactive.i16(i16 %in, i16 3) #0
+  %tmp = call i16 @llvm.amdgcn.strict.wwm.i16(i16 %tmp.0)
+  store i16 %tmp, ptr addrspace(1) %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
 declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
 declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 837d484583d53f..0cd8e2c60589bf 100644
-...
[truncated]

@rampitec rampitec force-pushed the users/rampitec/11-04-_amdgpu_allow_lane-op_lowering_for_illegal_types branch 3 times, most recently from e3017f8 to 7a0daff on November 5, 2024 23:23
@rampitec rampitec force-pushed the users/rampitec/11-04-_amdgpu_allow_lane-op_lowering_for_illegal_types branch 5 times, most recently from 634483a to 7c4fa7d on November 8, 2024 00:35
@rampitec
Collaborator Author

rampitec commented Nov 8, 2024

Ping

2 similar comments
@rampitec
Collaborator Author

Ping

@rampitec
Collaborator Author

rampitec commented Dec 2, 2024

Ping

@jayfoad
Contributor

jayfoad commented Dec 3, 2024

I think this won't work for arbitrary illegal types like i999, right? It only works for i8 because we have an explicit setOperationAction(ISD::INTRINSIC_WO_CHAIN, ... i8 ..., Custom). So update the description.

Otherwise seems fine to me.
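
For context, the hook jayfoad refers to is registered in the SITargetLowering constructor; a sketch of what such a registration looks like (the type list here is illustrative, not the actual one in SIISelLowering.cpp):

```cpp
// Marking INTRINSIC_WO_CHAIN as Custom for an illegal type routes its
// nodes to ReplaceNodeResults/LowerOperation instead of the generic
// promoter. Types not listed (e.g. i999) still take the generic path,
// so this patch does not help them.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i8 /* , ... */}, Custom);
```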

@rampitec rampitec changed the title from "[AMDGPU] Allow lane-op lowering for illegal types" to "[AMDGPU] Allow lane-op lowering for some illegal types" on Dec 3, 2024
@rampitec
Collaborator Author

rampitec commented Dec 3, 2024

> I think this won't work for arbitrary illegal types like i999, right? It only works for i8 because we have an explicit setOperationAction(ISD::INTRINSIC_WO_CHAIN, ... i8 ..., Custom). So update the description.
>
> Otherwise seems fine to me.

Yes, you are right. Updated description and title.

@rampitec rampitec requested a review from jayfoad December 3, 2024 16:35
Contributor

@arsenm arsenm left a comment

What's the point of handling i1/i8?

@rampitec
Collaborator Author

rampitec commented Dec 3, 2024

> What's the point of handling i1/i8?

Clang can produce an intrinsic with a char argument from a builtin. It will be able to after #115090.
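
A hypothetical example of the Clang side (assuming #115090 lands and the builtin becomes type-overloaded; the wrapper function is made up for illustration):

```cpp
// HIP device code: with an overloaded builtin, a char argument would
// produce a call to llvm.amdgcn.readfirstlane.i8, which this patch lowers.
__device__ char broadcast_first_lane(char v) {
  return __builtin_amdgcn_readfirstlane(v);
}
```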

@arsenm
Contributor

arsenm commented Dec 6, 2024

> Clang can produce an intrinsic with a char argument from a builtin. It will be able to after #115090.

It makes sense to overload and accept the types that will naturally be passed in registers, but I don't think we want to permit cases where there's going to be an implicit cast. Maybe it's OK for i8, but i1 is special and we probably shouldn't treat it like an arbitrary VGPR value.

@rampitec
Collaborator Author

rampitec commented Dec 6, 2024

> > Clang can produce an intrinsic with a char argument from a builtin. It will be able to after #115090.
>
> It makes sense to overload and accept the types that will naturally be passed in registers, but I don't think we want to permit cases where there's going to be an implicit cast. Maybe it's OK for i8, but i1 is special and we probably shouldn't treat it like an arbitrary VGPR value.

It's really just tests added for i1. There is nothing in the code specifically checking for data types.

@rampitec
Collaborator Author

> > > Clang can produce an intrinsic with a char argument from a builtin. It will be able to after #115090.
>
> > It makes sense to overload and accept the types that will naturally be passed in registers, but I don't think we want to permit cases where there's going to be an implicit cast. Maybe it's OK for i8, but i1 is special and we probably shouldn't treat it like an arbitrary VGPR value.
>
> It's really just tests added for i1. There is nothing in the code specifically checking for data types.

Actually, I do not see a problem with the generated code. The result for i1 is masked as expected. I am not sure what exactly is being requested for this patch; having a correct result here is, IMO, better than an assert in the backend.

@rampitec
Collaborator Author

> > > > Clang can produce an intrinsic with a char argument from a builtin. It will be able to after #115090.
>
> > > It makes sense to overload and accept the types that will naturally be passed in registers, but I don't think we want to permit cases where there's going to be an implicit cast. Maybe it's OK for i8, but i1 is special and we probably shouldn't treat it like an arbitrary VGPR value.
>
> > It's really just tests added for i1. There is nothing in the code specifically checking for data types.
>
> Actually, I do not see a problem with the generated code. The result for i1 is masked as expected. I am not sure what exactly is being requested for this patch; having a correct result here is, IMO, better than an assert in the backend.

Ping. Would you mind explaining what exactly you want me to change here? There is no special handling of i1, just added tests.

@rampitec rampitec force-pushed the users/rampitec/11-04-_amdgpu_allow_lane-op_lowering_for_illegal_types branch from 7c4fa7d to 70dd649 on March 1, 2025 11:21
rampitec added 2 commits March 3, 2025 10:41
Currently, overloaded lane-op intrinsics only work for legal types. With SelectionDAG they fail with 'Do not know how to promote this operator', notably on the i8 type. The patch fixes that.
@rampitec rampitec force-pushed the users/rampitec/11-04-_amdgpu_allow_lane-op_lowering_for_illegal_types branch from 70dd649 to 6cc8a46 on March 3, 2025 22:59
@rampitec
Collaborator Author

rampitec commented Mar 3, 2025

ping
