Skip to content

Commit dac044a

Browse files
committed
[AMDGPU] Teach CalculateByteProvider about AMDGPUISD::PERM
Change-Id: I7ffca42eb53662e21f649540950660c076f66d9b
1 parent a07d4c0 commit dac044a

File tree

2 files changed

+53
-0
lines changed

2 files changed

+53
-0
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10762,6 +10762,24 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
1076210762
StartingIndex, Index);
1076310763
}
1076410764

10765+
case AMDGPUISD::PERM: {
10766+
auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
10767+
if (!PermMask)
10768+
return std::nullopt;
10769+
10770+
auto IdxMask =
10771+
(PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
10772+
if (IdxMask > 0x07 && IdxMask != 0x0c)
10773+
return std::nullopt;
10774+
10775+
auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
10776+
auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
10777+
10778+
return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
10779+
: ByteProvider<SDValue>(
10780+
ByteProvider<SDValue>::getConstantZero());
10781+
}
10782+
1076510783
default: {
1076610784
return std::nullopt;
1076710785
}

llvm/test/CodeGen/AMDGPU/permute_i8.ll

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2794,6 +2794,41 @@ define hidden void @extract3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
27942794
ret void
27952795
}
27962796

2797+
declare i32 @llvm.amdgcn.perm(i32, i32, i32)
2798+
2799+
; Test: two llvm.amdgcn.perm results, one supplying the low three bytes and one
; supplying the top byte, are OR'd together. The GFX9/GFX10 check lines below
; expect a single v_perm_b32 with selector 0x3070404 between the loads and the
; store, with no separate OR instruction.
define hidden void @extract_perm_3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
2800+
; GFX10-LABEL: extract_perm_3744:
2801+
; GFX10: ; %bb.0:
2802+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2803+
; GFX10-NEXT: global_load_dword v6, v[0:1], off
2804+
; GFX10-NEXT: global_load_dword v7, v[2:3], off
2805+
; GFX10-NEXT: s_waitcnt vmcnt(0)
2806+
; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3070404
2807+
; GFX10-NEXT: global_store_dword v[4:5], v0, off
2808+
; GFX10-NEXT: s_setpc_b64 s[30:31]
2809+
;
2810+
; GFX9-LABEL: extract_perm_3744:
2811+
; GFX9: ; %bb.0:
2812+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2813+
; GFX9-NEXT: global_load_dword v6, v[0:1], off
2814+
; GFX9-NEXT: global_load_dword v7, v[2:3], off
2815+
; GFX9-NEXT: s_mov_b32 s4, 0x3070404
2816+
; GFX9-NEXT: s_waitcnt vmcnt(0)
2817+
; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
2818+
; GFX9-NEXT: global_store_dword v[4:5], v0, off
2819+
; GFX9-NEXT: s_waitcnt vmcnt(0)
2820+
; GFX9-NEXT: s_setpc_b64 s[30:31]
2821+
%vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
2822+
%vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
2823+
%cast1 = bitcast <4 x i8> %vec1 to i32
2824+
%cast2 = bitcast <4 x i8> %vec2 to i32
2825+
; 201523200 == 0x0C030000: per-byte selectors (byte 0..3) are 0x00, 0x00, 0x03,
; 0x0c. The PERM case in calculateByteProvider treats 0x0c as constant zero, so
; only the low three bytes of %lo24 come from %cast1.
%lo24 = call i32 @llvm.amdgcn.perm(i32 %cast1, i32 %cast1, i32 201523200)
2826+
; 51121164 == 0x030C0C0C: bytes 0..2 use selector 0x0c (constant zero) and
; byte 3 uses selector 0x03, so only the top byte of %hi8 comes from %cast2.
%hi8 = call i32 @llvm.amdgcn.perm(i32 %cast2, i32 %cast2, i32 51121164)
2827+
; The two results have no overlapping non-zero bytes, so the OR merges them.
%res = or i32 %hi8, %lo24
2828+
store i32 %res, ptr addrspace(1) %out0, align 4
2829+
ret void
2830+
}
2831+
27972832
define hidden void @extract1347_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
27982833
; GFX10-LABEL: extract1347_v2i16:
27992834
; GFX10: ; %bb.0:

0 commit comments

Comments
 (0)