AMDGPU: Fix buffer load/store of pointers #95379


Merged: arsenm merged 2 commits into main from users/arsenm/amdgpu-fix-buffer-load-store-pointer on Jun 18, 2024

Conversation

arsenm (Contributor) commented on Jun 13, 2024

Make sure we test all the address spaces since this support isn't
free in gisel.

llvmbot (Member) commented on Jun 13, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Make sure we test all the address spaces since this support isn't
free in gisel.


Patch is 38.37 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95379.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+19-12)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll (+596)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll (+144)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 81098201e9c0f..7a36c88b892c8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1112,29 +1112,33 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
     Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
 }
 
-static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
+static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
+                                 const DataLayout &DL, Type *Ty,
+                                 unsigned MaxNumLanes) {
   assert(MaxNumLanes != 0);
 
+  LLVMContext &Ctx = Ty->getContext();
   if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
     unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
-    return EVT::getVectorVT(Ty->getContext(),
-                            EVT::getEVT(VT->getElementType()),
+    return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
                             NumElts);
   }
 
-  return EVT::getEVT(Ty);
+  return TLI.getValueType(DL, Ty);
 }
 
 // Peek through TFE struct returns to only use the data size.
-static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
+static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
+                                   const DataLayout &DL, Type *Ty,
+                                   unsigned MaxNumLanes) {
   auto *ST = dyn_cast<StructType>(Ty);
   if (!ST)
-    return memVTFromLoadIntrData(Ty, MaxNumLanes);
+    return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
 
   // TFE intrinsics return an aggregate type.
   assert(ST->getNumContainedTypes() == 2 &&
          ST->getContainedType(1)->isIntegerTy(32));
-  return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
+  return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
 }
 
 /// Map address space 7 to MVT::v5i32 because that's its in-memory
@@ -1219,10 +1223,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
           MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
         }
 
-        Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
+        Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
+                                             CI.getType(), MaxNumLanes);
       } else {
-        Info.memVT = memVTFromLoadIntrReturn(
-            CI.getType(), std::numeric_limits<unsigned>::max());
+        Info.memVT =
+            memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
+                                    std::numeric_limits<unsigned>::max());
       }
 
       // FIXME: What does alignment mean for an image?
@@ -1235,9 +1241,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
       if (RsrcIntr->IsImage) {
         unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
         unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
-        Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
+        Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
+                                           DMaskLanes);
       } else
-        Info.memVT = EVT::getEVT(DataTy);
+        Info.memVT = getValueType(MF.getDataLayout(), DataTy);
 
       Info.flags |= MachineMemOperand::MOStore;
     } else {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
index 3e3371091ef72..4d557c76dc4d0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -1280,6 +1280,602 @@ define <2 x i64> @buffer_load_v2i64__voffset_add(ptr addrspace(8) inreg %rsrc, i
   ret <2 x i64> %data
 }
 
+define ptr @buffer_load_p0__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
+; PREGFX10-LABEL: buffer_load_p0__voffset_add:
+; PREGFX10:       ; %bb.0:
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_p0__voffset_add:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_load_p0__voffset_add:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:60
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %voffset.add = add i32 %voffset, 60
+  %data = call ptr @llvm.amdgcn.raw.ptr.buffer.load.p0(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
+  ret ptr %data
+}
+
+define <2 x ptr> @buffer_load_v2p0__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
+; PREGFX10-LABEL: buffer_load_v2p0__voffset_add:
+; PREGFX10:       ; %bb.0:
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_v2p0__voffset_add:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_load_v2p0__voffset_add:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:60
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %voffset.add = add i32 %voffset, 60
+  %data = call <2 x ptr> @llvm.amdgcn.raw.ptr.buffer.load.p0(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
+  ret <2 x ptr> %data
+}
+
+define ptr addrspace(1) @buffer_load_p1__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
+; PREGFX10-LABEL: buffer_load_p1__voffset_add:
+; PREGFX10:       ; %bb.0:
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_p1__voffset_add:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_load_p1__voffset_add:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:60
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %voffset.add = add i32 %voffset, 60
+  %data = call ptr addrspace(1) @llvm.amdgcn.raw.ptr.buffer.load.p1(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
+  ret ptr addrspace(1) %data
+}
+
+define <2 x ptr addrspace(1)> @buffer_load_v2p1__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
+; PREGFX10-LABEL: buffer_load_v2p1__voffset_add:
+; PREGFX10:       ; %bb.0:
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_v2p1__voffset_add:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_load_v2p1__voffset_add:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:60
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %voffset.add = add i32 %voffset, 60
+  %data = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.p1(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
+  ret <2 x ptr addrspace(1)> %data
+}
+
+define ptr addrspace(4) @buffer_load_p4__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
+; PREGFX10-LABEL: buffer_load_p4__voffset_add:
+; PREGFX10:       ; %bb.0:
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_p4__voffset_add:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_load_p4__voffset_add:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:60
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %voffset.add = add i32 %voffset, 60
+  %data = call ptr addrspace(4) @llvm.amdgcn.raw.ptr.buffer.load.p4(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
+  ret ptr addrspace(4) %data
+}
+
+define <2 x ptr addrspace(4)> @buffer_load_v2p4__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
+; PREGFX10-LABEL: buffer_load_v2p4__voffset_add:
+; PREGFX10:       ; %bb.0:
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_v2p4__voffset_add:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_load_v2p4__voffset_add:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:60
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %voffset.add = add i32 %voffset, 60
+  %data = call <2 x ptr addrspace(4)> @llvm.amdgcn.raw.ptr.buffer.load.p4(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
+  ret <2 x ptr addrspace(4)> %data
+}
+
+define ptr addrspace(999) @buffer_load_p999__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
+; PREGFX10-LABEL: buffer_load_p999__voffset_add:
+; PREGFX10:       ; %bb.0:
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_p999__voffset_add:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_load_p999__voffset_add:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:60
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %voffset.add = add i32 %voffset, 60
+  %data = call ptr addrspace(999) @llvm.amdgcn.raw.ptr.buffer.load.p999(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
+  ret ptr addrspace(999) %data
+}
+
+define <2 x ptr addrspace(999)> @buffer_load_v2p999__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
+; PREGFX10-LABEL: buffer_load_v2p999__voffset_add:
+; PREGFX10:       ; %bb.0:
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_v2p999__voffset_add:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_load_v2p999__voffset_add:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:60
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %voffset.add = add i32 %voffset, 60
+  %data = call <2 x ptr addrspace(999)> @llvm.amdgcn.raw.ptr.buffer.load.p999(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
+  ret <2 x ptr addrspace(999)> %data
+}
+
+define ptr addrspace(2) @buffer_load_p2__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
+; PREGFX10-LABEL: buffer_load_p2__voffset_add:
+; PREGFX10:       ; %bb.0:
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_p2__voffset_add:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_load_p2__voffset_add:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:60
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %voffset.add = add i32 %voffset, 60
+  %data = call ptr addrspace(2) @llvm.amdgcn.raw.ptr.buffer.load.p2(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
+  ret ptr addrspace(2) %data
+}
+
+define <2 x ptr addrspace(2)> @buffer_load_v2p2__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
+; PREGFX10-LABEL: buffer_load_v2p2__voffset_add:
+; PREGFX10:       ; %bb.0:
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_v2p2__voffset_add:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_load_v2p2__voffset_add:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:60
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %voffset.add = add i32 %voffset, 60
+  %data = call <2 x ptr addrspace(2)> @llvm.amdgcn.raw.ptr.buffer.load.v2p2(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
+  ret <2 x ptr addrspace(2)> %data
+}
+
+define <3 x ptr addrspace(2)> @buffer_load_v3p2__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
+; GFX10-LABEL: buffer_load_v3p2__voffset_add:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dwordx3 v[0:2], v0, s[4:7], 0 offen offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_load_v3p2__voffset_add:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b96 v[0:2], v0, s[0:3], 0 offen offset:60
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %voffset.add = add i32 %voffset, 60
+  %data = call <3 x ptr addrspace(2)> @llvm.amdgcn.raw.ptr.buffer.load.v3p2(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
+  ret <3 x ptr addrspace(2)> %data
+}
+
+define <4 x ptr addrspace(2)> @buffer_load_v4p2__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
+; PREGFX10-LABEL: buffer_load_v4p2__voffset_add:
+; PREGFX10:       ; %bb.0:
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_v4p2__voffset_add:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_load_v4p2__voffset_add:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:60
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %voffset.add = add i32 %voffset, 60
+  %data = call <4 x ptr addrspace(2)> @llvm.amdgcn.raw.ptr.buffer.load.v4p2(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
+  ret <4 x ptr addrspace(2)> %data
+}
+
+define ptr addrspace(3) @buffer_load_p3__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
+; PREGFX10-LABEL: buffer_load_p3__voffset_add:
+; PREGFX10:       ; %bb.0:
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_p3__voffset_add:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_load_p3__voffset_add:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen offset:60
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %voffset.add = add i32 %voffset, 60
+  %data = call ptr addrspace(3) @llvm.amdgcn.raw.ptr.buffer.load.p3(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
+  ret ptr addrspace(3) %data
+}
+
+define <2 x ptr addrspace(3)> @buffer_load_v2p3__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
+; PREGFX10-LABEL: buffer_load_v2p3__voffset_add:
+; PREGFX10:       ; %bb.0:
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; PREGFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
+; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
+; PREGFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_load_v2p3__voffset_add:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_load_v2p3__voffset_add:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:60
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %voffset.add = add i32 %voffset, 60
+  %data = call <2 x ptr addrspace(3)> @llvm.amdgcn.raw.ptr.buffer.load.v2p3(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0)
+  ret <2 x ptr addrspace(3)> %data
+}
+
+define <3 x ptr addrspace(3)> @buffer_load_v3p3__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) {
+; GFX10-LABEL: buffer_load_v3p3__voffset_add:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dwordx3 v[0:2], v0, s[4:7], 0 offen offset:60
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_load_v3p3__voffset_add:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buff...
[truncated]

arsenm marked this pull request as ready for review on June 13, 2024 09:20
arsenm force-pushed the users/arsenm/amdgpu-cleanup-buffer-patterns-all-register-types branch from 46c7f8b to 1dfcc09 on June 13, 2024 11:02
arsenm force-pushed the users/arsenm/amdgpu-fix-buffer-load-store-pointer branch from b05179e to 1469532 on June 13, 2024 11:02
krzysz00 (Contributor) commented

So, general question on this patch series:

Wouldn't it be more reasonable to, instead of having separate handling for all the possible register types, always do loads as i8, i16, i32, <2 x i32>, <3 x i32>, or <4 x i32>, and then bitcast/merge_values/... the results back to their type?

Or at least to have that fallback path - if we don't know what a type is, load/store it as its bits?

(Then we wouldn't need to, for example, go back and add a <16 x i8> case if someone realizes they want that)
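
To make the proposal concrete, here is a rough IR-level sketch of the "load as bits" fallback for a 64-bit pointer result (hypothetical: in practice the casts would be built as SelectionDAG nodes during lowering, with the inttoptr standing in for the pointer rebuild):

  %bits = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 0)
  %int = bitcast <2 x i32> %bits to i64
  %data = inttoptr i64 %int to ptr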

arsenm (Contributor, Author) commented on Jun 13, 2024

That's what we've traditionally done and I think we should stop. We currently skip inserting the casts if the type is legal.

It introduces extra bitcasts, which have a cost and increase pattern match complexity. We have a bunch of patterns that don't bother to look through the casts for a load/store.

Ideally we would have a magic match-any-bitwidth type matcher in TableGen.

krzysz00 (Contributor) commented

Yeah, makes sense.

... what prevents a match-bitwidth operator from existing?

Context from where I'm standing is that you should be able to raw.buffer.load/store any type you could load or store (non-aggregate, let's say, since aggregates could be better handled by the addrspace(7) lowering).

That is, raw.ptr.buffer.load.i15 should work (as an i16 load whose result is truncated), as should raw.ptr.buffer.store.v8f32 (or raw.ptr.buffer.store.i256). Sure, the latter are two instructions long, but regular loads legalize to multiple instructions just fine.
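
For instance, the i15 case would presumably come out as (a sketch, assuming the intrinsic accepted the type):

  %wide = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 0)
  %data = trunc i16 %wide to i15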

My thoughts on how to implement that second behavior were to split the type into legal chunks and add in the offsets, and then merge/bitcast the values back.
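
Roughly, for the i256 case, that splitting would look like this at the IR level (a sketch only; the real transform would run during legalization):

  %lo = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 0)
  %voffset.hi = add i32 %voffset, 16
  %hi = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %rsrc, i32 %voffset.hi, i32 0, i32 0)
  %cat = shufflevector <4 x i32> %lo, <4 x i32> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %data = bitcast <8 x i32> %cat to i256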

arsenm (Contributor, Author) commented on Jun 13, 2024

I don't think we should be trying to handle the unreasonable illegal types in the intrinsics themselves. Theoretically the intrinsic should correspond to direct support.

We would handle the ugly types in the fat pointer lowering in terms of the intrinsics.

krzysz00 (Contributor) commented on Jun 13, 2024

On the other hand, it's a lot easier to handle ugly types down in instruction selection, where you get to play much more fast and loose with types.

And there are buffer uses that don't fit into the fat pointer use case where we'd still want them to work.
For example, both struct.ptr.buffer.load.v6f16 and struct.ptr.buffer.load.v3f32 should be a buffer_load_dwordx3, but I'm pretty sure <6 x half> isn't a register type.
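
Both of the calls below describe the same 96-bit load and ought to select to buffer_load_dwordx3 (a sketch: the v3f32 form works today; the v6f16 form is the one being argued for):

  %a = call <6 x half> @llvm.amdgcn.struct.ptr.buffer.load.v6f16(ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
  %b = call <3 x float> @llvm.amdgcn.struct.ptr.buffer.load.v3f32(ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)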

The load and store intrinsics are already overloaded to handle various {8, 16, ..., 128}-bit types, and it seems much cleaner to let them support any type of those lengths. It's just a load/store with somewhat weird indexing semantics, is all.

And then, since we're there ... load i256, ptr addrspace(1) %p legalizes to multiple instructions, and {raw,struct}.ptr.buffer.load.i256(ptr addrspace(8) %p, i32 %offset, ...) should too. It's just a load, after all.

arsenm (Contributor, Author) commented on Jun 14, 2024

On the other hand, it's a lot easier to handle ugly types down in instruction selection, where you get to play much more fast and loose with types.

I think it's mostly easier to do this in the IR

And there are buffer uses that don't fit into the fat pointer use case where we'd still want them to work. For example, both struct.ptr.buffer.load.v6f16 and struct.ptr.buffer.load.v3f32 should be a buffer_load_dwordx3, but I'm pretty sure <6 x half> isn't a register type.

Yes, we should just fix this one

The load and store intrinsics are already overloaded to handle various {8, 16, ..., 128}-bit types, and it seems much cleaner to let them support any type of those lengths. It's just a load/store with somewhat weird indexing semantics, is all.

Splitting is pretty ugly too, especially for a truly arbitrary type in legalization.

arsenm force-pushed the users/arsenm/amdgpu-cleanup-buffer-patterns-all-register-types branch from 1dfcc09 to 7b7f5d5 on June 15, 2024 08:12
arsenm force-pushed the users/arsenm/amdgpu-fix-buffer-load-store-pointer branch from 1469532 to c895288 on June 15, 2024 08:12
arsenm added 2 commits on June 17, 2024 19:40:

  • We should just support these for all register types.
  • Make sure we test all the address spaces since this support isn't free in gisel.
arsenm force-pushed the users/arsenm/amdgpu-cleanup-buffer-patterns-all-register-types branch from 7b7f5d5 to 5fad2b5 on June 17, 2024 17:43
arsenm force-pushed the users/arsenm/amdgpu-fix-buffer-load-store-pointer branch from c895288 to cad588c on June 17, 2024 17:43
Base automatically changed from users/arsenm/amdgpu-cleanup-buffer-patterns-all-register-types to main on June 17, 2024 19:51
arsenm merged commit 89fd195 into main on Jun 18, 2024
6 checks passed
arsenm deleted the users/arsenm/amdgpu-fix-buffer-load-store-pointer branch on June 18, 2024 18:52
AlexisPerry pushed a commit to llvm-project-tlp/llvm-project that referenced this pull request on Jul 9, 2024
Make sure we test all the address spaces since this support isn't
free in gisel.