Skip to content

Commit 3e6589f

Browse files
[AMDGPU][GFX12] Add 16 bit atomic fadd instructions (#75917)
- image_atomic_pk_add_f16
- image_atomic_pk_add_bf16
- ds_pk_add_bf16
- ds_pk_add_f16
- ds_pk_add_rtn_bf16
- ds_pk_add_rtn_f16
- flat_atomic_pk_add_f16
- flat_atomic_pk_add_bf16
- global_atomic_pk_add_f16
- global_atomic_pk_add_bf16
- buffer_atomic_pk_add_f16
- buffer_atomic_pk_add_bf16
1 parent 28b7e49 commit 3e6589f

29 files changed

+1525
-21
lines changed

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,8 @@
100100
// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
101101
// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
102102
// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
103-
// GFX1200: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
104-
// GFX1201: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
103+
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
104+
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
105105

106106
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
107107

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
2+
// RUN: %s -S -emit-llvm -o - | FileCheck %s
3+
4+
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
5+
// RUN: -S -o - %s | FileCheck -check-prefix=GFX12 %s
6+
7+
// REQUIRES: amdgpu-registered-target
8+
9+
typedef half __attribute__((ext_vector_type(2))) half2;
10+
typedef short __attribute__((ext_vector_type(2))) short2;
11+
12+
// CHECK-LABEL: test_local_add_2bf16
13+
// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
14+
// GFX12-LABEL: test_local_add_2bf16
15+
// GFX12: ds_pk_add_rtn_bf16
16+
short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
17+
return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x);
18+
}
19+
20+
// CHECK-LABEL: test_local_add_2bf16_noret
21+
// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
22+
// GFX12-LABEL: test_local_add_2bf16_noret
23+
// GFX12: ds_pk_add_bf16
24+
void test_local_add_2bf16_noret(__local short2 *addr, short2 x) {
25+
__builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x);
26+
}
27+
28+
// CHECK-LABEL: test_local_add_2f16
29+
// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
30+
// GFX12-LABEL: test_local_add_2f16
31+
// GFX12: ds_pk_add_rtn_f16
32+
half2 test_local_add_2f16(__local half2 *addr, half2 x) {
33+
return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
34+
}
35+
36+
// CHECK-LABEL: test_local_add_2f16_noret
37+
// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
38+
// GFX12-LABEL: test_local_add_2f16_noret
39+
// GFX12: ds_pk_add_f16
40+
void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
41+
__builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
42+
}
43+
44+
// CHECK-LABEL: test_flat_add_2f16
45+
// CHECK: call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %{{.*}}, <2 x half> %{{.*}})
46+
// GFX12-LABEL: test_flat_add_2f16
47+
// GFX12: flat_atomic_pk_add_f16
48+
half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
49+
return __builtin_amdgcn_flat_atomic_fadd_v2f16(addr, x);
50+
}
51+
52+
// CHECK-LABEL: test_flat_add_2bf16
53+
// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %{{.*}}, <2 x i16> %{{.*}})
54+
// GFX12-LABEL: test_flat_add_2bf16
55+
// GFX12: flat_atomic_pk_add_bf16
56+
short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
57+
return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x);
58+
}
59+
60+
// CHECK-LABEL: test_global_add_half2
61+
// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %{{.*}}, <2 x half> %{{.*}})
62+
// GFX12-LABEL: test_global_add_half2
63+
// GFX12: global_atomic_pk_add_f16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN
64+
void test_global_add_half2(__global half2 *addr, half2 x) {
65+
// NOTE(review): rtn is declared without an initializer, so the store through
// it below dereferences an indeterminate pointer. Presumably it exists only to
// consume the builtin's result so that the returning form of the atomic is
// selected; confirm, and consider returning the value from the function
// instead (as the local/flat tests in this file do).
half2 *rtn;
66+
*rtn = __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x);
67+
}
68+
69+
// CHECK-LABEL: test_global_add_half2_noret
70+
// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %{{.*}}, <2 x half> %{{.*}})
71+
// GFX12-LABEL: test_global_add_half2_noret
72+
// GFX12: global_atomic_pk_add_f16 v[0:1], v2, off
73+
void test_global_add_half2_noret(__global half2 *addr, half2 x) {
74+
__builtin_amdgcn_global_atomic_fadd_v2f16(addr, x);
75+
}
76+
77+
// CHECK-LABEL: test_global_add_2bf16
78+
// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
79+
// GFX12-LABEL: test_global_add_2bf16
80+
// GFX12: global_atomic_pk_add_bf16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN
81+
void test_global_add_2bf16(__global short2 *addr, short2 x) {
82+
// NOTE(review): rtn is declared without an initializer, so the store through
// it below dereferences an indeterminate pointer. Presumably it exists only to
// consume the builtin's result so that the returning form of the atomic is
// selected; confirm, and consider returning the value from the function
// instead (as the local/flat tests in this file do).
short2 *rtn;
83+
*rtn = __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x);
84+
}
85+
86+
// CHECK-LABEL: test_global_add_2bf16_noret
87+
// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
88+
// GFX12-LABEL: test_global_add_2bf16_noret
89+
// GFX12: global_atomic_pk_add_bf16 v[0:1], v2, off
90+
void test_global_add_2bf16_noret(__global short2 *addr, short2 x) {
91+
__builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x);
92+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1032,6 +1032,9 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
10321032
defm int_amdgcn_image_atomic_cmpswap :
10331033
AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
10341034
AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
1035+
1036+
defm int_amdgcn_image_atomic_pk_add_f16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_F16">;
1037+
defm int_amdgcn_image_atomic_pk_add_bf16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_BF16">;
10351038
}
10361039

10371040
//////////////////////////////////////////////////////////////////////////
@@ -1316,6 +1319,26 @@ def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
13161319
// gfx908 intrinsic
13171320
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
13181321
def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
1322+
// gfx12+ intrinsic
1323+
def int_amdgcn_raw_buffer_atomic_fadd_v2bf16 : Intrinsic <
1324+
[llvm_v2bf16_ty],
1325+
[llvm_v2bf16_ty,
1326+
llvm_v4i32_ty,
1327+
llvm_i32_ty,
1328+
llvm_i32_ty,
1329+
llvm_i32_ty],
1330+
[ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
1331+
AMDGPURsrcIntrinsic<1, 0>;
1332+
def int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic <
1333+
[llvm_v2bf16_ty],
1334+
[llvm_v2bf16_ty,
1335+
AMDGPUBufferRsrcTy,
1336+
llvm_i32_ty,
1337+
llvm_i32_ty,
1338+
llvm_i32_ty],
1339+
[IntrArgMemOnly, NoCapture<ArgIndex<1>>,
1340+
ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
1341+
AMDGPURsrcIntrinsic<1, 0>;
13191342

13201343
class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
13211344
[data_ty],
@@ -1392,6 +1415,28 @@ def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
13921415
// gfx908 intrinsic
13931416
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
13941417
def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
1418+
// gfx12+ intrinsic
1419+
def int_amdgcn_struct_buffer_atomic_fadd_v2bf16 : Intrinsic <
1420+
[llvm_v2bf16_ty],
1421+
[llvm_v2bf16_ty,
1422+
llvm_v4i32_ty,
1423+
llvm_i32_ty,
1424+
llvm_i32_ty,
1425+
llvm_i32_ty,
1426+
llvm_i32_ty],
1427+
[ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
1428+
AMDGPURsrcIntrinsic<1, 0>;
1429+
def int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic <
1430+
[llvm_v2bf16_ty],
1431+
[llvm_v2bf16_ty,
1432+
AMDGPUBufferRsrcTy,
1433+
llvm_i32_ty,
1434+
llvm_i32_ty,
1435+
llvm_i32_ty,
1436+
llvm_i32_ty],
1437+
[IntrArgMemOnly, NoCapture<ArgIndex<1>>,
1438+
ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
1439+
AMDGPURsrcIntrinsic<1, 0>;
13951440

13961441
// gfx90a intrinsics
13971442
def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1499,6 +1499,10 @@ def FeatureISAVersion12 : FeatureSet<
14991499
FeatureArchitectedFlatScratch,
15001500
FeatureAtomicFaddRtnInsts,
15011501
FeatureAtomicFaddNoRtnInsts,
1502+
FeatureAtomicDsPkAdd16Insts,
1503+
FeatureAtomicFlatPkAdd16Insts,
1504+
FeatureAtomicBufferGlobalPkAddF16Insts,
1505+
FeatureAtomicGlobalPkAddBF16Inst,
15021506
FeatureFlatAtomicFaddF32Inst,
15031507
FeatureImageInsts,
15041508
FeatureExtendedImageInsts,

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>;
261261
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
262262
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
263263
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>;
264+
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD_BF16, SIbuffer_atomic_fadd_bf16>;
264265
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
265266
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
266267
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5478,6 +5478,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
54785478
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
54795479
NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
54805480
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5481+
NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16)
54815482
NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
54825483
NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
54835484
NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,7 @@ enum NodeType : unsigned {
591591
BUFFER_ATOMIC_CMPSWAP,
592592
BUFFER_ATOMIC_CSUB,
593593
BUFFER_ATOMIC_FADD,
594+
BUFFER_ATOMIC_FADD_BF16,
594595
BUFFER_ATOMIC_FMIN,
595596
BUFFER_ATOMIC_FMAX,
596597
BUFFER_ATOMIC_COND_SUB_U32,

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5883,6 +5883,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
58835883
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
58845884
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
58855885
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
5886+
// All four v2bf16 buffer fadd intrinsics (raw/struct, including the
// ptr-resource forms) map to the BF16 pseudo, mirroring the plain fadd case
// group above. The ptr variants are routed to legalizeBufferAtomic by
// legalizeIntrinsic, so they need a mapping here as well.
case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
5887+
case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
5888+
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16;
58865889
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
58875890
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
58885891
case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
@@ -6093,6 +6096,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
60936096
Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
60946097
LLT Ty = MRI->getType(VData);
60956098

6099+
const bool IsAtomicPacked16Bit =
6100+
(BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6101+
BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6102+
60966103
// Check for 16 bit addresses and pack if true.
60976104
LLT GradTy =
60986105
MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
@@ -6101,7 +6108,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
61016108
const bool IsG16 =
61026109
ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
61036110
const bool IsA16 = AddrTy == S16;
6104-
const bool IsD16 = Ty.getScalarType() == S16;
6111+
const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
61056112

61066113
int DMaskLanes = 0;
61076114
if (!BaseOpcode->Atomic) {
@@ -6143,7 +6150,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
61436150
LLT Ty = MRI->getType(VData0);
61446151

61456152
// TODO: Allow atomic swap and bit ops for v2s16/v4s16
6146-
if (Ty.isVector())
6153+
if (Ty.isVector() && !IsAtomicPacked16Bit)
61476154
return false;
61486155

61496156
if (BaseOpcode->AtomicX2) {
@@ -6279,9 +6286,18 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
62796286
if (NumElts > 4 || DMaskLanes > 4)
62806287
return false;
62816288

6289+
// Image atomic instructions use DMask to specify how many bits the
6290+
// input/output data has: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6291+
// For image atomics, DMaskLanes has its default value of 0.
6292+
// We must make sure that atomic variants (especially packed ones) are not
6293+
// truncated from v2s16 or v4s16 to s16.
6294+
//
6295+
// changeElementCount is needed for image loads, where Ty is always scalar.
62826296
const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
62836297
const LLT AdjustedTy =
6284-
Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6298+
DMaskLanes == 0
6299+
? Ty
6300+
: Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
62856301

62866302
// The raw dword aligned data component of the load. The only legal cases
62876303
// where this matters should be when using the packed D16 format, for
@@ -7101,6 +7117,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
71017117
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
71027118
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
71037119
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7120+
case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
7121+
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
7122+
case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
7123+
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
71047124
return legalizeBufferAtomic(MI, B, IntrID);
71057125
case Intrinsic::trap:
71067126
return legalizeTrapIntrinsic(MI, MRI, B);

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3074,6 +3074,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
30743074
return;
30753075
}
30763076
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3077+
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16:
30773078
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
30783079
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
30793080
applyDefaultMapping(OpdMapper);
@@ -4362,6 +4363,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
43624363
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
43634364
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
43644365
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4366+
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16:
43654367
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
43664368
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
43674369
// vdata_out

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,7 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>;
280280
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>;
281281
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>;
282282
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
283+
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd_v2bf16>;
283284
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>;
284285
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>;
285286
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
@@ -297,6 +298,7 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_xor>;
297298
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_inc>;
298299
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_dec>;
299300
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>;
301+
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16>;
300302
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>;
301303
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>;
302304
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>;
@@ -314,6 +316,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
314316
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
315317
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
316318
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
319+
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd_v2bf16>;
317320
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>;
318321
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>;
319322
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
@@ -331,6 +334,7 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_xor>;
331334
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_inc>;
332335
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_dec>;
333336
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>;
337+
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16>;
334338
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>;
335339
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>;
336340
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>;

llvm/lib/Target/AMDGPU/BUFInstructions.td

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,6 +1245,11 @@ let SubtargetPredicate = isGFX12Plus in {
12451245
defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Pseudo_Atomics <
12461246
"buffer_atomic_cond_sub_u32", VGPR_32, i32
12471247
>;
1248+
1249+
let FPAtomic = 1 in
1250+
defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics <
1251+
"buffer_atomic_pk_add_bf16", VGPR_32, v2bf16
1252+
>;
12481253
}
12491254

12501255
//===----------------------------------------------------------------------===//
@@ -1711,6 +1716,7 @@ let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
17111716
defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>;
17121717

17131718
let SubtargetPredicate = isGFX12Plus in {
1719+
defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd_bf16", v2bf16, "BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">;
17141720
defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["ret"]>;
17151721

17161722
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
@@ -1781,14 +1787,22 @@ let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
17811787
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>;
17821788

17831789
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
1790+
let SubtargetPredicate = isGFX9Only in
17841791
defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>;
1792+
1793+
let SubtargetPredicate = isGFX12Plus in
1794+
defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["noret"]>;
17851795
} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts]
17861796

17871797
let OtherPredicates = [HasAtomicFaddRtnInsts] in
17881798
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>;
17891799

17901800
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
1801+
let SubtargetPredicate = isGFX9Only in
17911802
defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
1803+
1804+
let SubtargetPredicate = isGFX12Plus in
1805+
defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["ret"]>;
17921806
} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts]
17931807

17941808
let OtherPredicates = [isGFX90APlus] in {
@@ -2645,6 +2659,8 @@ defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x033,
26452659
defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x041, "buffer_atomic_swap_b64">;
26462660
defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03E, "buffer_atomic_xor_b32">;
26472661
defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x04B, "buffer_atomic_xor_b64">;
2662+
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_gfx12<0x059>;
2663+
defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Real_Atomic_gfx12<0x05a>;
26482664

26492665
//===----------------------------------------------------------------------===//
26502666
// MUBUF - GFX10.

0 commit comments

Comments
 (0)