Skip to content

Commit b6fa394

Browse files
committed
clang/AMDGPU: Emit atomicrmw from ds_fadd builtins
We should have done this for the f32/f64 case a long time ago. Now that codegen handles atomicrmw selection for the v2f16/v2bf16 case, start emitting it instead. This also does upgrade the behavior to respect a volatile qualified pointer, which was previously ignored (for the cases that don't have an explicit volatile argument).
1 parent c516231 commit b6fa394

11 files changed

+139
-62
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 71 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -18140,9 +18140,35 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
1814018140
break;
1814118141
}
1814218142

18143+
// Some of the atomic builtins take the scope as a string name.
1814318144
StringRef scp;
18144-
llvm::getConstantStringInfo(Scope, scp);
18145-
SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
18145+
if (llvm::getConstantStringInfo(Scope, scp)) {
18146+
SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
18147+
return;
18148+
}
18149+
18150+
// Older builtins had an enum argument for the memory scope.
18151+
int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
18152+
switch (scope) {
18153+
case 0: // __MEMORY_SCOPE_SYSTEM
18154+
SSID = llvm::SyncScope::System;
18155+
break;
18156+
case 1: // __MEMORY_SCOPE_DEVICE
18157+
SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
18158+
break;
18159+
case 2: // __MEMORY_SCOPE_WRKGRP
18160+
SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
18161+
break;
18162+
case 3: // __MEMORY_SCOPE_WVFRNT
18163+
SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
18164+
break;
18165+
case 4: // __MEMORY_SCOPE_SINGLE
18166+
SSID = llvm::SyncScope::SingleThread;
18167+
break;
18168+
default:
18169+
SSID = llvm::SyncScope::System;
18170+
break;
18171+
}
1814618172
}
1814718173

1814818174
llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
@@ -18558,14 +18584,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1855818584
Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
1855918585
return Builder.CreateCall(F, { Src0, Builder.getFalse() });
1856018586
}
18561-
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
1856218587
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1856318588
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
1856418589
Intrinsic::ID Intrin;
1856518590
switch (BuiltinID) {
18566-
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
18567-
Intrin = Intrinsic::amdgcn_ds_fadd;
18568-
break;
1856918591
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
1857018592
Intrin = Intrinsic::amdgcn_ds_fmin;
1857118593
break;
@@ -18656,35 +18678,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1865618678
llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
1865718679
return Builder.CreateCall(F, {Addr, Val});
1865818680
}
18659-
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
18660-
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
18661-
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: {
18662-
Intrinsic::ID IID;
18663-
llvm::Type *ArgTy;
18664-
switch (BuiltinID) {
18665-
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
18666-
ArgTy = llvm::Type::getFloatTy(getLLVMContext());
18667-
IID = Intrinsic::amdgcn_ds_fadd;
18668-
break;
18669-
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
18670-
ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
18671-
IID = Intrinsic::amdgcn_ds_fadd;
18672-
break;
18673-
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
18674-
ArgTy = llvm::FixedVectorType::get(
18675-
llvm::Type::getHalfTy(getLLVMContext()), 2);
18676-
IID = Intrinsic::amdgcn_ds_fadd;
18677-
break;
18678-
}
18679-
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
18680-
llvm::Value *Val = EmitScalarExpr(E->getArg(1));
18681-
llvm::Constant *ZeroI32 = llvm::ConstantInt::getIntegerValue(
18682-
llvm::Type::getInt32Ty(getLLVMContext()), APInt(32, 0, true));
18683-
llvm::Constant *ZeroI1 = llvm::ConstantInt::getIntegerValue(
18684-
llvm::Type::getInt1Ty(getLLVMContext()), APInt(1, 0));
18685-
llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
18686-
return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
18687-
}
1868818681
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
1868918682
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
1869018683
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -19044,7 +19037,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1904419037
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
1904519038
case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
1904619039
case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
19047-
case AMDGPU::BI__builtin_amdgcn_atomic_dec64: {
19040+
case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
19041+
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
19042+
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
19043+
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
19044+
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
19045+
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
1904819046
llvm::AtomicRMWInst::BinOp BinOp;
1904919047
switch (BuiltinID) {
1905019048
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19055,23 +19053,54 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1905519053
case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
1905619054
BinOp = llvm::AtomicRMWInst::UDecWrap;
1905719055
break;
19056+
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
19057+
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
19058+
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
19059+
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
19060+
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
19061+
BinOp = llvm::AtomicRMWInst::FAdd;
19062+
break;
1905819063
}
1905919064

1906019065
Address Ptr = CheckAtomicAlignment(*this, E);
1906119066
Value *Val = EmitScalarExpr(E->getArg(1));
19067+
llvm::Type *OrigTy = Val->getType();
19068+
QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
1906219069

19063-
ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
19064-
EmitScalarExpr(E->getArg(3)), AO, SSID);
19070+
bool Volatile;
1906519071

19066-
QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
19067-
bool Volatile =
19068-
PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
19072+
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf) {
19073+
// __builtin_amdgcn_ds_faddf has an explicit volatile argument
19074+
Volatile =
19075+
cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
19076+
} else {
19077+
// Infer volatile from the passed type.
19078+
Volatile =
19079+
PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
19080+
}
19081+
19082+
if (E->getNumArgs() >= 4) {
19083+
// Some of the builtins have explicit ordering and scope arguments.
19084+
ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
19085+
EmitScalarExpr(E->getArg(3)), AO, SSID);
19086+
} else {
19087+
// The ds_fadd_* builtins do not have syncscope/order arguments.
19088+
SSID = llvm::SyncScope::System;
19089+
AO = AtomicOrdering::SequentiallyConsistent;
19090+
19091+
// The v2bf16 builtin uses i16 instead of a natural bfloat type.
19092+
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
19093+
llvm::Type *V2BF16Ty = FixedVectorType::get(
19094+
llvm::Type::getBFloatTy(Builder.getContext()), 2);
19095+
Val = Builder.CreateBitCast(Val, V2BF16Ty);
19096+
}
19097+
}
1906919098

1907019099
llvm::AtomicRMWInst *RMW =
1907119100
Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
1907219101
if (Volatile)
1907319102
RMW->setVolatile(true);
19074-
return RMW;
19103+
return Builder.CreateBitCast(RMW, OrigTy);
1907519104
}
1907619105
case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
1907719106
case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {

clang/test/CodeGenCUDA/builtins-amdgcn.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ __global__
115115
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
116116
// CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
117117
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
118-
// CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
118+
// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]] monotonic, align 4
119119
// CHECK-NEXT: store volatile float [[TMP1]], ptr [[X_ASCAST]], align 4
120120
// CHECK-NEXT: ret void
121121
//

clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ __global__
112112
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
113113
// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
114114
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
115-
// CHECK-NEXT: [[TMP1:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
115+
// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]] monotonic, align 4
116116
// CHECK-NEXT: store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
117117
// CHECK-NEXT: ret void
118118
//

clang/test/CodeGenCUDA/builtins-unsafe-atomics-gfx90a.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,10 @@ typedef __attribute__((address_space(3))) float *LP;
1111
// CHECK: store ptr %addr, ptr %[[ADDR_ADDR_ASCAST_PTR]], align 8
1212
// CHECK: %[[ADDR_ADDR_ASCAST:.*]] = load ptr, ptr %[[ADDR_ADDR_ASCAST_PTR]], align 8
1313
// CHECK: %[[AS_CAST:.*]] = addrspacecast ptr %[[ADDR_ADDR_ASCAST]] to ptr addrspace(3)
14-
// CHECK: %3 = call contract float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %[[AS_CAST]]
14+
// CHECK: [[TMP2:%.+]] = load float, ptr %val.addr.ascast, align 4
15+
// CHECK: [[TMP3:%.+]] = atomicrmw fadd ptr addrspace(3) %[[AS_CAST]], float [[TMP2]] monotonic, align 4
1516
// CHECK: %4 = load ptr, ptr %rtn.ascast, align 8
16-
// CHECK: store float %3, ptr %4, align 4
17+
// CHECK: store float [[TMP3]], ptr %4, align 4
1718
__device__ void test_ds_atomic_add_f32(float *addr, float val) {
1819
float *rtn;
1920
*rtn = __builtin_amdgcn_ds_faddf((LP)addr, val, 0, 0, 0);

clang/test/CodeGenCUDA/builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ typedef __attribute__((address_space(3))) float *LP;
2020
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ADDR_ADDR_ASCAST]], align 8
2121
// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3)
2222
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[VAL_ADDR_ASCAST]], align 4
23-
// CHECK-NEXT: [[TMP3:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) [[TMP1]], float [[TMP2]], i32 0, i32 0, i1 false)
23+
// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
2424
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[RTN_ASCAST]], align 8
2525
// CHECK-NEXT: store float [[TMP3]], ptr addrspace(4) [[TMP4]], align 4
2626
// CHECK-NEXT: ret void

clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,13 +117,44 @@ void test_update_dpp(global int* out, int arg1, int arg2)
117117
}
118118

119119
// CHECK-LABEL: @test_ds_fadd
120-
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
120+
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
121+
// CHECK: atomicrmw volatile fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
122+
123+
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acquire, align 4{{$}}
124+
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acquire, align 4{{$}}
125+
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src release, align 4{{$}}
126+
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acq_rel, align 4{{$}}
127+
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
128+
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
129+
130+
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
131+
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
132+
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
133+
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
134+
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
121135
#if !defined(__SPIRV__)
122136
void test_ds_faddf(local float *out, float src) {
123137
#else
124-
void test_ds_faddf(__attribute__((address_space(3))) float *out, float src) {
138+
void test_ds_faddf(__attribute__((address_space(3))) float *out, float src) {
125139
#endif
126-
*out = __builtin_amdgcn_ds_faddf(out, src, 0, 0, false);
140+
141+
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM, false);
142+
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM, true);
143+
144+
// Test all orders.
145+
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_CONSUME, __MEMORY_SCOPE_SYSTEM, false);
146+
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM, false);
147+
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM, false);
148+
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_ACQ_REL, __MEMORY_SCOPE_SYSTEM, false);
149+
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false);
150+
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false); // invalid
151+
152+
// Test all syncscopes.
153+
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false);
154+
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false);
155+
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false);
156+
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false);
157+
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, 5, false); // invalid
127158
}
128159

129160
// CHECK-LABEL: @test_ds_fmin

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,31 +10,37 @@ typedef half __attribute__((ext_vector_type(2))) half2;
1010
typedef short __attribute__((ext_vector_type(2))) short2;
1111

1212
// CHECK-LABEL: test_local_add_2bf16
13-
// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
13+
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
14+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
15+
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
16+
1417
// GFX12-LABEL: test_local_add_2bf16
1518
// GFX12: ds_pk_add_rtn_bf16
1619
short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
1720
return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x);
1821
}
1922

2023
// CHECK-LABEL: test_local_add_2bf16_noret
21-
// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
24+
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
25+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
26+
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
27+
2228
// GFX12-LABEL: test_local_add_2bf16_noret
2329
// GFX12: ds_pk_add_bf16
2430
void test_local_add_2bf16_noret(__local short2 *addr, short2 x) {
2531
__builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x);
2632
}
2733

2834
// CHECK-LABEL: test_local_add_2f16
29-
// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
35+
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
3036
// GFX12-LABEL: test_local_add_2f16
3137
// GFX12: ds_pk_add_rtn_f16
3238
half2 test_local_add_2f16(__local half2 *addr, half2 x) {
3339
return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
3440
}
3541

3642
// CHECK-LABEL: test_local_add_2f16_noret
37-
// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
43+
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
3844
// GFX12-LABEL: test_local_add_2f16_noret
3945
// GFX12: ds_pk_add_f16
4046
void test_local_add_2f16_noret(__local half2 *addr, half2 x) {

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,18 @@
66
// REQUIRES: amdgpu-registered-target
77

88
// CHECK-LABEL: test_fadd_local
9-
// CHECK: call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %{{.*}}, float %{{.*}}, i32 0, i32 0, i1 false)
9+
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
1010
// GFX8-LABEL: test_fadd_local$local:
1111
// GFX8: ds_add_rtn_f32 v2, v0, v1
1212
// GFX8: s_endpgm
1313
kernel void test_fadd_local(__local float *ptr, float val){
1414
float *res;
1515
*res = __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val);
1616
}
17+
18+
// CHECK-LABEL: test_fadd_local_volatile
19+
// CHECK: = atomicrmw volatile fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
20+
kernel void test_fadd_local_volatile(volatile __local float *ptr, float val){
21+
volatile float *res;
22+
*res = __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val);
23+
}

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ void test_flat_global_max_f64(__global double *addr, double x){
9999
}
100100

101101
// CHECK-LABEL: test_ds_add_local_f64
102-
// CHECK: call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %{{.*}}, double %{{.*}},
102+
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} seq_cst, align 8
103103
// GFX90A: test_ds_add_local_f64$local
104104
// GFX90A: ds_add_rtn_f64
105105
void test_ds_add_local_f64(__local double *addr, double x){
@@ -108,7 +108,7 @@ void test_ds_add_local_f64(__local double *addr, double x){
108108
}
109109

110110
// CHECK-LABEL: test_ds_addf_local_f32
111-
// CHECK: call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %{{.*}}, float %{{.*}},
111+
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
112112
// GFX90A-LABEL: test_ds_addf_local_f32$local
113113
// GFX90A: ds_add_rtn_f32
114114
void test_ds_addf_local_f32(__local float *addr, float x){

clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,23 +42,27 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) {
4242
}
4343

4444
// CHECK-LABEL: test_local_add_2bf16
45-
// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
45+
46+
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
47+
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
48+
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
49+
4650
// GFX940-LABEL: test_local_add_2bf16
4751
// GFX940: ds_pk_add_rtn_bf16
4852
short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
4953
return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x);
5054
}
5155

5256
// CHECK-LABEL: test_local_add_2f16
53-
// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
57+
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
5458
// GFX940-LABEL: test_local_add_2f16
5559
// GFX940: ds_pk_add_rtn_f16
5660
half2 test_local_add_2f16(__local half2 *addr, half2 x) {
5761
return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
5862
}
5963

6064
// CHECK-LABEL: test_local_add_2f16_noret
61-
// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
65+
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
6266
// GFX940-LABEL: test_local_add_2f16_noret
6367
// GFX940: ds_pk_add_f16
6468
void test_local_add_2f16_noret(__local half2 *addr, half2 x) {

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2973,8 +2973,7 @@ def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
29732973
def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
29742974
[llvm_v2i16_ty],
29752975
[LLVMQualPointerType<3>, llvm_v2i16_ty],
2976-
[IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
2977-
ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;
2976+
[IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
29782977

29792978
defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
29802979
def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;

0 commit comments

Comments
 (0)