Commit 8f63d15

clang/AMDGPU: Use atomicrmw for ds fmin/fmax builtins (#96738)
1 parent 1de1818 commit 8f63d15
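
This commit changes how clang lowers the __builtin_amdgcn_ds_fminf and __builtin_amdgcn_ds_fmaxf builtins: instead of emitting calls to the llvm.amdgcn.ds.fmin / llvm.amdgcn.ds.fmax target intrinsics, they now take the same path as __builtin_amdgcn_ds_faddf and emit a generic atomicrmw fmin/fmax on the LDS (addrspace(3)) pointer, with the builtin's order, scope, and volatile operands mapped onto the instruction's ordering, syncscope, and volatile flag. As the updated tests below show, the emitted IR changes roughly from

  %val = call contract float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)

to

  %val = atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4

(value names here are illustrative; the exact lines are in the CHECK updates below).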

4 files changed: +86, -36 lines

clang/lib/CodeGen/CGBuiltin.cpp (+14, -26)

@@ -18632,28 +18632,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
     return Builder.CreateCall(F, { Src0, Builder.getFalse() });
   }
-  case AMDGPU::BI__builtin_amdgcn_ds_fminf:
-  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
-    Intrinsic::ID Intrin;
-    switch (BuiltinID) {
-    case AMDGPU::BI__builtin_amdgcn_ds_fminf:
-      Intrin = Intrinsic::amdgcn_ds_fmin;
-      break;
-    case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
-      Intrin = Intrinsic::amdgcn_ds_fmax;
-      break;
-    }
-    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
-    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
-    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
-    llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
-    llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
-    llvm::Function *F = CGM.getIntrinsic(Intrin, { Src1->getType() });
-    llvm::FunctionType *FTy = F->getFunctionType();
-    llvm::Type *PTy = FTy->getParamType(0);
-    Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
-    return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
-  }
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
@@ -19087,11 +19065,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
   case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
   case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
-  case AMDGPU::BI__builtin_amdgcn_ds_faddf:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
   case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
-  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
+  case AMDGPU::BI__builtin_amdgcn_ds_faddf:
+  case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
     llvm::AtomicRMWInst::BinOp BinOp;
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
@@ -19109,6 +19089,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
       BinOp = llvm::AtomicRMWInst::FAdd;
       break;
+    case AMDGPU::BI__builtin_amdgcn_ds_fminf:
+      BinOp = llvm::AtomicRMWInst::FMin;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
+      BinOp = llvm::AtomicRMWInst::FMax;
+      break;
     }
 
     Address Ptr = CheckAtomicAlignment(*this, E);
@@ -19118,8 +19104,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 
     bool Volatile;
 
-    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf) {
-      // __builtin_amdgcn_ds_faddf has an explicit volatile argument
+    if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf ||
+        BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fminf ||
+        BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_fmaxf) {
+      // __builtin_amdgcn_ds_faddf/fminf/fmaxf has an explicit volatile argument
       Volatile =
           cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
     } else {
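
For orientation, the ds fmin/fmax builtins now fall into the shared atomicrmw-emitting case shown above. A minimal sketch of how that case is assumed to finish, using CheckAtomicAlignment from the context above and an order/scope translation helper (ProcessOrderScopeAMDGCN) as an assumption; this illustrates the lowering, it is not a verbatim copy of the upstream code:

  // Sketch only: argument positions and helper signatures are assumed from the
  // surrounding CGBuiltin.cpp code visible in this diff.
  llvm::Value *Val = EmitScalarExpr(E->getArg(1));   // value to combine
  llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
  llvm::SyncScope::ID SSID;
  // Map the builtin's memory-order and memory-scope operands (args 2 and 3 for
  // the ds_f* builtins) onto an LLVM atomic ordering and a target syncscope.
  ProcessOrderScopeAMDGCN(*this, EmitScalarExpr(E->getArg(2)),
                          EmitScalarExpr(E->getArg(3)), AO, SSID);
  // BinOp (FAdd/FMin/FMax/...), Ptr, and Volatile were computed above; emit a
  // single generic read-modify-write instead of a target intrinsic call.
  llvm::AtomicRMWInst *RMW = Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
  if (Volatile)
    RMW->setVolatile(true);
  return RMW;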

clang/test/CodeGenCUDA/builtins-amdgcn.cu (+4, -4)

@@ -98,7 +98,7 @@ __global__
 // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
 // CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
 // CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fmax ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]] monotonic, align 4
 // CHECK-NEXT: store volatile float [[TMP1]], ptr [[X_ASCAST]], align 4
 // CHECK-NEXT: ret void
 //
@@ -142,7 +142,7 @@ __global__ void test_ds_fadd(float src) {
 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
 // CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
 // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
 // CHECK-NEXT: store volatile float [[TMP4]], ptr [[X_ASCAST]], align 4
 // CHECK-NEXT: ret void
 //
@@ -245,10 +245,10 @@ __device__ void func(float *x);
 // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
 // CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
 // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = call contract float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
 // CHECK-NEXT: store volatile float [[TMP4]], ptr [[X_ASCAST]], align 4
 // CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: call void @_Z4funcPf(ptr noundef [[TMP5]]) #[[ATTR8:[0-9]+]]
+// CHECK-NEXT: call void @_Z4funcPf(ptr noundef [[TMP5]]) #[[ATTR7:[0-9]+]]
 // CHECK-NEXT: ret void
 //
 __global__ void test_ds_fmin_func(float src, float *__restrict shared) {
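
(The #[[ATTR8 to #[[ATTR7 renumbering in the last hunk appears to be incidental: with the calls to the ds.fmin/ds.fmax intrinsics gone, the module has one fewer attribute group and the auto-generated check lines renumber the remaining ones. The SPIR-V variant below shifts from ATTR7 to ATTR6 for the same reason.)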

clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu (+4, -4)

@@ -95,7 +95,7 @@ __global__
 // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
 // CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
 // CHECK-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fmax ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]] monotonic, align 4
 // CHECK-NEXT: store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
 // CHECK-NEXT: ret void
 //
@@ -139,7 +139,7 @@ __global__ void test_ds_fadd(float src) {
 // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
 // CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3)
 // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
 // CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4
 // CHECK-NEXT: ret void
 //
@@ -236,10 +236,10 @@ __device__ void func(float *x);
 // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
 // CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3)
 // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = atomicrmw fmin ptr addrspace(3) [[TMP2]], float [[TMP3]] monotonic, align 4
 // CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4
 // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
-// CHECK-NEXT: call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP5]]) #[[ATTR7:[0-9]+]]
+// CHECK-NEXT: call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP5]]) #[[ATTR6:[0-9]+]]
 // CHECK-NEXT: ret void
 //
 __global__ void test_ds_fmin_func(float src, float *__restrict shared) {

clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl (+64, -2)

@@ -158,23 +158,85 @@ void test_ds_faddf(local float *out, float src) {
 }
 
 // CHECK-LABEL: @test_ds_fmin
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+// CHECK: atomicrmw volatile fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acquire, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acquire, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src release, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src acq_rel, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
+
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
 #if !defined(__SPIRV__)
 void test_ds_fminf(local float *out, float src) {
 #else
 void test_ds_fminf(__attribute__((address_space(3))) float *out, float src) {
 #endif
   *out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, true);
+
+  // Test all orders.
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_CONSUME, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_ACQ_REL, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false); // invalid
+
+  // Test all syncscopes.
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false);
+  *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, 5, false); // invalid
 }
 
 // CHECK-LABEL: @test_ds_fmax
-// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+// CHECK: atomicrmw volatile fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acquire, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acquire, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src release, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src acq_rel, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
+
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
+// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
+
 #if !defined(__SPIRV__)
 void test_ds_fmaxf(local float *out, float src) {
 #else
 void test_ds_fmaxf(__attribute__((address_space(3))) float *out, float src) {
 #endif
   *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, true);
+
+  // Test all orders.
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_CONSUME, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_ACQ_REL, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false); // invalid
+
+  // Test all syncscopes.
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false);
+  *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, 5, false); // invalid
 }
 
 // CHECK-LABEL: @test_s_memtime
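
A note on the new OpenCL checks above: the order argument is interpreted as a C11 memory order, so __ATOMIC_CONSUME is emitted as acquire, and the scope argument selects the syncscope seen in the IR (__MEMORY_SCOPE_DEVICE -> "agent", __MEMORY_SCOPE_WRKGRP -> "workgroup", __MEMORY_SCOPE_WVFRNT -> "wavefront", __MEMORY_SCOPE_SINGLE -> "singlethread"). __MEMORY_SCOPE_SYSTEM and the out-of-range scope value 5 both lower to the default system scope, which is printed without a syncscope attribute.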
