Skip to content

[NVPTX] Support __usAtomicCAS builtin #99646

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Aug 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/BuiltinsNVPTX.def
Original file line number Diff line number Diff line change
Expand Up @@ -844,6 +844,9 @@ BUILTIN(__nvvm_atom_xor_gen_ll, "LLiLLiD*LLi", "n")
TARGET_BUILTIN(__nvvm_atom_cta_xor_gen_ll, "LLiLLiD*LLi", "n", SM_60)
TARGET_BUILTIN(__nvvm_atom_sys_xor_gen_ll, "LLiLLiD*LLi", "n", SM_60)

TARGET_BUILTIN(__nvvm_atom_cas_gen_us, "UsUsD*UsUs", "n", SM_70)
TARGET_BUILTIN(__nvvm_atom_cta_cas_gen_us, "UsUsD*UsUs", "n", SM_70)
TARGET_BUILTIN(__nvvm_atom_sys_cas_gen_us, "UsUsD*UsUs", "n", SM_70)
BUILTIN(__nvvm_atom_cas_gen_i, "iiD*ii", "n")
TARGET_BUILTIN(__nvvm_atom_cta_cas_gen_i, "iiD*ii", "n", SM_60)
TARGET_BUILTIN(__nvvm_atom_sys_cas_gen_i, "iiD*ii", "n", SM_60)
Expand Down
3 changes: 3 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20344,6 +20344,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
case NVPTX::BI__nvvm_atom_min_gen_ull:
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);

case NVPTX::BI__nvvm_atom_cas_gen_us:
case NVPTX::BI__nvvm_atom_cas_gen_i:
case NVPTX::BI__nvvm_atom_cas_gen_l:
case NVPTX::BI__nvvm_atom_cas_gen_ll:
Expand Down Expand Up @@ -20535,6 +20536,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys, *this, E);
case NVPTX::BI__nvvm_atom_cta_cas_gen_us:
case NVPTX::BI__nvvm_atom_cta_cas_gen_i:
case NVPTX::BI__nvvm_atom_cta_cas_gen_l:
case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: {
Expand All @@ -20546,6 +20548,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
Intrinsic::nvvm_atomic_cas_gen_i_cta, {ElemTy, Ptr->getType()}),
{Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
}
case NVPTX::BI__nvvm_atom_sys_cas_gen_us:
case NVPTX::BI__nvvm_atom_sys_cas_gen_i:
case NVPTX::BI__nvvm_atom_sys_cas_gen_l:
case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: {
Expand Down
14 changes: 14 additions & 0 deletions clang/lib/Headers/__clang_cuda_device_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,20 @@ __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); }
__DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); };
__DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); };
__DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); }
__DEVICE__ unsigned short
__usAtomicCAS(unsigned short *__p, unsigned short __cmp, unsigned short __v) {
return __nvvm_atom_cas_gen_us(__p, __cmp, __v);
}
__DEVICE__ unsigned short __usAtomicCAS_block(unsigned short *__p,
unsigned short __cmp,
unsigned short __v) {
return __nvvm_atom_cta_cas_gen_us(__p, __cmp, __v);
}
__DEVICE__ unsigned short __usAtomicCAS_system(unsigned short *__p,
unsigned short __cmp,
unsigned short __v) {
return __nvvm_atom_sys_cas_gen_us(__p, __cmp, __v);
}
__DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) {
return __nvvm_atom_add_gen_i((int *)__p, __v);
}
Expand Down
16 changes: 15 additions & 1 deletion clang/test/CodeGen/builtins-nvptx.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_70 -target-feature +ptx63 \
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX63_SM70 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
Expand Down Expand Up @@ -238,7 +241,8 @@ __shared__ long long sll;

// Check for atomic intrinsics
// CHECK-LABEL: nvvm_atom
__device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip,
__device__ void nvvm_atom(float *fp, float f, double *dfp, double df,
unsigned short *usp, unsigned short us, int *ip,
int i, unsigned int *uip, unsigned ui, long *lp,
long l, long long *llp, long long ll) {
// CHECK: atomicrmw add ptr {{.*}} seq_cst, align 4
Expand Down Expand Up @@ -577,6 +581,16 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip,
__nvvm_atom_sys_cas_gen_ll(&sll, ll, 0);
#endif

#if __CUDA_ARCH__ >= 700
// CHECK_PTX63_SM70: cmpxchg ptr {{.*}} seq_cst seq_cst, align 2
// CHECK_PTX63_SM70-NEXT: extractvalue { i16, i1 } {{%[0-9]+}}, 0
__nvvm_atom_cas_gen_us(usp, 0, us);
// CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.cta.i16.p0
__nvvm_atom_cta_cas_gen_us(usp, 0, us);
// CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.sys.i16.p0
__nvvm_atom_sys_cas_gen_us(usp, 0, us);
#endif

// CHECK: ret
}

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -890,7 +890,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// actions
computeRegisterProperties(STI.getRegisterInfo());

setMinCmpXchgSizeInBits(32);
setMinCmpXchgSizeInBits(STI.hasAtomCas16() ? 16 : 32);
setMaxAtomicSizeInBitsSupported(64);
setMaxDivRemBitWidthSupported(64);
}
Expand Down
15 changes: 15 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -2076,6 +2076,12 @@ defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64

// atom_cas

def atomic_cmp_swap_i16_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
(atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
def atomic_cmp_swap_i16_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
(atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
def atomic_cmp_swap_i16_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
(atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
def atomic_cmp_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
(atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
def atomic_cmp_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
Expand All @@ -2089,6 +2095,14 @@ def atomic_cmp_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
(atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;

defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
atomic_cmp_swap_i16_g, i16imm, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, ".shared", ".b16", ".cas",
atomic_cmp_swap_i16_s, i16imm, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", ".b16", ".cas",
atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32", ".cas",
atomic_cmp_swap_i32_g, i32imm>;
defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<i32, Int32Regs, ".shared", ".b32", ".cas",
Expand Down Expand Up @@ -2300,6 +2314,7 @@ multiclass ATOM2_incdec_impl<string OpStr> {

// atom.cas
multiclass ATOM3_cas_impl<string OpStr> {
defm _b16 : ATOM3S_impl<OpStr, "i", "b16", i16, Int16Regs, i16imm, imm, i16, []>;
defm _b32 : ATOM3S_impl<OpStr, "i", "b32", i32, Int32Regs, i32imm, imm, i32, []>;
defm _b64 : ATOM3S_impl<OpStr, "i", "b64", i64, Int64Regs, i64imm, imm, i64, []>;
}
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/NVPTX/NVPTXSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
bool hasAtomScope() const { return SmVersion >= 60; }
bool hasAtomBitwise64() const { return SmVersion >= 32; }
bool hasAtomMinMax64() const { return SmVersion >= 32; }
bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
bool hasLDG() const { return SmVersion >= 32; }
bool hasHWROT32() const { return SmVersion >= 32; }
bool hasImageHandles() const;
Expand Down
140 changes: 50 additions & 90 deletions llvm/test/CodeGen/NVPTX/atomics-sm90.ll
Original file line number Diff line number Diff line change
Expand Up @@ -45,102 +45,62 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
;
; CHECKPTX71-LABEL: test(
; CHECKPTX71: {
; CHECKPTX71-NEXT: .reg .pred %p<5>;
; CHECKPTX71-NEXT: .reg .b16 %rs<18>;
; CHECKPTX71-NEXT: .reg .b32 %r<58>;
; CHECKPTX71-NEXT: .reg .f32 %f<12>;
; CHECKPTX71-NEXT: .reg .pred %p<5>;
; CHECKPTX71-NEXT: .reg .b16 %rs<34>;
; CHECKPTX71-NEXT: .reg .b32 %r<4>;
; CHECKPTX71-NEXT: .reg .f32 %f<12>;
; CHECKPTX71-EMPTY:
; CHECKPTX71-NEXT: // %bb.0:
; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3];
; CHECKPTX71-NEXT: ld.param.u32 %r23, [test_param_2];
; CHECKPTX71-NEXT: ld.param.u32 %r22, [test_param_1];
; CHECKPTX71-NEXT: ld.param.u32 %r24, [test_param_0];
; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4;
; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3;
; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3;
; CHECKPTX71-NEXT: mov.b32 %r26, 65535;
; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2;
; CHECKPTX71-NEXT: not.b32 %r3, %r27;
; CHECKPTX71-NEXT: ld.u32 %r54, [%r1];
; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs1;
; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start
; CHECKPTX71-NEXT: ld.param.b16 %rs13, [test_param_3];
; CHECKPTX71-NEXT: ld.param.u32 %r3, [test_param_2];
; CHECKPTX71-NEXT: ld.param.u32 %r2, [test_param_1];
; CHECKPTX71-NEXT: ld.param.u32 %r1, [test_param_0];
; CHECKPTX71-NEXT: ld.b16 %rs30, [%r1];
; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs13;
; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2;
; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r28;
; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs2;
; CHECKPTX71-NEXT: add.rn.f32 %f3, %f1, %f2;
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs4, %f3;
; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs4;
; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2;
; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3;
; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32;
; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54;
; CHECKPTX71-NEXT: mov.u32 %r54, %r6;
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end
; CHECKPTX71-NEXT: ld.u32 %r55, [%r1];
; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start9
; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs30;
; CHECKPTX71-NEXT: add.rn.f32 %f3, %f2, %f1;
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs14, %f3;
; CHECKPTX71-NEXT: atom.cas.b16 %rs17, [%r1], %rs30, %rs14;
; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs17, %rs30;
; CHECKPTX71-NEXT: mov.u16 %rs30, %rs17;
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end
; CHECKPTX71-NEXT: ld.b16 %rs31, [%r1];
; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start2
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2;
; CHECKPTX71-NEXT: cvt.u16.u32 %rs6, %r33;
; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs6;
; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs8, %f5;
; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs8;
; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2;
; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3;
; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37;
; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55;
; CHECKPTX71-NEXT: mov.u32 %r55, %r9;
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end8
; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4;
; CHECKPTX71-NEXT: shl.b32 %r38, %r22, 3;
; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24;
; CHECKPTX71-NEXT: shl.b32 %r40, %r26, %r11;
; CHECKPTX71-NEXT: not.b32 %r12, %r40;
; CHECKPTX71-NEXT: ld.global.u32 %r56, [%r10];
; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start27
; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs31;
; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs18, %f5;
; CHECKPTX71-NEXT: atom.cas.b16 %rs21, [%r1], %rs31, %rs18;
; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs21, %rs31;
; CHECKPTX71-NEXT: mov.u16 %rs31, %rs21;
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end1
; CHECKPTX71-NEXT: ld.global.b16 %rs32, [%r2];
; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start8
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11;
; CHECKPTX71-NEXT: cvt.u16.u32 %rs10, %r41;
; CHECKPTX71-NEXT: cvt.f32.bf16 %f6, %rs10;
; CHECKPTX71-NEXT: add.rn.f32 %f8, %f6, %f2;
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs12, %f8;
; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs12;
; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12;
; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45;
; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56;
; CHECKPTX71-NEXT: mov.u32 %r56, %r15;
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end26
; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4;
; CHECKPTX71-NEXT: shl.b32 %r46, %r23, 3;
; CHECKPTX71-NEXT: and.b32 %r17, %r46, 24;
; CHECKPTX71-NEXT: shl.b32 %r48, %r26, %r17;
; CHECKPTX71-NEXT: not.b32 %r18, %r48;
; CHECKPTX71-NEXT: ld.shared.u32 %r57, [%r16];
; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start45
; CHECKPTX71-NEXT: cvt.f32.bf16 %f7, %rs32;
; CHECKPTX71-NEXT: add.rn.f32 %f8, %f7, %f1;
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs22, %f8;
; CHECKPTX71-NEXT: atom.global.cas.b16 %rs25, [%r2], %rs32, %rs22;
; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs25, %rs32;
; CHECKPTX71-NEXT: mov.u16 %rs32, %rs25;
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end7
; CHECKPTX71-NEXT: ld.shared.b16 %rs33, [%r3];
; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start14
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17;
; CHECKPTX71-NEXT: cvt.u16.u32 %rs14, %r49;
; CHECKPTX71-NEXT: cvt.f32.bf16 %f9, %rs14;
; CHECKPTX71-NEXT: add.rn.f32 %f11, %f9, %f2;
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs16, %f11;
; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs16;
; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17;
; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18;
; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57;
; CHECKPTX71-NEXT: mov.u32 %r57, %r21;
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end44
; CHECKPTX71-NEXT: ret;
; CHECKPTX71-NEXT: cvt.f32.bf16 %f10, %rs33;
; CHECKPTX71-NEXT: add.rn.f32 %f11, %f10, %f1;
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs26, %f11;
; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs29, [%r3], %rs33, %rs26;
; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs29, %rs33;
; CHECKPTX71-NEXT: mov.u16 %rs33, %rs29;
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end13
; CHECKPTX71-NEXT: ret;
%r1 = atomicrmw fadd ptr %dp0, bfloat %val seq_cst
%r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 seq_cst
%r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val seq_cst
Expand Down
Loading
Loading