Skip to content

Commit 2d1fba6

Browse files
DenisGZMDenis GerasimovgonzalobgDenis.Gerasimov
authored
[NVPTX] Support __usAtomicCAS builtin (#99646)
Supported `__usAtomicCAS` builtin originally defined in `/usr/local/cuda/inlcude/crt/sm_70_rt.hpp` --------- Co-authored-by: Denis Gerasimov <[email protected]> Co-authored-by: Gonzalo Brito Gadeschi <[email protected]> Co-authored-by: Denis.Gerasimov <[email protected]>
1 parent c43190f commit 2d1fba6

File tree

10 files changed

+319
-92
lines changed

10 files changed

+319
-92
lines changed

clang/include/clang/Basic/BuiltinsNVPTX.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -844,6 +844,9 @@ BUILTIN(__nvvm_atom_xor_gen_ll, "LLiLLiD*LLi", "n")
844844
TARGET_BUILTIN(__nvvm_atom_cta_xor_gen_ll, "LLiLLiD*LLi", "n", SM_60)
845845
TARGET_BUILTIN(__nvvm_atom_sys_xor_gen_ll, "LLiLLiD*LLi", "n", SM_60)
846846

847+
TARGET_BUILTIN(__nvvm_atom_cas_gen_us, "UsUsD*UsUs", "n", SM_70)
848+
TARGET_BUILTIN(__nvvm_atom_cta_cas_gen_us, "UsUsD*UsUs", "n", SM_70)
849+
TARGET_BUILTIN(__nvvm_atom_sys_cas_gen_us, "UsUsD*UsUs", "n", SM_70)
847850
BUILTIN(__nvvm_atom_cas_gen_i, "iiD*ii", "n")
848851
TARGET_BUILTIN(__nvvm_atom_cta_cas_gen_i, "iiD*ii", "n", SM_60)
849852
TARGET_BUILTIN(__nvvm_atom_sys_cas_gen_i, "iiD*ii", "n", SM_60)

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20344,6 +20344,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
2034420344
case NVPTX::BI__nvvm_atom_min_gen_ull:
2034520345
return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);
2034620346

20347+
case NVPTX::BI__nvvm_atom_cas_gen_us:
2034720348
case NVPTX::BI__nvvm_atom_cas_gen_i:
2034820349
case NVPTX::BI__nvvm_atom_cas_gen_l:
2034920350
case NVPTX::BI__nvvm_atom_cas_gen_ll:
@@ -20535,6 +20536,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
2053520536
case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
2053620537
case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
2053720538
return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys, *this, E);
20539+
case NVPTX::BI__nvvm_atom_cta_cas_gen_us:
2053820540
case NVPTX::BI__nvvm_atom_cta_cas_gen_i:
2053920541
case NVPTX::BI__nvvm_atom_cta_cas_gen_l:
2054020542
case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: {
@@ -20546,6 +20548,7 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
2054620548
Intrinsic::nvvm_atomic_cas_gen_i_cta, {ElemTy, Ptr->getType()}),
2054720549
{Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
2054820550
}
20551+
case NVPTX::BI__nvvm_atom_sys_cas_gen_us:
2054920552
case NVPTX::BI__nvvm_atom_sys_cas_gen_i:
2055020553
case NVPTX::BI__nvvm_atom_sys_cas_gen_l:
2055120554
case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: {

clang/lib/Headers/__clang_cuda_device_functions.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,20 @@ __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); }
529529
__DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); };
530530
__DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); };
531531
__DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); }
532+
__DEVICE__ unsigned short
533+
__usAtomicCAS(unsigned short *__p, unsigned short __cmp, unsigned short __v) {
534+
return __nvvm_atom_cas_gen_us(__p, __cmp, __v);
535+
}
536+
__DEVICE__ unsigned short __usAtomicCAS_block(unsigned short *__p,
537+
unsigned short __cmp,
538+
unsigned short __v) {
539+
return __nvvm_atom_cta_cas_gen_us(__p, __cmp, __v);
540+
}
541+
__DEVICE__ unsigned short __usAtomicCAS_system(unsigned short *__p,
542+
unsigned short __cmp,
543+
unsigned short __v) {
544+
return __nvvm_atom_sys_cas_gen_us(__p, __cmp, __v);
545+
}
532546
__DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) {
533547
return __nvvm_atom_add_gen_i((int *)__p, __v);
534548
}

clang/test/CodeGen/builtins-nvptx.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
// REQUIRES: nvptx-registered-target
2+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_70 -target-feature +ptx63 \
3+
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
4+
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX63_SM70 -check-prefix=LP64 %s
25
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
36
// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \
47
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
@@ -238,7 +241,8 @@ __shared__ long long sll;
238241

239242
// Check for atomic intrinsics
240243
// CHECK-LABEL: nvvm_atom
241-
__device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip,
244+
__device__ void nvvm_atom(float *fp, float f, double *dfp, double df,
245+
unsigned short *usp, unsigned short us, int *ip,
242246
int i, unsigned int *uip, unsigned ui, long *lp,
243247
long l, long long *llp, long long ll) {
244248
// CHECK: atomicrmw add ptr {{.*}} seq_cst, align 4
@@ -577,6 +581,16 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip,
577581
__nvvm_atom_sys_cas_gen_ll(&sll, ll, 0);
578582
#endif
579583

584+
#if __CUDA_ARCH__ >= 700
585+
// CHECK_PTX63_SM70: cmpxchg ptr {{.*}} seq_cst seq_cst, align 2
586+
// CHECK_PTX63_SM70-NEXT: extractvalue { i16, i1 } {{%[0-9]+}}, 0
587+
__nvvm_atom_cas_gen_us(usp, 0, us);
588+
// CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.cta.i16.p0
589+
__nvvm_atom_cta_cas_gen_us(usp, 0, us);
590+
// CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.sys.i16.p0
591+
__nvvm_atom_sys_cas_gen_us(usp, 0, us);
592+
#endif
593+
580594
// CHECK: ret
581595
}
582596

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -890,7 +890,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
890890
// actions
891891
computeRegisterProperties(STI.getRegisterInfo());
892892

893-
setMinCmpXchgSizeInBits(32);
893+
setMinCmpXchgSizeInBits(STI.hasAtomCas16() ? 16 : 32);
894894
setMaxAtomicSizeInBitsSupported(64);
895895
setMaxDivRemBitWidthSupported(64);
896896
}

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2076,6 +2076,12 @@ defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64
20762076

20772077
// atom_cas
20782078

2079+
def atomic_cmp_swap_i16_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
2080+
(atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
2081+
def atomic_cmp_swap_i16_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
2082+
(atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
2083+
def atomic_cmp_swap_i16_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
2084+
(atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
20792085
def atomic_cmp_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
20802086
(atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
20812087
def atomic_cmp_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
@@ -2089,6 +2095,14 @@ def atomic_cmp_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
20892095
def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
20902096
(atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
20912097

2098+
defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
2099+
atomic_cmp_swap_i16_g, i16imm, [hasSM<70>, hasPTX<63>]>;
2100+
defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, ".shared", ".b16", ".cas",
2101+
atomic_cmp_swap_i16_s, i16imm, [hasSM<70>, hasPTX<63>]>;
2102+
defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", ".b16", ".cas",
2103+
atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
2104+
defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
2105+
atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
20922106
defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32", ".cas",
20932107
atomic_cmp_swap_i32_g, i32imm>;
20942108
defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<i32, Int32Regs, ".shared", ".b32", ".cas",
@@ -2300,6 +2314,7 @@ multiclass ATOM2_incdec_impl<string OpStr> {
23002314

23012315
// atom.cas
23022316
multiclass ATOM3_cas_impl<string OpStr> {
2317+
defm _b16 : ATOM3S_impl<OpStr, "i", "b16", i16, Int16Regs, i16imm, imm, i16, []>;
23032318
defm _b32 : ATOM3S_impl<OpStr, "i", "b32", i32, Int32Regs, i32imm, imm, i32, []>;
23042319
defm _b64 : ATOM3S_impl<OpStr, "i", "b64", i64, Int64Regs, i64imm, imm, i64, []>;
23052320
}

llvm/lib/Target/NVPTX/NVPTXSubtarget.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
7777
bool hasAtomScope() const { return SmVersion >= 60; }
7878
bool hasAtomBitwise64() const { return SmVersion >= 32; }
7979
bool hasAtomMinMax64() const { return SmVersion >= 32; }
80+
bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
8081
bool hasLDG() const { return SmVersion >= 32; }
8182
bool hasHWROT32() const { return SmVersion >= 32; }
8283
bool hasImageHandles() const;

llvm/test/CodeGen/NVPTX/atomics-sm90.ll

Lines changed: 50 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -45,102 +45,62 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
4545
;
4646
; CHECKPTX71-LABEL: test(
4747
; CHECKPTX71: {
48-
; CHECKPTX71-NEXT: .reg .pred %p<5>;
49-
; CHECKPTX71-NEXT: .reg .b16 %rs<18>;
50-
; CHECKPTX71-NEXT: .reg .b32 %r<58>;
51-
; CHECKPTX71-NEXT: .reg .f32 %f<12>;
48+
; CHECKPTX71-NEXT: .reg .pred %p<5>;
49+
; CHECKPTX71-NEXT: .reg .b16 %rs<34>;
50+
; CHECKPTX71-NEXT: .reg .b32 %r<4>;
51+
; CHECKPTX71-NEXT: .reg .f32 %f<12>;
5252
; CHECKPTX71-EMPTY:
5353
; CHECKPTX71-NEXT: // %bb.0:
54-
; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3];
55-
; CHECKPTX71-NEXT: ld.param.u32 %r23, [test_param_2];
56-
; CHECKPTX71-NEXT: ld.param.u32 %r22, [test_param_1];
57-
; CHECKPTX71-NEXT: ld.param.u32 %r24, [test_param_0];
58-
; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4;
59-
; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3;
60-
; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3;
61-
; CHECKPTX71-NEXT: mov.b32 %r26, 65535;
62-
; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2;
63-
; CHECKPTX71-NEXT: not.b32 %r3, %r27;
64-
; CHECKPTX71-NEXT: ld.u32 %r54, [%r1];
65-
; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs1;
66-
; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start
54+
; CHECKPTX71-NEXT: ld.param.b16 %rs13, [test_param_3];
55+
; CHECKPTX71-NEXT: ld.param.u32 %r3, [test_param_2];
56+
; CHECKPTX71-NEXT: ld.param.u32 %r2, [test_param_1];
57+
; CHECKPTX71-NEXT: ld.param.u32 %r1, [test_param_0];
58+
; CHECKPTX71-NEXT: ld.b16 %rs30, [%r1];
59+
; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs13;
60+
; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start
6761
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
68-
; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2;
69-
; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r28;
70-
; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs2;
71-
; CHECKPTX71-NEXT: add.rn.f32 %f3, %f1, %f2;
72-
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs4, %f3;
73-
; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs4;
74-
; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2;
75-
; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3;
76-
; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
77-
; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32;
78-
; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54;
79-
; CHECKPTX71-NEXT: mov.u32 %r54, %r6;
80-
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
81-
; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end
82-
; CHECKPTX71-NEXT: ld.u32 %r55, [%r1];
83-
; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start9
62+
; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs30;
63+
; CHECKPTX71-NEXT: add.rn.f32 %f3, %f2, %f1;
64+
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs14, %f3;
65+
; CHECKPTX71-NEXT: atom.cas.b16 %rs17, [%r1], %rs30, %rs14;
66+
; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs17, %rs30;
67+
; CHECKPTX71-NEXT: mov.u16 %rs30, %rs17;
68+
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
69+
; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end
70+
; CHECKPTX71-NEXT: ld.b16 %rs31, [%r1];
71+
; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start2
8472
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
85-
; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2;
86-
; CHECKPTX71-NEXT: cvt.u16.u32 %rs6, %r33;
87-
; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs6;
88-
; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
89-
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs8, %f5;
90-
; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs8;
91-
; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2;
92-
; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3;
93-
; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
94-
; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37;
95-
; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55;
96-
; CHECKPTX71-NEXT: mov.u32 %r55, %r9;
97-
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
98-
; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end8
99-
; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4;
100-
; CHECKPTX71-NEXT: shl.b32 %r38, %r22, 3;
101-
; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24;
102-
; CHECKPTX71-NEXT: shl.b32 %r40, %r26, %r11;
103-
; CHECKPTX71-NEXT: not.b32 %r12, %r40;
104-
; CHECKPTX71-NEXT: ld.global.u32 %r56, [%r10];
105-
; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start27
73+
; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs31;
74+
; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
75+
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs18, %f5;
76+
; CHECKPTX71-NEXT: atom.cas.b16 %rs21, [%r1], %rs31, %rs18;
77+
; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs21, %rs31;
78+
; CHECKPTX71-NEXT: mov.u16 %rs31, %rs21;
79+
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
80+
; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end1
81+
; CHECKPTX71-NEXT: ld.global.b16 %rs32, [%r2];
82+
; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start8
10683
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
107-
; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11;
108-
; CHECKPTX71-NEXT: cvt.u16.u32 %rs10, %r41;
109-
; CHECKPTX71-NEXT: cvt.f32.bf16 %f6, %rs10;
110-
; CHECKPTX71-NEXT: add.rn.f32 %f8, %f6, %f2;
111-
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs12, %f8;
112-
; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs12;
113-
; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
114-
; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12;
115-
; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
116-
; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45;
117-
; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56;
118-
; CHECKPTX71-NEXT: mov.u32 %r56, %r15;
119-
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
120-
; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end26
121-
; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4;
122-
; CHECKPTX71-NEXT: shl.b32 %r46, %r23, 3;
123-
; CHECKPTX71-NEXT: and.b32 %r17, %r46, 24;
124-
; CHECKPTX71-NEXT: shl.b32 %r48, %r26, %r17;
125-
; CHECKPTX71-NEXT: not.b32 %r18, %r48;
126-
; CHECKPTX71-NEXT: ld.shared.u32 %r57, [%r16];
127-
; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start45
84+
; CHECKPTX71-NEXT: cvt.f32.bf16 %f7, %rs32;
85+
; CHECKPTX71-NEXT: add.rn.f32 %f8, %f7, %f1;
86+
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs22, %f8;
87+
; CHECKPTX71-NEXT: atom.global.cas.b16 %rs25, [%r2], %rs32, %rs22;
88+
; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs25, %rs32;
89+
; CHECKPTX71-NEXT: mov.u16 %rs32, %rs25;
90+
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
91+
; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end7
92+
; CHECKPTX71-NEXT: ld.shared.b16 %rs33, [%r3];
93+
; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start14
12894
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
129-
; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17;
130-
; CHECKPTX71-NEXT: cvt.u16.u32 %rs14, %r49;
131-
; CHECKPTX71-NEXT: cvt.f32.bf16 %f9, %rs14;
132-
; CHECKPTX71-NEXT: add.rn.f32 %f11, %f9, %f2;
133-
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs16, %f11;
134-
; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs16;
135-
; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17;
136-
; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18;
137-
; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
138-
; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
139-
; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57;
140-
; CHECKPTX71-NEXT: mov.u32 %r57, %r21;
141-
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
142-
; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end44
143-
; CHECKPTX71-NEXT: ret;
95+
; CHECKPTX71-NEXT: cvt.f32.bf16 %f10, %rs33;
96+
; CHECKPTX71-NEXT: add.rn.f32 %f11, %f10, %f1;
97+
; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs26, %f11;
98+
; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs29, [%r3], %rs33, %rs26;
99+
; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs29, %rs33;
100+
; CHECKPTX71-NEXT: mov.u16 %rs33, %rs29;
101+
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
102+
; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end13
103+
; CHECKPTX71-NEXT: ret;
144104
%r1 = atomicrmw fadd ptr %dp0, bfloat %val seq_cst
145105
%r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 seq_cst
146106
%r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val seq_cst

0 commit comments

Comments
 (0)