
Commit 7412caf

Add support for the standard Rust atomic compare_exchange intrinsic.
1 parent 33664c0 commit 7412caf

File tree

4 files changed, +222 -10 lines changed

crates/rustc_codegen_nvvm/src/builder.rs

Lines changed: 146 additions & 9 deletions
@@ -1134,16 +1134,60 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
     // Atomic Operations
     fn atomic_cmpxchg(
         &mut self,
-        _dst: &'ll Value,
-        _cmp: &'ll Value,
-        _src: &'ll Value,
-        _order: AtomicOrdering,
-        _failure_order: AtomicOrdering,
-        _weak: bool,
+        dst: &'ll Value,
+        cmp: &'ll Value,
+        src: &'ll Value,
+        order: AtomicOrdering,
+        failure_order: AtomicOrdering,
+        weak: bool,
     ) -> (&'ll Value, &'ll Value) {
-        // allowed but only for some things and with restrictions
-        // https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#cmpxchg-instruction
-        self.fatal("atomic cmpxchg is not supported")
+        // The LLVM verifier rejects cases where `failure_order` is stronger than `order`.
+        match (order, failure_order) {
+            (AtomicOrdering::SeqCst, _) => (),
+            (_, AtomicOrdering::Relaxed) => (),
+            (AtomicOrdering::Release, AtomicOrdering::Release) | (AtomicOrdering::Release, AtomicOrdering::Acquire) | (AtomicOrdering::Acquire, AtomicOrdering::Acquire) => (),
+            (AtomicOrdering::AcqRel, AtomicOrdering::Acquire) => (),
+            (AtomicOrdering::Relaxed, _) | (_, AtomicOrdering::Release | AtomicOrdering::AcqRel | AtomicOrdering::SeqCst) => {
+                // Invalid cmpxchg - the failure ordering is stronger than the success ordering!
+                self.abort();
+                return (self.const_undef(self.val_ty(cmp)), self.const_undef(self.type_i1()));
+            }
+        };
+        let res = self.atomic_op(
+            dst,
+            |builder, dst| {
+                unsafe {
+                    llvm::LLVMRustBuildAtomicCmpXchg(
+                        builder.llbuilder,
+                        dst,
+                        cmp,
+                        src,
+                        crate::llvm::AtomicOrdering::from_generic(order),
+                        crate::llvm::AtomicOrdering::from_generic(failure_order),
+                        weak as u32,
+                    )
+                }
+            },
+            |builder, dst| {
+                // Local space is only accessible to the current thread, so there are no
+                // synchronization issues and we can emulate the cmpxchg with a simple
+                // load / compare / store.
+                let load: &'ll Value = unsafe { llvm::LLVMBuildLoad(builder.llbuilder, dst, UNNAMED) };
+                let compare = builder.icmp(IntPredicate::IntEQ, load, cmp);
+                // We can do something smart & branchless here:
+                // we select either the current value (if the comparison fails) or the new value,
+                // and then *unconditionally* write that back to local memory (which is very, very cheap).
+                let value = builder.select(compare, src, load);
+                unsafe { llvm::LLVMBuildStore(builder.llbuilder, value, dst) };
+                let res_type = builder.type_struct(&[builder.val_ty(cmp), builder.type_ix(1)], false);
+                let res = builder.const_undef(res_type);
+                let res = builder.insert_value(res, load, 0);
+                let res = builder.insert_value(res, compare, 1);
+                res
+            },
+        );
+        let val = self.extract_value(res, 0);
+        let success = self.extract_value(res, 1);
+        (val, success)
     }
     fn atomic_rmw(
         &mut self,
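
A minimal sketch (not part of the commit) of what the thread-local fallback closure above computes, written as plain Rust over a u32. It relies on the same assumption the comment states: local memory is only visible to the current thread, so a non-atomic load / compare / select / unconditional store has the same effect as a real cmpxchg.

    fn emulated_cmpxchg_u32(dst: &mut u32, cmp: u32, src: u32) -> (u32, bool) {
        let loaded = *dst;                              // plain, non-atomic load
        let success = loaded == cmp;                    // icmp eq
        let value = if success { src } else { loaded }; // select: new value on success, old value otherwise
        *dst = value;                                   // unconditional (branchless) store back to local memory
        (loaded, success)                               // (previous value, success flag), matching cmpxchg's result pair
    }
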
@@ -1609,3 +1653,96 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> {
         }
     }
 }
+impl<'ll, 'tcx, 'a> Builder<'a, 'll, 'tcx> {
+    fn atomic_op(
+        &mut self,
+        dst: &'ll Value,
+        atomic_supported: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+        emulate_local: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+    ) -> &'ll Value {
+        // (FractalFir) Atomics in CUDA have some limitations, and we have to work around them.
+        // For example, they are restricted in which address spaces they may operate on.
+        // CUDA has 4 address spaces (and a generic one, which is a union of all of those).
+        // An atomic instruction can soundly operate on:
+        // 1. The global address space
+        // 2. The shared (cluster) address space
+        // It can't operate on:
+        // 1. The const address space (atomics on consts are UB anyway)
+        // 2. The thread-local address space (which should only be accessible to one thread anyway?)
+        // So, we do the following:
+        // 1. Check if the pointer is in one of the address spaces atomics support.
+        //    a) If so, we perform an atomic operation.
+        // 2. Check if the pointer is in the thread-local address space. If it is, we use non-atomic ops,
+        //    **ASSUMING** only the current thread can access thread-local memory. (FIXME: is this sound?)
+        // 3. If the pointer is not in a supported address space and is not thread-local, we bail and trap.
+
+        // We check if the `dst` pointer is in the `global` address space.
+        let (isspacep_global_ty, isspacep_global_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.global");
+        let isspacep_global = self.call(
+            isspacep_global_ty,
+            None,
+            None,
+            isspacep_global_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // We check if the `dst` pointer is in the `shared` address space.
+        let (isspacep_shared_ty, isspacep_shared_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.shared");
+        let isspacep_shared = self.call(
+            isspacep_shared_ty,
+            None,
+            None,
+            isspacep_shared_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // Combine those to check if we are in a supported address space.
+        let atomic_supported_addrspace = self.or(isspacep_shared, isspacep_global);
+        // We create 2 blocks here: one we branch to if the pointer is in a supported address space,
+        // and one we branch to otherwise.
+        let supported_bb = self.append_sibling_block("atomic_space_supported");
+        let unsupported_bb = self.append_sibling_block("atomic_space_unsupported");
+        self.cond_br(atomic_supported_addrspace, supported_bb, unsupported_bb);
+        // We also create a "merge" block we will jump to after the atomic ops finish.
+        let merge_bb = self.append_sibling_block("atomic_op_done");
+        // Execute the atomic op if supported, then jump to merge.
+        self.switch_to_block(supported_bb);
+        let supported_res = atomic_supported(self, dst);
+        self.br(merge_bb);
+        // Check if the pointer is in the thread-local space. If so, we can emulate the op.
+        self.switch_to_block(unsupported_bb);
+        let (isspacep_local_ty, isspacep_local_fn) = self.get_intrinsic("llvm.nvvm.isspacep.local");
+        let isspacep_local = self.call(
+            isspacep_local_ty,
+            None,
+            None,
+            isspacep_local_fn,
+            &[dst],
+            None,
+            None,
+        );
+        let local_bb = self.append_sibling_block("atomic_local_space");
+        let atomic_ub_bb = self.append_sibling_block("atomic_space_ub");
+        self.cond_br(isspacep_local, local_bb, atomic_ub_bb);
+        // The pointer is in the thread (local) space.
+        self.switch_to_block(local_bb);
+        let local_res = emulate_local(self, dst);
+        self.br(merge_bb);
+        // The pointer is neither in a supported address space nor in the local space.
+        // This is very likely UB, so we trap here.
+        // TODO: should we print some kind of message here? NVVM supports printf.
+        self.switch_to_block(atomic_ub_bb);
+        self.abort();
+        self.unreachable();
+        // The atomic impl has finished, and we can now switch to merge_bb.
+        self.switch_to_block(merge_bb);
+        self.phi(
+            self.val_ty(local_res),
+            &[supported_res, local_res],
+            &[supported_bb, local_bb],
+        )
+    }
+}
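
For orientation, a hedged summary (illustrative only, not the codegen itself) of the control flow atomic_op emits. The address-space predicates stand in for the llvm.nvvm.isspacep.* intrinsic calls, and the value of the if/else chain corresponds to the phi node built in merge_bb.

    fn atomic_op_shape<T>(
        dst: *mut T,
        hw_atomic: impl FnOnce() -> T,      // emit the real atomic instruction
        emulate_local: impl FnOnce() -> T,  // plain load/compare/store emulation
        is_global: impl Fn(*mut T) -> bool, // stands in for llvm.nvvm.isspacep.global
        is_shared: impl Fn(*mut T) -> bool, // stands in for llvm.nvvm.isspacep.shared
        is_local: impl Fn(*mut T) -> bool,  // stands in for llvm.nvvm.isspacep.local
    ) -> T {
        if is_global(dst) || is_shared(dst) {
            hw_atomic()       // supported address space: perform the atomic operation
        } else if is_local(dst) {
            emulate_local()   // thread-local memory: emulate non-atomically
        } else {
            // const/unknown address space: very likely UB, so trap (abort + unreachable)
            panic!("atomic on unsupported address space")
        }
    }
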

crates/rustc_codegen_nvvm/src/ctx_intrinsics.rs

Lines changed: 6 additions & 0 deletions
@@ -449,5 +449,11 @@ impl<'ll> CodegenCx<'ll, '_> {
             "__nv_ynf",
             fn(t_i32, t_f32) -> t_f32
         );
+        // Address space checks
+        ifn!(map, "llvm.nvvm.isspacep.const", fn(i8p) -> i1);
+        ifn!(map, "llvm.nvvm.isspacep.global", fn(i8p) -> i1);
+        ifn!(map, "llvm.nvvm.isspacep.local", fn(i8p) -> i1);
+        ifn!(map, "llvm.nvvm.isspacep.shared", fn(i8p) -> i1);
+
     }
 }

crates/rustc_codegen_nvvm/src/llvm.rs

Lines changed: 47 additions & 1 deletion
@@ -16,7 +16,7 @@
 // but likely will use in the future, so we ignore any unused functions
 // in case we need them in the future for things like debug info or LTO.
 #![allow(dead_code)]
-
+use rustc_codegen_ssa::common::AtomicRmwBinOp;
 use libc::{c_char, c_uint, c_void, size_t};
 use libc::{c_int, c_ulonglong};
 use std::ffi::{CStr, CString};
@@ -1947,4 +1947,50 @@ unsafe extern "C" {
     pub(crate) fn LLVMRustAddDereferenceableOrNullAttr(Fn: &Value, index: c_uint, bytes: u64);
 
     pub(crate) fn LLVMRustPositionBuilderAtStart<'a>(B: &Builder<'a>, BB: &'a BasicBlock);
+    // Atomics
+    pub fn LLVMRustBuildAtomicCmpXchg<'a>(
+        B: &Builder<'a>,
+        LHS: &Value,
+        CMP: &Value,
+        RHS: &Value,
+        Order: AtomicOrdering,
+        FailureOrder: AtomicOrdering,
+        Weak: Bool,
+    ) -> &'a Value;
+
+    pub fn LLVMBuildAtomicRMW<'a>(
+        B: &Builder<'a>,
+        Op: AtomicRmwBinOp,
+        LHS: &Value,
+        RHS: &Value,
+        Order: AtomicOrdering,
+        SingleThreaded: Bool,
+    ) -> &'a Value;
 }
+/// LLVMAtomicOrdering
+#[derive(Copy, Clone)]
+#[repr(C)]
+pub(crate) enum AtomicOrdering {
+    #[allow(dead_code)]
+    NotAtomic = 0,
+    #[allow(dead_code)]
+    Unordered = 1,
+    Monotonic = 2,
+    // Consume = 3, // Not specified yet.
+    Acquire = 4,
+    Release = 5,
+    AcquireRelease = 6,
+    SequentiallyConsistent = 7,
+}
+impl AtomicOrdering {
+    pub(crate) fn from_generic(ao: rustc_middle::ty::AtomicOrdering) -> Self {
+        use rustc_middle::ty::AtomicOrdering as Common;
+        match ao {
+            Common::Relaxed => Self::Monotonic,
+            Common::Acquire => Self::Acquire,
+            Common::Release => Self::Release,
+            Common::AcqRel => Self::AcquireRelease,
+            Common::SeqCst => Self::SequentiallyConsistent,
+        }
+    }
+}
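
Before these orderings reach LLVMRustBuildAtomicCmpXchg, atomic_cmpxchg in builder.rs filters out pairs the LLVM verifier would reject (a failure ordering stronger than the success ordering) and emits an abort instead. A self-contained sketch (not part of the commit) of that rule, using the standard library's Ordering names and mirroring the match arms in builder.rs:

    use std::sync::atomic::Ordering;

    fn failure_order_accepted(order: Ordering, failure: Ordering) -> bool {
        use Ordering::*;
        match (order, failure) {
            (SeqCst, _) => true,
            (_, Relaxed) => true,
            (Release, Release) | (Release, Acquire) | (Acquire, Acquire) => true,
            (AcqRel, Acquire) => true,
            _ => false, // any other pair makes the builder emit abort() instead of a cmpxchg
        }
    }
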
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+// Test that CUDA atomic operations compile correctly.
+// build-pass
+// compile-flags: -Z verify-llvm-ir
+use core::sync::atomic::{AtomicUsize, Ordering};
+
+use cuda_std::atomic::{
+    AtomicF32, AtomicF64, BlockAtomicF32, BlockAtomicF64, SystemAtomicF32, SystemAtomicF64,
+};
+use cuda_std::kernel;
+static GLOBAL: AtomicUsize = AtomicUsize::new(0);
+#[kernel]
+pub unsafe fn test_cuda_atomic_floats() {
+    let local = AtomicUsize::new(0);
+    // `compare_exchange` should succeed
+    local.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
+    // `compare_exchange` should fail
+    local.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
+    // `compare_exchange` should succeed
+    GLOBAL.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
+    // `compare_exchange` should fail
+    GLOBAL.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed);
+
+}
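
A plain-Rust illustration (not part of the commit's test) of the success/failure behaviour the comments above refer to: compare_exchange returns Ok(previous) when the swap happens and Err(current) when it does not.

    fn main() {
        use core::sync::atomic::{AtomicUsize, Ordering};

        let a = AtomicUsize::new(0);
        // First call succeeds: the value was 0, so Ok(0) is returned and `a` becomes 1.
        assert_eq!(a.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed), Ok(0));
        // Second call fails: the value is now 1, so Err(1) is returned and `a` stays 1.
        assert_eq!(a.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed), Err(1));
    }
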
