@@ -1134,16 +1134,60 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
    // Atomic Operations
    fn atomic_cmpxchg(
        &mut self,
-        _dst: &'ll Value,
-        _cmp: &'ll Value,
-        _src: &'ll Value,
-        _order: AtomicOrdering,
-        _failure_order: AtomicOrdering,
-        _weak: bool,
+        dst: &'ll Value,
+        cmp: &'ll Value,
+        src: &'ll Value,
+        order: AtomicOrdering,
+        failure_order: AtomicOrdering,
+        weak: bool,
    ) -> (&'ll Value, &'ll Value) {
-        // allowed but only for some things and with restrictions
-        // https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#cmpxchg-instruction
-        self.fatal("atomic cmpxchg is not supported")
+        // The LLVM verifier rejects cases where the `failure_order` is stronger than `order`.
+        match (order, failure_order) {
+            (AtomicOrdering::SeqCst, _) => (),
+            (_, AtomicOrdering::Relaxed) => (),
+            (AtomicOrdering::Release, AtomicOrdering::Release)
+            | (AtomicOrdering::Release, AtomicOrdering::Acquire)
+            | (AtomicOrdering::Acquire, AtomicOrdering::Acquire) => (),
+            (AtomicOrdering::AcqRel, AtomicOrdering::Acquire) => (),
+            (AtomicOrdering::Relaxed, _)
+            | (_, AtomicOrdering::Release | AtomicOrdering::AcqRel | AtomicOrdering::SeqCst) => {
+                // Invalid cmpxchg - failure order stronger than order!
+                self.abort();
+                return (self.const_undef(self.val_ty(cmp)), self.const_undef(self.type_i1()));
+            }
+        };
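+        // Dispatch on the address space of `dst`: emit a real cmpxchg where NVVM supports it,
+        // or fall back to a non-atomic emulation for thread-local memory.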
+        let res = self.atomic_op(
+            dst,
+            |builder, dst| {
+                unsafe {
+                    llvm::LLVMRustBuildAtomicCmpXchg(
+                        builder.llbuilder,
+                        dst,
+                        cmp,
+                        src,
+                        crate::llvm::AtomicOrdering::from_generic(order),
+                        crate::llvm::AtomicOrdering::from_generic(failure_order),
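+                        // The FFI binding takes the `weak` flag as an integer bool, hence the cast.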
+                        weak as u32,
+                    )
+                }
+            },
+            |builder, dst| {
+                // Local space is only accessible to the current thread,
+                // so there are no synchronization issues and we can emulate the cmpxchg
+                // using a simple load / compare / store.
+                let load: &'ll Value = unsafe { llvm::LLVMBuildLoad(builder.llbuilder, dst, UNNAMED) };
+                let compare = builder.icmp(IntPredicate::IntEQ, load, cmp);
+                // We can do something smart and branchless here:
+                // we select either the current value (if the comparison fails) or the new value.
+                // We then *unconditionally* write that back to local memory (which is very, very cheap).
+                let value = builder.select(compare, src, load);
+                unsafe { llvm::LLVMBuildStore(builder.llbuilder, value, dst) };
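+                // Pack the (old value, success flag) pair into the same {value, i1} struct shape
+                // that a real cmpxchg returns, so both paths of `atomic_op` yield the same type.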
+                let res_type = builder.type_struct(&[builder.val_ty(cmp), builder.type_ix(1)], false);
+                let res = builder.const_undef(res_type);
+                let res = builder.insert_value(res, load, 0);
+                let res = builder.insert_value(res, compare, 1);
+                res
+            },
+        );
+        let val = self.extract_value(res, 0);
+        let success = self.extract_value(res, 1);
+        (val, success)
    }
    fn atomic_rmw(
        &mut self,
@@ -1609,3 +1653,96 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> {
        }
    }
}
+impl<'ll, 'tcx, 'a> Builder<'a, 'll, 'tcx> {
+    fn atomic_op(
+        &mut self,
+        dst: &'ll Value,
+        atomic_supported: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+        emulate_local: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+    ) -> &'ll Value {
+        // (FractalFir) Atomics in CUDA have some limitations, and we have to work around them.
+        // For example, they are restricted in which address spaces they can operate on.
+        // CUDA has 4 address spaces (and a generic one, which is a union of all of them).
+        // An atomic instruction can soundly operate on:
+        // 1. The global address space
+        // 2. The shared (cluster) address space
+        // It can't operate on:
+        // 1. The const address space (atomics on consts are UB anyway)
+        // 2. The thread-local address space (which should only be accessible to one thread anyway?)
+        // So, we do the following:
+        // 1. Check if the pointer is in one of the address spaces atomics support.
+        //    a) If so, we perform the atomic operation.
+        // 2. Check if the pointer is in the thread-local address space. If it is, we use non-atomic ops,
+        //    **ASSUMING** only the current thread can access thread-local memory. (FIXME: is this sound?)
+        // 3. If the pointer is neither in a supported address space nor thread-local, we bail and trap.
+
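+        // The generated control flow looks roughly like this:
+        //
+        //          cond_br (isspacep.global || isspacep.shared)
+        //            /                              \
+        //   atomic_space_supported          atomic_space_unsupported
+        //     (real atomic op)               cond_br isspacep.local
+        //            |                         /               \
+        //            |              atomic_local_space    atomic_space_ub
+        //            |             (non-atomic emulation)  (abort; unreachable)
+        //             \                     /
+        //               atomic_op_done (phi)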
+        // We check if the `dst` pointer is in the `global` address space.
+        let (isspacep_global_ty, isspacep_global_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.global");
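+        // The `isspacep.*` intrinsics take a generic pointer and return an i1 predicate.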
+        let isspacep_global = self.call(
+            isspacep_global_ty,
+            None,
+            None,
+            isspacep_global_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // We check if the `dst` pointer is in the `shared` address space.
+        let (isspacep_shared_ty, isspacep_shared_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.shared");
+        let isspacep_shared = self.call(
+            isspacep_shared_ty,
+            None,
+            None,
+            isspacep_shared_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // Combine those to check if we are in a supported address space.
+        let atomic_supported_addrspace = self.or(isspacep_shared, isspacep_global);
+        // We create 2 blocks here: one we branch to if the pointer is in a supported address space,
+        // and one we branch to otherwise.
+        let supported_bb = self.append_sibling_block("atomic_space_supported");
+        let unsupported_bb = self.append_sibling_block("atomic_space_unsupported");
+        self.cond_br(atomic_supported_addrspace, supported_bb, unsupported_bb);
+        // We also create a "merge" block we will jump to after the atomic ops finish.
+        let merge_bb = self.append_sibling_block("atomic_op_done");
+        // Execute the atomic op if supported, then jump to the merge block.
+        self.switch_to_block(supported_bb);
+        let supported_res = atomic_supported(self, dst);
+        self.br(merge_bb);
+        // Check if the pointer is in the thread-local space. If so, we can emulate the atomic.
+        self.switch_to_block(unsupported_bb);
+        let (isspacep_local_ty, isspacep_local_fn) = self.get_intrinsic("llvm.nvvm.isspacep.local");
+        let isspacep_local = self.call(
+            isspacep_local_ty,
+            None,
+            None,
+            isspacep_local_fn,
+            &[dst],
+            None,
+            None,
+        );
+        let local_bb = self.append_sibling_block("atomic_local_space");
+        let atomic_ub_bb = self.append_sibling_block("atomic_space_ub");
+        self.cond_br(isspacep_local, local_bb, atomic_ub_bb);
+        // The pointer is in the thread-local space.
+        self.switch_to_block(local_bb);
+        let local_res = emulate_local(self, dst);
+        self.br(merge_bb);
+        // The pointer is neither in a supported address space nor in the local space.
+        // This is very likely UB, so we trap here.
+        // TODO: should we print some kind of message here? NVVM supports printf.
+        self.switch_to_block(atomic_ub_bb);
+        self.abort();
+        self.unreachable();
+        // The atomic operation has finished, and we can now switch to the merge block.
+        self.switch_to_block(merge_bb);
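+        // The UB block never reaches the merge block, so the phi only has to merge the
+        // real atomic result with the thread-local emulation result.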
+        self.phi(
+            self.val_ty(local_res),
+            &[supported_res, local_res],
+            &[supported_bb, local_bb],
+        )
+    }
+}