Skip to content

Commit dd9c1da

Browse files
committed
opal: enable load-linked, store-conditional atomics for AArch64
This PR updates the opal atomic code to allow the use of the AArch64 LL/SC instructions even when C11 atomics are enabled. This should provide for better atomic lifo/fifo performance on these systems. Performance on Apple Silicon (Late 2021 Mac Mini M1, 16GB): LL/SC: ``` Mac-mini:class hjelmn$ ./opal_lifo -t 1 Single thread test. Time: 0 s 13621 us 13 nsec/poppush Atomics thread finished. Time: 0 s 14375 us 14 nsec/poppush Atomics thread finished. Time: 0 s 154525 us 154 nsec/poppush Atomics thread finished. Time: 0 s 154661 us 154 nsec/poppush Atomics thread finished. Time: 0 s 156505 us 156 nsec/poppush Atomics thread finished. Time: 0 s 157013 us 157 nsec/poppush Atomics thread finished. Time: 0 s 157493 us 157 nsec/poppush Atomics thread finished. Time: 0 s 158275 us 158 nsec/poppush Atomics thread finished. Time: 0 s 158647 us 158 nsec/poppush Atomics thread finished. Time: 0 s 158973 us 158 nsec/poppush All threads finished. Thread count: 8 Time: 0 s 159023 us 19 nsec/poppush SUPPORT: OMPI Test Passed: opal_lifo_t: (7 tests) ``` ``` Mac-mini:class hjelmn$ ./opal_fifo Single thread test. Time: 0 s 7620 us 7 nsec/poppush Atomics thread finished. Time: 0 s 7918 us 7 nsec/poppush Atomics thread finished. Time: 0 s 76081 us 76 nsec/poppush Atomics thread finished. Time: 0 s 79458 us 79 nsec/poppush Atomics thread finished. Time: 0 s 84994 us 84 nsec/poppush Atomics thread finished. Time: 0 s 90103 us 90 nsec/poppush Atomics thread finished. Time: 0 s 90403 us 90 nsec/poppush Atomics thread finished. Time: 0 s 91280 us 91 nsec/poppush Atomics thread finished. Time: 0 s 92466 us 92 nsec/poppush Atomics thread finished. Time: 0 s 93835 us 93 nsec/poppush All threads finished. Thread count: 8 Time: 0 s 93916 us 11 nsec/poppush Exhaustive atomics thread finished. Popped 821530 items. Time: 0 s 107912 us 131 nsec/poppush Exhaustive atomics thread finished. Popped 810445 items. Time: 0 s 114695 us 141 nsec/poppush Exhaustive atomics thread finished. Popped 806449 items. 
Time: 0 s 116241 us 144 nsec/poppush Exhaustive atomics thread finished. Popped 813960 items. Time: 0 s 117182 us 143 nsec/poppush Exhaustive atomics thread finished. Popped 825230 items. Time: 0 s 118810 us 143 nsec/poppush Exhaustive atomics thread finished. Popped 826685 items. Time: 0 s 119486 us 144 nsec/poppush Exhaustive atomics thread finished. Popped 828373 items. Time: 0 s 120327 us 145 nsec/poppush Exhaustive atomics thread finished. Popped 830266 items. Time: 0 s 121114 us 145 nsec/poppush All threads finished. Thread count: 8 Time: 0 s 121186 us 15 nsec/poppush SUPPORT: OMPI Test Passed: opal_fifo_t: (8 tests) ``` CAS128: ``` Mac-mini:class hjelmn$ ./opal_lifo -t 1 Single thread test. Time: 0 s 25688 us 25 nsec/poppush Atomics thread finished. Time: 0 s 29322 us 29 nsec/poppush Atomics thread finished. Time: 4 s 57595 us 4057 nsec/poppush Atomics thread finished. Time: 4 s 151568 us 4151 nsec/poppush Atomics thread finished. Time: 4 s 162332 us 4162 nsec/poppush Atomics thread finished. Time: 4 s 173651 us 4173 nsec/poppush Atomics thread finished. Time: 4 s 176088 us 4176 nsec/poppush Atomics thread finished. Time: 4 s 178025 us 4178 nsec/poppush Atomics thread finished. Time: 4 s 178713 us 4178 nsec/poppush Atomics thread finished. Time: 4 s 178760 us 4178 nsec/poppush All threads finished. Thread count: 8 Time: 4 s 178830 us 522 nsec/poppush SUPPORT: OMPI Test Passed: opal_lifo_t: (7 tests) ``` ``` Mac-mini:class hjelmn$ ./opal_fifo Single thread test. Time: 0 s 7611 us 7 nsec/poppush Atomics thread finished. Time: 0 s 19256 us 19 nsec/poppush Atomics thread finished. Time: 2 s 555095 us 2555 nsec/poppush Atomics thread finished. Time: 2 s 562521 us 2562 nsec/poppush Atomics thread finished. Time: 2 s 570284 us 2570 nsec/poppush Atomics thread finished. Time: 2 s 570760 us 2570 nsec/poppush Atomics thread finished. Time: 2 s 571438 us 2571 nsec/poppush Atomics thread finished. Time: 2 s 573642 us 2573 nsec/poppush Atomics thread finished. 
Time: 2 s 575019 us 2575 nsec/poppush Atomics thread finished. Time: 2 s 575161 us 2575 nsec/poppush All threads finished. Thread count: 8 Time: 2 s 575231 us 321 nsec/poppush Exhaustive atomics thread finished. Popped 639525 items. Time: 1 s 828167 us 2858 nsec/poppush Exhaustive atomics thread finished. Popped 642578 items. Time: 1 s 840312 us 2863 nsec/poppush Exhaustive atomics thread finished. Popped 641617 items. Time: 1 s 846852 us 2878 nsec/poppush Exhaustive atomics thread finished. Popped 639283 items. Time: 1 s 849705 us 2893 nsec/poppush Exhaustive atomics thread finished. Popped 646423 items. Time: 1 s 851183 us 2863 nsec/poppush Exhaustive atomics thread finished. Popped 645146 items. Time: 1 s 851750 us 2870 nsec/poppush Exhaustive atomics thread finished. Popped 645428 items. Time: 1 s 852076 us 2869 nsec/poppush Exhaustive atomics thread finished. Popped 648267 items. Time: 1 s 852240 us 2857 nsec/poppush All threads finished. Thread count: 8 Time: 1 s 852359 us 231 nsec/poppush SUPPORT: OMPI Test Passed: opal_fifo_t: (8 tests) ``` This is about a 40x performance improvement in the multi-threaded lifo/fifo tests. These are artificial benchmarks but give a reasonable idea of how these structures perform under heavy contention. Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 3f02faf commit dd9c1da

File tree

9 files changed

+71
-92
lines changed

9 files changed

+71
-92
lines changed

opal/class/opal_fifo.h

+8-2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
* Copyright (c) 2010 IBM Corporation. All rights reserved.
1515
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
1616
* reserved.
17+
* Copyright (c) 2021 Triad National Security, LLC. All rights reserved.
18+
* Copyright (c) 2021 Google, LLC. All rights reserved.
1719
* $COPYRIGHT$
1820
*
1921
* Additional copyrights may follow
@@ -76,7 +78,7 @@ static inline bool opal_fifo_is_empty( opal_fifo_t* fifo )
7678
return opal_fifo_head (fifo) == &fifo->opal_fifo_ghost;
7779
}
7880

79-
#if OPAL_HAVE_ATOMIC_COMPARE_EXCHANGE_128
81+
#if OPAL_HAVE_ATOMIC_COMPARE_EXCHANGE_128 && !OPAL_HAVE_ATOMIC_LLSC_PTR
8082

8183
/* Add one element to the FIFO. We will return the last head of the list
8284
* to allow the upper level to detect if this element is the first one in the
@@ -121,8 +123,12 @@ static inline opal_list_item_t *opal_fifo_pop_atomic (opal_fifo_t *fifo)
121123
opal_counted_pointer_t head, tail;
122124

123125
opal_read_counted_pointer (&fifo->opal_fifo_head, &head);
124-
126+
int attempts = 0;
125127
do {
128+
129+
if (++attempts == 5) {
130+
_opal_lifo_release_cpu ();
131+
}
126132
tail.value = fifo->opal_fifo_tail.value;
127133
opal_atomic_rmb ();
128134

opal/class/opal_lifo.h

+5-3
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
* reserved.
1717
* Copyright (c) 2016-2018 Research Organization for Information Science
1818
* and Technology (RIST). All rights reserved.
19+
* Copyright (c) 2021 Triad National Security, LLC. All rights reserved.
20+
* Copyright (c) 2021 Google, LLC. All rights reserved.
1921
* $COPYRIGHT$
2022
*
2123
* Additional copyrights may follow
@@ -50,7 +52,7 @@ union opal_counted_pointer_t {
5052
/** list item pointer */
5153
volatile opal_atomic_intptr_t item;
5254
} data;
53-
#if OPAL_HAVE_ATOMIC_COMPARE_EXCHANGE_128 && HAVE_OPAL_INT128_T
55+
#if OPAL_HAVE_ATOMIC_COMPARE_EXCHANGE_128 && HAVE_OPAL_INT128_T && !OPAL_HAVE_ATOMIC_LLSC_PTR
5456
/** used for atomics when there is a cmpset that can operate on
5557
* two 64-bit values */
5658
opal_atomic_int128_t atomic_value;
@@ -60,7 +62,7 @@ union opal_counted_pointer_t {
6062
typedef union opal_counted_pointer_t opal_counted_pointer_t;
6163

6264

63-
#if OPAL_HAVE_ATOMIC_COMPARE_EXCHANGE_128
65+
#if OPAL_HAVE_ATOMIC_COMPARE_EXCHANGE_128 && !OPAL_HAVE_ATOMIC_LLSC_PTR
6466

6567
/* Add one element to the FIFO. We will return the last head of the list
6668
* to allow the upper level to detect if this element is the first one in the
@@ -136,7 +138,7 @@ static inline bool opal_lifo_is_empty( opal_lifo_t* lifo )
136138
}
137139

138140

139-
#if OPAL_HAVE_ATOMIC_COMPARE_EXCHANGE_128
141+
#if OPAL_HAVE_ATOMIC_COMPARE_EXCHANGE_128 && !OPAL_HAVE_ATOMIC_LLSC_PTR
140142

141143
/* Add one element to the LIFO. We will return the last head of the list
142144
* to allow the upper level to detect if this element is the first one in the

opal/include/opal/sys/arm64/Makefile.am

+3
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
# University of Stuttgart. All rights reserved.
1010
# Copyright (c) 2004-2005 The Regents of the University of California.
1111
# All rights reserved.
12+
# Copyright (c) 2021 Triad National Security, LLC. All rights reserved.
13+
# Copyright (c) 2021 Google, LLC. All rights reserved.
1214
# $COPYRIGHT$
1315
#
1416
# Additional copyrights may follow
@@ -20,5 +22,6 @@
2022

2123
headers += \
2224
opal/sys/arm64/atomic.h \
25+
opal/sys/arm64/atomic_llsc.h \
2326
opal/sys/arm64/timer.h
2427

opal/include/opal/sys/arm64/atomic.h

+4-54
Original file line numberDiff line numberDiff line change
@@ -14,27 +14,29 @@
1414
* Copyright (c) 2010 ARM ltd. All rights reserved.
1515
* Copyright (c) 2016-2018 Los Alamos National Security, LLC. All rights
1616
* reserved.
17+
* Copyright (c) 2021 Triad National Security, LLC. All rights reserved.
18+
* Copyright (c) 2021 Google, LLC. All rights reserved.
1719
* $COPYRIGHT$
1820
*
1921
* Additional copyrights may follow
2022
*
2123
* $HEADER$
2224
*/
2325

26+
#include "atomic_llsc.h"
27+
2428
#if !defined(OPAL_SYS_ARCH_ATOMIC_H)
2529

2630
#define OPAL_SYS_ARCH_ATOMIC_H 1
2731

2832
#if OPAL_GCC_INLINE_ASSEMBLY
2933

3034
#define OPAL_HAVE_ATOMIC_MEM_BARRIER 1
31-
#define OPAL_HAVE_ATOMIC_LLSC_32 1
3235
#define OPAL_HAVE_ATOMIC_COMPARE_EXCHANGE_32 1
3336
#define OPAL_HAVE_ATOMIC_SWAP_32 1
3437
#define OPAL_HAVE_ATOMIC_MATH_32 1
3538
#define OPAL_HAVE_ATOMIC_COMPARE_EXCHANGE_64 1
3639
#define OPAL_HAVE_ATOMIC_SWAP_64 1
37-
#define OPAL_HAVE_ATOMIC_LLSC_64 1
3840
#define OPAL_HAVE_ATOMIC_ADD_32 1
3941
#define OPAL_HAVE_ATOMIC_AND_32 1
4042
#define OPAL_HAVE_ATOMIC_OR_32 1
@@ -162,32 +164,6 @@ static inline bool opal_atomic_compare_exchange_strong_rel_32 (opal_atomic_int32
162164
return ret;
163165
}
164166

165-
#define opal_atomic_ll_32(addr, ret) \
166-
do { \
167-
opal_atomic_int32_t *_addr = (addr); \
168-
int32_t _ret; \
169-
\
170-
__asm__ __volatile__ ("ldaxr %w0, [%1] \n" \
171-
: "=&r" (_ret) \
172-
: "r" (_addr)); \
173-
\
174-
ret = (typeof(ret)) _ret; \
175-
} while (0)
176-
177-
#define opal_atomic_sc_32(addr, newval, ret) \
178-
do { \
179-
opal_atomic_int32_t *_addr = (addr); \
180-
int32_t _newval = (int32_t) newval; \
181-
int _ret; \
182-
\
183-
__asm__ __volatile__ ("stlxr %w0, %w2, [%1] \n" \
184-
: "=&r" (_ret) \
185-
: "r" (_addr), "r" (_newval) \
186-
: "cc", "memory"); \
187-
\
188-
ret = (_ret == 0); \
189-
} while (0)
190-
191167
static inline bool opal_atomic_compare_exchange_strong_64 (opal_atomic_int64_t *addr, int64_t *oldval, int64_t newval)
192168
{
193169
int64_t prev;
@@ -272,32 +248,6 @@ static inline bool opal_atomic_compare_exchange_strong_rel_64 (opal_atomic_int64
272248
return ret;
273249
}
274250

275-
#define opal_atomic_ll_64(addr, ret) \
276-
do { \
277-
opal_atomic_int64_t *_addr = (addr); \
278-
int64_t _ret; \
279-
\
280-
__asm__ __volatile__ ("ldaxr %0, [%1] \n" \
281-
: "=&r" (_ret) \
282-
: "r" (_addr)); \
283-
\
284-
ret = (typeof(ret)) _ret; \
285-
} while (0)
286-
287-
#define opal_atomic_sc_64(addr, newval, ret) \
288-
do { \
289-
opal_atomic_int64_t *_addr = (addr); \
290-
int64_t _newval = (int64_t) newval; \
291-
int _ret; \
292-
\
293-
__asm__ __volatile__ ("stlxr %w0, %2, [%1] \n" \
294-
: "=&r" (_ret) \
295-
: "r" (_addr), "r" (_newval) \
296-
: "cc", "memory"); \
297-
\
298-
ret = (_ret == 0); \
299-
} while (0)
300-
301251
#define OPAL_ASM_MAKE_ATOMIC(type, bits, name, inst, reg) \
302252
static inline type opal_atomic_fetch_ ## name ## _ ## bits (opal_atomic_ ## type *addr, type value) \
303253
{ \

opal/include/opal/sys/atomic.h

+48-6
Original file line numberDiff line numberDiff line change
@@ -59,19 +59,19 @@
5959
#include "opal/sys/architecture.h"
6060
#include "opal_stdatomic.h"
6161

62-
#if OPAL_ASSEMBLY_BUILTIN == OPAL_BUILTIN_C11 && !defined(__INTEL_COMPILER)
63-
64-
#include "atomic_stdc.h"
65-
66-
#else /* !OPAL_C_HAVE__ATOMIC */
67-
6862
/* do some quick #define cleanup in cases where we are doing
6963
testing... */
7064
#ifdef OPAL_DISABLE_INLINE_ASM
7165
#undef OPAL_C_GCC_INLINE_ASSEMBLY
7266
#define OPAL_C_GCC_INLINE_ASSEMBLY 0
7367
#endif
7468

69+
#if OPAL_ASSEMBLY_BUILTIN == OPAL_BUILTIN_C11 && !defined(__INTEL_COMPILER)
70+
71+
#include "atomic_stdc.h"
72+
73+
#else /* !OPAL_C_HAVE__ATOMIC */
74+
7575
/* define OPAL_{GCC,DEC,XLC}_INLINE_ASSEMBLY based on the
7676
OPAL_C_{GCC,DEC,XLC}_INLINE_ASSEMBLY defines and whether we
7777
are in C or C++ */
@@ -642,6 +642,48 @@ static inline intptr_t opal_atomic_fetch_sub_ptr( opal_atomic_intptr_t* addr, vo
642642

643643
#endif /* !OPAL_C_HAVE__ATOMIC */
644644

645+
/****** load-linked, store-conditional atomic implementations ******/
646+
647+
/* C11 atomics do not expose the low-level load-linked, store-conditional
648+
* instructions. Open MPI can use these instructions to implement a more
649+
* efficient version of the lock-free lifo and fifo. On Apple Silicon the
650+
* LL/SC fifo and lifo are ~ 2-20x faster than the CAS128 implementation. */
651+
#if OPAL_ASSEMBLY_ARCH == OPAL_ARM64
652+
#include "opal/sys/arm64/atomic_llsc.h"
653+
#endif
654+
655+
#if !defined(OPAL_HAVE_ATOMIC_LLSC_32)
656+
#define OPAL_HAVE_ATOMIC_LLSC_32 0
657+
#endif
658+
659+
#if !defined(OPAL_HAVE_ATOMIC_LLSC_64)
660+
#define OPAL_HAVE_ATOMIC_LLSC_64 0
661+
#endif
662+
663+
#if (OPAL_HAVE_ATOMIC_LLSC_32 || OPAL_HAVE_ATOMIC_LLSC_64)
664+
665+
#if SIZEOF_VOID_P == 4 && OPAL_HAVE_ATOMIC_LLSC_32
666+
667+
#define opal_atomic_ll_ptr(addr, ret) opal_atomic_ll_32((opal_atomic_int32_t *) (addr), ret)
668+
#define opal_atomic_sc_ptr(addr, value, ret) opal_atomic_sc_32((opal_atomic_int32_t *) (addr), (intptr_t) (value), ret)
669+
670+
#define OPAL_HAVE_ATOMIC_LLSC_PTR 1
671+
672+
#elif SIZEOF_VOID_P == 8 && OPAL_HAVE_ATOMIC_LLSC_64
673+
674+
#define opal_atomic_ll_ptr(addr, ret) opal_atomic_ll_64((opal_atomic_int64_t *) (addr), ret)
675+
#define opal_atomic_sc_ptr(addr, value, ret) opal_atomic_sc_64((opal_atomic_int64_t *) (addr), (intptr_t) (value), ret)
676+
677+
#define OPAL_HAVE_ATOMIC_LLSC_PTR 1
678+
679+
#endif
680+
681+
#else
682+
683+
#define OPAL_HAVE_ATOMIC_LLSC_PTR 0
684+
685+
#endif /* (OPAL_HAVE_ATOMIC_LLSC_32 || OPAL_HAVE_ATOMIC_LLSC_64)*/
686+
645687
END_C_DECLS
646688

647689
#endif /* OPAL_SYS_ATOMIC_H */

opal/include/opal/sys/atomic_impl.h

-20
Original file line numberDiff line numberDiff line change
@@ -304,26 +304,6 @@ OPAL_ATOMIC_DEFINE_CMPXCG_PTR_XX(_rel_)
304304

305305
#endif /* (OPAL_HAVE_ATOMIC_SWAP_32 || OPAL_HAVE_ATOMIC_SWAP_64) */
306306

307-
#if (OPAL_HAVE_ATOMIC_LLSC_32 || OPAL_HAVE_ATOMIC_LLSC_64)
308-
309-
#if SIZEOF_VOID_P == 4 && OPAL_HAVE_ATOMIC_LLSC_32
310-
311-
#define opal_atomic_ll_ptr(addr, ret) opal_atomic_ll_32((opal_atomic_int32_t *) (addr), ret)
312-
#define opal_atomic_sc_ptr(addr, value, ret) opal_atomic_sc_32((opal_atomic_int32_t *) (addr), (intptr_t) (value), ret)
313-
314-
#define OPAL_HAVE_ATOMIC_LLSC_PTR 1
315-
316-
#elif SIZEOF_VOID_P == 8 && OPAL_HAVE_ATOMIC_LLSC_64
317-
318-
#define opal_atomic_ll_ptr(addr, ret) opal_atomic_ll_64((opal_atomic_int64_t *) (addr), ret)
319-
#define opal_atomic_sc_ptr(addr, value, ret) opal_atomic_sc_64((opal_atomic_int64_t *) (addr), (intptr_t) (value), ret)
320-
321-
#define OPAL_HAVE_ATOMIC_LLSC_PTR 1
322-
323-
#endif
324-
325-
#endif /* (OPAL_HAVE_ATOMIC_LLSC_32 || OPAL_HAVE_ATOMIC_LLSC_64)*/
326-
327307
#if !defined(OPAL_HAVE_ATOMIC_LLSC_PTR)
328308
#define OPAL_HAVE_ATOMIC_LLSC_PTR 0
329309
#endif

opal/include/opal/sys/atomic_stdc.h

-4
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,6 @@
5252
#define OPAL_HAVE_ATOMIC_XOR_64 1
5353
#define OPAL_HAVE_ATOMIC_SUB_64 1
5454

55-
#define OPAL_HAVE_ATOMIC_LLSC_32 0
56-
#define OPAL_HAVE_ATOMIC_LLSC_64 0
57-
#define OPAL_HAVE_ATOMIC_LLSC_PTR 0
58-
5955
#define OPAL_HAVE_ATOMIC_MIN_32 1
6056
#define OPAL_HAVE_ATOMIC_MAX_32 1
6157

test/class/opal_fifo.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,10 @@ static void *thread_test_exhaust (opal_object_t *arg) {
107107

108108
static bool check_fifo_consistency (opal_fifo_t *fifo, int expected_count)
109109
{
110-
volatile opal_list_item_t *volatile item;
110+
opal_list_item_t * item;
111111
int count;
112112

113-
for (count = 0, item = fifo->opal_fifo_head.data.item ; item != &fifo->opal_fifo_ghost ;
113+
for (count = 0, item = (opal_list_item_t *) fifo->opal_fifo_head.data.item ; item != &fifo->opal_fifo_ghost ;
114114
item = opal_list_get_next(item), count++);
115115

116116
return count == expected_count;

test/class/opal_lifo.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ static bool check_lifo_consistency (opal_lifo_t *lifo, int expected_count)
7070
opal_list_item_t *item;
7171
int count;
7272

73-
for (count = 0, item = lifo->opal_lifo_head.data.item ; item != &lifo->opal_lifo_ghost ;
73+
for (count = 0, item = (opal_list_item_t *) lifo->opal_lifo_head.data.item ; item != &lifo->opal_lifo_ghost ;
7474
item = opal_list_get_next(item), count++);
7575

7676
return count == expected_count;

0 commit comments

Comments
 (0)