Skip to content

Commit a80705a

Browse files
authored
Merge pull request #8710 from awlauria/ppc_atomics_v5.0.x
v5.0.x: Powerpc atomics: Force usage of powerpc assembly.
2 parents 38030ce + 6985eef commit a80705a

File tree

4 files changed

+268
-2
lines changed

4 files changed

+268
-2
lines changed

config/opal_config_asm.m4

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1191,6 +1191,18 @@ AC_DEFUN([OPAL_CONFIG_ASM],[
11911191
AC_MSG_ERROR([Could not determine PowerPC word size: $ac_cv_sizeof_long])
11921192
fi
11931193
OPAL_GCC_INLINE_ASSIGN='"1: li %0,0" : "=&r"(ret)'
1194+
1195+
# See the following github PR and some performance numbers/discussion:
1196+
# https://github.com/open-mpi/ompi/pull/8649
1197+
AC_MSG_CHECKING([$opal_cv_asm_arch: Checking if force gcc atomics requested])
# The flag is quoted (with an "x" prefix) so that an unset or empty
# value cannot turn the test into a shell syntax error.
if test "x$force_gcc_atomics_ppc" = "x0" ; then
    AC_MSG_RESULT([no])
    # Builtin atomics were not requested: mark them as not to be used.
    opal_cv_asm_builtin="BUILTIN_NO"
else
    AC_MSG_RESULT([yes])
    AC_MSG_WARN([$opal_cv_asm_arch: gcc atomics have been known to perform poorly on powerpc.])
fi
1205+
11941206
;;
11951207
*)
11961208
if test $opal_cv_have___atomic = "yes" ; then
@@ -1284,6 +1296,8 @@ int main(int argc, char* argv[])
12841296
[Whether to use builtin atomics])
12851297
AC_SUBST([OPAL_ASSEMBLY_BUILTIN])
12861298
1299+
OPAL_SUMMARY_ADD([[Atomics]],[[OMPI]],[],[$opal_cv_asm_builtin])
1300+
12871301
OPAL_ASM_FIND_FILE
12881302
12891303
unset result asm_format

config/opal_configure_options.m4

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,13 @@ else
8484
WANT_BRANCH_PROBABILITIES=0
8585
fi
8686

87+
# Configure-time switch --enable-builtin-atomics-for-ppc.  It is
# translated into the shell variable force_gcc_atomics_ppc below.
AC_ARG_ENABLE([builtin-atomics-for-ppc],[AS_HELP_STRING([--enable-builtin-atomics-for-ppc],
88+
[POWER architectures only: Force use of builtin atomics if available. This could either be gcc builtins or C11 atomics, depending on what is available on your system. Enabling this is known to cause poor performance in atomic operations on Power machines. (default: disabled)])])
89+
# Only an explicit "yes" sets the flag to 1; any other value, including
# the option being absent, leaves it at 0.
if test "x$enable_builtin_atomics_for_ppc" = "xyes" ; then
90+
force_gcc_atomics_ppc=1
91+
else
92+
force_gcc_atomics_ppc=0
93+
fi
8794

8895
#
8996
# Memory debugging

test/threads/Makefile.am

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
# University of Stuttgart. All rights reserved.
1111
# Copyright (c) 2004-2005 The Regents of the University of California.
1212
# All rights reserved.
13-
# Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
13+
# Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
14+
# Copyright (c) 2021 IBM Corporation. All rights reserved.
1415
# $COPYRIGHT$
1516
#
1617
# Additional copyrights may follow
@@ -24,7 +25,8 @@ AM_LDFLAGS = -lpthread
2425

2526
check_PROGRAMS = \
2627
opal_thread \
27-
opal_condition
28+
opal_condition \
29+
opal_atomic_thread_bench
2830

2931
# JMS possibly to be re-added when #1232 is fixed
3032
#TESTS = $(check_PROGRAMS)
@@ -42,5 +44,11 @@ opal_condition_LDADD = \
4244
$(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
4345
opal_condition_DEPENDENCIES = $(opal_condition_LDADD)
4446

47+
# Multi-threaded micro-benchmark of the OPAL atomic operations.  It links
# against the test support library and the OPAL library.
opal_atomic_thread_bench_SOURCES = opal_atomic_thread_bench.c
opal_atomic_thread_bench_LDADD = \
        $(top_builddir)/test/support/libsupport.a \
        $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la
opal_atomic_thread_bench_DEPENDENCIES = $(opal_atomic_thread_bench_LDADD)
52+
4553
distclean:
4654
rm -rf *.dSYM .deps .libs *.log *.o *.trs $(check_PROGRAMS) Makefile
test/threads/opal_atomic_thread_bench.c

Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
/*
2+
* Copyright (c) 2021 IBM Corporation. All rights reserved.
3+
* Additional copyrights may follow
4+
*
5+
* $HEADER$
6+
*/
7+
8+
#include "opal_config.h"

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>

#include "support.h"
#include "opal/runtime/opal.h"
#include "opal/constants.h"
#include "opal/mca/threads/threads.h"
#include "opal/mca/threads/condition.h"
#include "opal/sys/atomic.h"
19+
20+
21+
#define OPAL_TEST_THREAD_COUNT 8
22+
#define ITERATIONS 1000000
23+
#define ITEM_COUNT 100
24+
25+
static opal_atomic_int64_t var_64 = 0;
26+
static opal_atomic_int32_t var_32 = 0;
27+
static pthread_barrier_t barrier;
28+
29+
#if !defined(timersub)
/*
 * Fallback for platforms whose <sys/time.h> does not provide timersub().
 *
 * Stores (*a - *b) in *r, normalized so that 0 <= r->tv_usec < 1000000
 * when both inputs are normalized.  The borrow is applied to the result
 * only, so neither input operand is modified.
 */
#define timersub(a, b, r)                                   \
    do {                                                    \
        (r)->tv_sec = (a)->tv_sec - (b)->tv_sec;            \
        (r)->tv_usec = (a)->tv_usec - (b)->tv_usec;         \
        if ((r)->tv_usec < 0) {                             \
            (r)->tv_sec--;                                  \
            (r)->tv_usec += 1000000;                        \
        }                                                   \
    } while (0)
#endif
40+
41+
42+
#if !defined(OPAL_TEST_DONE)
/*
 * Finish one timed benchmark section.
 *
 * Expects the locals `start`, `stop`, `total` (struct timeval) and `timing`
 * (double) to exist at the call site, with `start` already filled in.
 * Takes the stop time, prints the elapsed time and the average time per
 * iteration (elapsed / ITERATIONS, in nanoseconds), clears the three
 * timevals, flushes stdout and then waits on the shared barrier.
 *
 * func: name printed in the report line.
 * val:  currently unused; kept so existing call sites stay valid.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement,
 * e.g. inside an unbraced if/else.
 */
#define OPAL_TEST_DONE(func, val)                                                       \
    do {                                                                                \
        gettimeofday (&stop, NULL);                                                     \
        timersub(&stop, &start, &total);                                                \
        timing = ((double) total.tv_sec + (double) total.tv_usec * 1e-6)                \
                 / (double) ITERATIONS;                                                 \
        printf ("%s() thread finished. Time: %d s %d us %d nsec/per\n", (func),         \
                (int) total.tv_sec, (int) total.tv_usec, (int) (timing / 1e-9));        \
        memset(&stop, 0, sizeof(struct timeval));                                       \
        memset(&start, 0, sizeof(struct timeval));                                      \
        memset(&total, 0, sizeof(struct timeval));                                      \
        fflush(stdout);                                                                 \
        pthread_barrier_wait (&barrier);                                                \
    } while (0)
#endif
57+
58+
#if !defined(OPAL_RESET_VAR)
/*
 * Reset a shared benchmark variable to 0, then wait on the shared barrier
 * so that no thread starts the next benchmark before every thread has
 * performed the reset.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement.
 */
#define OPAL_RESET_VAR(var)                     \
    do {                                        \
        (var) = 0;                              \
        pthread_barrier_wait (&barrier);        \
    } while (0)
#endif
64+
65+
static void *thread_test (void *arg) {
66+
struct timeval start, stop, total;
67+
double timing;
68+
69+
gettimeofday (&start, NULL);
70+
for (int64_t i = 0 ; i < ITERATIONS ; ++i) {
71+
opal_atomic_compare_exchange_strong_64(&var_64, &i, i+1);
72+
}
73+
OPAL_TEST_DONE("opal_atomic_compare_exchange_strong_64", var_64);
74+
75+
OPAL_RESET_VAR(var_64);
76+
77+
gettimeofday (&start, NULL);
78+
for (int64_t i = 0 ; i < ITERATIONS ; ++i) {
79+
opal_atomic_compare_exchange_strong_rel_64(&var_64, &i, i+1);
80+
}
81+
OPAL_TEST_DONE("opal_atomic_compare_exchange_strong_rel_64", var_64);
82+
83+
OPAL_RESET_VAR(var_64);
84+
85+
gettimeofday (&start, NULL);
86+
for (int64_t i = 0 ; i < ITERATIONS ; ++i) {
87+
opal_atomic_compare_exchange_strong_acq_64(&var_64, &i, i+1);
88+
}
89+
OPAL_TEST_DONE("opal_atomic_compare_exchange_strong_acq_64", var_64);
90+
91+
OPAL_RESET_VAR(var_64);
92+
93+
gettimeofday (&start, NULL);
94+
for (int64_t i = 0 ; i < ITERATIONS ; ++i) {
95+
opal_atomic_fetch_add_64(&var_64, 1);
96+
}
97+
OPAL_TEST_DONE("opal_atomic_fetch_add_64", var_64);
98+
99+
OPAL_RESET_VAR(var_64);
100+
101+
gettimeofday (&start, NULL);
102+
for (int64_t i = 0 ; i < ITERATIONS ; ++i) {
103+
opal_atomic_fetch_sub_64(&var_64, 1);
104+
}
105+
OPAL_TEST_DONE("opal_atomic_fetch_sub_64", var_64);
106+
107+
OPAL_RESET_VAR(var_64);
108+
109+
gettimeofday (&start, NULL);
110+
for (int64_t i = 0 ; i < ITERATIONS ; ++i) {
111+
opal_atomic_fetch_xor_64(&var_64, i);
112+
}
113+
OPAL_TEST_DONE("opal_atomic_fetch_xor_64", var_64);
114+
115+
OPAL_RESET_VAR(var_64);
116+
117+
gettimeofday (&start, NULL);
118+
for (int64_t i = 0 ; i < ITERATIONS ; ++i) {
119+
opal_atomic_swap_64(&var_64, i);
120+
}
121+
OPAL_TEST_DONE("opal_atomic_swap_64", var_64);
122+
123+
OPAL_RESET_VAR(var_64);
124+
125+
#if OPAL_HAVE_ATOMIC_LLSC_64
126+
gettimeofday (&start, NULL);
127+
for (int64_t i = 0 ; i < ITERATIONS ; ++i) {
128+
int ret;
129+
opal_atomic_sc_64(&var_64, i, ret);
130+
}
131+
OPAL_TEST_DONE("opal_atomic_sc_64", var_64);
132+
133+
OPAL_RESET_VAR(var_64);
134+
135+
gettimeofday (&start, NULL);
136+
for (int64_t i = 0 ; i < ITERATIONS ; ++i) {
137+
int ret;
138+
opal_atomic_sc_64(&var_64, i, ret);
139+
}
140+
OPAL_TEST_DONE("opal_atomic_ll_64", var_64);
141+
142+
OPAL_RESET_VAR(var_64);
143+
#endif
144+
145+
gettimeofday (&start, NULL);
146+
for (int32_t i = 0 ; i < ITERATIONS ; ++i) {
147+
opal_atomic_compare_exchange_strong_32(&var_32, &i, i+1);
148+
}
149+
OPAL_TEST_DONE("opal_atomic_compare_exchange_strong_32", var_32);
150+
151+
OPAL_RESET_VAR(var_32);
152+
153+
gettimeofday (&start, NULL);
154+
for (int32_t i = 0 ; i < ITERATIONS ; ++i) {
155+
opal_atomic_compare_exchange_strong_rel_32(&var_32, &i, i+1);
156+
}
157+
OPAL_TEST_DONE("opal_atomic_compare_exchange_strong_rel_32", var_32);
158+
159+
OPAL_RESET_VAR(var_32);
160+
161+
gettimeofday (&start, NULL);
162+
for (int32_t i = 0 ; i < ITERATIONS ; ++i) {
163+
opal_atomic_compare_exchange_strong_acq_32(&var_32, &i, i+1);
164+
}
165+
OPAL_TEST_DONE("opal_atomic_compare_exchange_strong_acq_32", var_32);
166+
167+
OPAL_RESET_VAR(var_32);
168+
169+
gettimeofday (&start, NULL);
170+
for (int32_t i = 0 ; i < ITERATIONS ; ++i) {
171+
opal_atomic_fetch_add_32(&var_32, 1);
172+
}
173+
OPAL_TEST_DONE("opal_atomic_fetch_add_32", var_32);
174+
175+
OPAL_RESET_VAR(var_32);
176+
177+
gettimeofday (&start, NULL);
178+
for (int32_t i = 0 ; i < ITERATIONS ; ++i) {
179+
opal_atomic_fetch_sub_32(&var_32, 1);
180+
}
181+
OPAL_TEST_DONE("opal_atomic_fetch_sub_32", var_32);
182+
183+
OPAL_RESET_VAR(var_32);
184+
185+
gettimeofday (&start, NULL);
186+
for (int32_t i = 0 ; i < ITERATIONS ; ++i) {
187+
opal_atomic_fetch_xor_32(&var_32, i);
188+
}
189+
OPAL_TEST_DONE("opal_atomic_fetch_xor_32", var_32);
190+
191+
OPAL_RESET_VAR(var_32);
192+
193+
gettimeofday (&start, NULL);
194+
for (int32_t i = 0 ; i < ITERATIONS ; ++i) {
195+
opal_atomic_swap_32(&var_32, i);
196+
}
197+
OPAL_TEST_DONE("opal_atomic_swap_32", var_32);
198+
199+
OPAL_RESET_VAR(var_32);
200+
201+
#if OPAL_HAVE_ATOMIC_LLSC_32
202+
gettimeofday (&start, NULL);
203+
for (int32_t i = 0 ; i < ITERATIONS ; ++i) {
204+
int ret;
205+
opal_atomic_sc_32(&var_32, i, ret);
206+
}
207+
OPAL_TEST_DONE("opal_atomic_sc_32", var_32);
208+
209+
OPAL_RESET_VAR(var_32);
210+
211+
gettimeofday (&start, NULL);
212+
for (int32_t i = 0 ; i < ITERATIONS ; ++i) {
213+
int ret;
214+
opal_atomic_sc_32(&var_32, i, ret);
215+
}
216+
OPAL_TEST_DONE("opal_atomic_ll_32", var_32);
217+
218+
OPAL_RESET_VAR(var_32);
219+
#endif
220+
221+
return NULL;
222+
}
223+
224+
int main(void) {
225+
226+
pthread_barrier_init (&barrier, NULL, OPAL_TEST_THREAD_COUNT);
227+
228+
pthread_t ts[OPAL_TEST_THREAD_COUNT];
229+
for(int i = 0; i < OPAL_TEST_THREAD_COUNT; i++) {
230+
pthread_create(&ts[i], NULL, &thread_test, NULL);
231+
}
232+
233+
for(int i = 0; i < OPAL_TEST_THREAD_COUNT; i++) {
234+
pthread_join(ts[i], NULL);
235+
}
236+
return 0;
237+
}

0 commit comments

Comments
 (0)