From 13a4e0496758677acb0f4cd90cc752dc89a9945f Mon Sep 17 00:00:00 2001 From: Austen Lauria Date: Thu, 18 Mar 2021 13:24:21 -0400 Subject: [PATCH 1/3] Powerpc atomics: Force usage of powerpc assembly. The builtins used by default on Power have been shown to perform poorly. For the time being, force all compilers to use the inline assembly until atomic builtins catch-up. This changes the defaults for all compilers sans xl, including: gcc, clang, and pgi to use the assembly. Previously, all of the above were using C11 or the gcc builtins. Bonus: Add a configure flag to force Power machines to use the builtins/C11, depending on what is available. This will make future testing easier. Signed-off-by: Austen Lauria (cherry picked from commit e3f3c5bd3eff2890d3ea993ef1e4443ebfb86a0c) --- config/opal_config_asm.m4 | 12 ++++++++++++ config/opal_configure_options.m4 | 7 +++++++ 2 files changed, 19 insertions(+) diff --git a/config/opal_config_asm.m4 b/config/opal_config_asm.m4 index aed2139a88b..bd135c25a8a 100644 --- a/config/opal_config_asm.m4 +++ b/config/opal_config_asm.m4 @@ -1192,6 +1192,18 @@ AC_DEFUN([OPAL_CONFIG_ASM],[ AC_MSG_ERROR([Could not determine PowerPC word size: $ac_cv_sizeof_long]) fi OPAL_GCC_INLINE_ASSIGN='"1: li %0,0" : "=&r"(ret)' + + # See the following github PR and some performance numbers/discussion: + # https://github.com/open-mpi/ompi/pull/8649 + AC_MSG_CHECKING([$opal_cv_asm_arch: Checking if force gcc atomics requested]) + if test $force_gcc_atomics_ppc = 0 ; then + AC_MSG_RESULT([no]) + opal_cv_asm_builtin="BUILTIN_NO" + else + AC_MSG_RESULT([Yes]) + AC_MSG_WARN([$opal_cv_asm_arch: gcc atomics have been known to perform poorly on powerpc.]) + fi + ;; *) if test $opal_cv_have___atomic = "yes" ; then diff --git a/config/opal_configure_options.m4 b/config/opal_configure_options.m4 index 78dc6ba413c..734e91dadfb 100644 --- a/config/opal_configure_options.m4 +++ b/config/opal_configure_options.m4 @@ -84,6 +84,13 @@ else WANT_BRANCH_PROBABILITIES=0 
fi +AC_ARG_ENABLE([builtin-atomics-for-ppc],[AS_HELP_STRING([--enable-builtin-atomics-for-ppc], + [POWER architectures only: Force use of builtin atomics if available. This could either be gcc builtins or C11 atomics, depending on what is available on your system. Enabling this is known to cause poor performance in atomic operations on Power machines. (default: disabled)])]) +if test "x$enable_builtin_atomics_for_ppc" = "xyes" ; then +force_gcc_atomics_ppc=1 +else +force_gcc_atomics_ppc=0 +fi # # Memory debugging From 81f1489fd02ac77a3cd68050ac6cde941471ba5d Mon Sep 17 00:00:00 2001 From: Austen Lauria Date: Tue, 23 Mar 2021 14:59:33 -0400 Subject: [PATCH 2/3] Add chosen atomics to the configury output summary. Signed-off-by: Austen Lauria (cherry picked from commit f85e421571766cbf9c925d5d7b0e88495b6d3cfe) --- config/opal_config_asm.m4 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/config/opal_config_asm.m4 b/config/opal_config_asm.m4 index bd135c25a8a..da52039b01c 100644 --- a/config/opal_config_asm.m4 +++ b/config/opal_config_asm.m4 @@ -1297,6 +1297,8 @@ int main(int argc, char* argv[]) [Whether to use builtin atomics]) AC_SUBST([OPAL_ASSEMBLY_BUILTIN]) + OPAL_SUMMARY_ADD([[Atomics]],[[OMPI]],[],[$opal_cv_asm_builtin]) + OPAL_ASM_FIND_FILE unset result asm_format From 6985eef9a744c6de0e10dc70aade3c49c26f9684 Mon Sep 17 00:00:00 2001 From: Austen Lauria Date: Mon, 22 Mar 2021 11:32:34 -0400 Subject: [PATCH 3/3] Add benchmark for atomics calls. Similar to class/opal_lifo/fifo, but more granular to get a better idea of what is going on. Code was borrowed from those tests to make this one. 
Signed-off-by: Austen Lauria (cherry picked from commit 136213dcf8f4e687dcb4861ffc2a41235d4dc03b) --- test/threads/Makefile.am | 12 +- test/threads/opal_atomic_thread_bench.c | 237 ++++++++++++++++++++++++ 2 files changed, 247 insertions(+), 2 deletions(-) create mode 100644 test/threads/opal_atomic_thread_bench.c diff --git a/test/threads/Makefile.am b/test/threads/Makefile.am index 024b4f3ae0c..ffac218740d 100644 --- a/test/threads/Makefile.am +++ b/test/threads/Makefile.am @@ -10,7 +10,8 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. -# Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2021 IBM Corporation. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -24,7 +25,8 @@ AM_LDFLAGS = -lpthread check_PROGRAMS = \ opal_thread \ - opal_condition + opal_condition \ + opal_atomic_thread_bench # JMS possibly to be re-added when #1232 is fixed #TESTS = $(check_PROGRAMS) @@ -42,5 +44,11 @@ opal_condition_LDADD = \ $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la opal_condition_DEPENDENCIES = $(opal_condition_LDADD) +opal_atomic_thread_bench_SOURCES = opal_atomic_thread_bench.c +opal_atomic_thread_bench_LDADD = \ + $(top_builddir)/test/support/libsupport.a \ + $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la +opal_atomic_thread_bench_DEPENDENCIES = $(opal_atomic_thread_bench_LDADD) + distclean: rm -rf *.dSYM .deps .libs *.log *.o *.trs $(check_PROGRAMS) Makefile diff --git a/test/threads/opal_atomic_thread_bench.c b/test/threads/opal_atomic_thread_bench.c new file mode 100644 index 00000000000..2989be55dba --- /dev/null +++ b/test/threads/opal_atomic_thread_bench.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2021 IBM Corporation. All rights reserved. 
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/*
+ * Micro-benchmark for the OPAL atomic primitives.
+ *
+ * OPAL_TEST_THREAD_COUNT threads run thread_test() concurrently.  For each
+ * atomic operation, every thread issues ITERATIONS calls against a shared
+ * variable, prints the elapsed time and the average time per call, and then
+ * meets the other threads at a barrier before the next operation starts.
+ */
+
+#include "opal_config.h"
+
+#include <pthread.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "support.h"
+#include "opal/runtime/opal.h"
+#include "opal/constants.h"
+#include "opal/mca/threads/threads.h"
+#include "opal/mca/threads/condition.h"
+#include "opal/sys/atomic.h"
+
+
+/* Number of benchmark threads; also the participant count of the barrier. */
+#define OPAL_TEST_THREAD_COUNT 8
+/* Number of calls of each atomic operation issued by every thread. */
+#define ITERATIONS 1000000
+
+/* Shared targets of the 64-bit and 32-bit atomic operations. */
+static opal_atomic_int64_t var_64 = 0;
+static opal_atomic_int32_t var_32 = 0;
+/* Keeps the threads in lock-step between two benchmarks. */
+static pthread_barrier_t barrier;
+
+/*
+ * Fallback for platforms whose headers do not provide timersub():
+ * r = a - b, with tv_usec normalised to [0, 1000000).  Neither operand is
+ * modified.
+ */
+#if !defined(timersub)
+#define timersub(a, b, r) \
+    do { \
+        (r)->tv_sec = (a)->tv_sec - (b)->tv_sec; \
+        (r)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
+        if ((r)->tv_usec < 0) { \
+            (r)->tv_sec--; \
+            (r)->tv_usec += 1000000; \
+        } \
+    } while (0)
+#endif
+
+
+/*
+ * Stop the clock for the benchmark named `func`, print the elapsed time and
+ * the average time per iteration, clear the timers and wait for the other
+ * threads.  Uses the caller's locals `start`, `stop`, `total`
+ * (struct timeval) and `timing` (double).  `val` is accepted for
+ * source compatibility but is not used.
+ */
+#if !defined(OPAL_TEST_DONE)
+#define OPAL_TEST_DONE(func, val) \
+    do { \
+        gettimeofday(&stop, NULL); \
+        timersub(&stop, &start, &total); \
+        timing = ((double) total.tv_sec + (double) total.tv_usec * 1e-6) / (double) ITERATIONS; \
+        printf("%s() thread finished. Time: %d s %d us %d nsec/per\n", func, (int) total.tv_sec, \
+               (int) total.tv_usec, (int) (timing / 1e-9)); \
+        memset(&stop, 0, sizeof(struct timeval)); \
+        memset(&start, 0, sizeof(struct timeval)); \
+        memset(&total, 0, sizeof(struct timeval)); \
+        fflush(stdout); \
+        pthread_barrier_wait(&barrier); \
+    } while (0)
+#endif
+
+/*
+ * Reset the shared variable `var` to zero and wait at the barrier, so that
+ * no thread starts the next benchmark before every thread has reset it.
+ */
+#if !defined(OPAL_RESET_VAR)
+#define OPAL_RESET_VAR(var) \
+    do { \
+        var = 0; \
+        pthread_barrier_wait(&barrier); \
+    } while (0)
+#endif
+
+/*
+ * Thread body: time every atomic operation in turn, first on var_64 and
+ * then on var_32.
+ *
+ * The compare-exchange loops hand a private copy of the loop counter to the
+ * call.  A failed compare-exchange writes the observed value back through
+ * its second argument; passing the counter itself would change the number
+ * of calls actually made and falsify the per-call time, which is computed
+ * by dividing by ITERATIONS.
+ *
+ * arg is unused.  Always returns NULL.
+ */
+static void *thread_test(void *arg)
+{
+    struct timeval start, stop, total;
+    double timing;
+
+    (void) arg;
+
+    gettimeofday(&start, NULL);
+    for (int64_t i = 0; i < ITERATIONS; ++i) {
+        int64_t expected = i;
+        opal_atomic_compare_exchange_strong_64(&var_64, &expected, i + 1);
+    }
+    OPAL_TEST_DONE("opal_atomic_compare_exchange_strong_64", var_64);
+
+    OPAL_RESET_VAR(var_64);
+
+    gettimeofday(&start, NULL);
+    for (int64_t i = 0; i < ITERATIONS; ++i) {
+        int64_t expected = i;
+        opal_atomic_compare_exchange_strong_rel_64(&var_64, &expected, i + 1);
+    }
+    OPAL_TEST_DONE("opal_atomic_compare_exchange_strong_rel_64", var_64);
+
+    OPAL_RESET_VAR(var_64);
+
+    gettimeofday(&start, NULL);
+    for (int64_t i = 0; i < ITERATIONS; ++i) {
+        int64_t expected = i;
+        opal_atomic_compare_exchange_strong_acq_64(&var_64, &expected, i + 1);
+    }
+    OPAL_TEST_DONE("opal_atomic_compare_exchange_strong_acq_64", var_64);
+
+    OPAL_RESET_VAR(var_64);
+
+    gettimeofday(&start, NULL);
+    for (int64_t i = 0; i < ITERATIONS; ++i) {
+        opal_atomic_fetch_add_64(&var_64, 1);
+    }
+    OPAL_TEST_DONE("opal_atomic_fetch_add_64", var_64);
+
+    OPAL_RESET_VAR(var_64);
+
+    gettimeofday(&start, NULL);
+    for (int64_t i = 0; i < ITERATIONS; ++i) {
+        opal_atomic_fetch_sub_64(&var_64, 1);
+    }
+    OPAL_TEST_DONE("opal_atomic_fetch_sub_64", var_64);
+
+    OPAL_RESET_VAR(var_64);
+
+    gettimeofday(&start, NULL);
+    for (int64_t i = 0; i < ITERATIONS; ++i) {
+        opal_atomic_fetch_xor_64(&var_64, i);
+    }
+    OPAL_TEST_DONE("opal_atomic_fetch_xor_64", var_64);
+
+    OPAL_RESET_VAR(var_64);
+
+    gettimeofday(&start, NULL);
+    for (int64_t i = 0; i < ITERATIONS; ++i) {
+        opal_atomic_swap_64(&var_64, i);
+    }
+    OPAL_TEST_DONE("opal_atomic_swap_64", var_64);
+
+    OPAL_RESET_VAR(var_64);
+
+#if OPAL_HAVE_ATOMIC_LLSC_64
+    /* Store-conditional on its own; no load-linked is issued before it. */
+    gettimeofday(&start, NULL);
+    for (int64_t i = 0; i < ITERATIONS; ++i) {
+        int ret;
+        opal_atomic_sc_64(&var_64, i, ret);
+        (void) ret;
+    }
+    OPAL_TEST_DONE("opal_atomic_sc_64", var_64);
+
+    OPAL_RESET_VAR(var_64);
+
+    /* Load-linked on its own; the loaded value is discarded. */
+    gettimeofday(&start, NULL);
+    for (int64_t i = 0; i < ITERATIONS; ++i) {
+        int64_t ret;
+        opal_atomic_ll_64(&var_64, ret);
+        (void) ret;
+    }
+    OPAL_TEST_DONE("opal_atomic_ll_64", var_64);
+
+    OPAL_RESET_VAR(var_64);
+#endif
+
+    gettimeofday(&start, NULL);
+    for (int32_t i = 0; i < ITERATIONS; ++i) {
+        int32_t expected = i;
+        opal_atomic_compare_exchange_strong_32(&var_32, &expected, i + 1);
+    }
+    OPAL_TEST_DONE("opal_atomic_compare_exchange_strong_32", var_32);
+
+    OPAL_RESET_VAR(var_32);
+
+    gettimeofday(&start, NULL);
+    for (int32_t i = 0; i < ITERATIONS; ++i) {
+        int32_t expected = i;
+        opal_atomic_compare_exchange_strong_rel_32(&var_32, &expected, i + 1);
+    }
+    OPAL_TEST_DONE("opal_atomic_compare_exchange_strong_rel_32", var_32);
+
+    OPAL_RESET_VAR(var_32);
+
+    gettimeofday(&start, NULL);
+    for (int32_t i = 0; i < ITERATIONS; ++i) {
+        int32_t expected = i;
+        opal_atomic_compare_exchange_strong_acq_32(&var_32, &expected, i + 1);
+    }
+    OPAL_TEST_DONE("opal_atomic_compare_exchange_strong_acq_32", var_32);
+
+    OPAL_RESET_VAR(var_32);
+
+    gettimeofday(&start, NULL);
+    for (int32_t i = 0; i < ITERATIONS; ++i) {
+        opal_atomic_fetch_add_32(&var_32, 1);
+    }
+    OPAL_TEST_DONE("opal_atomic_fetch_add_32", var_32);
+
+    OPAL_RESET_VAR(var_32);
+
+    gettimeofday(&start, NULL);
+    for (int32_t i = 0; i < ITERATIONS; ++i) {
+        opal_atomic_fetch_sub_32(&var_32, 1);
+    }
+    OPAL_TEST_DONE("opal_atomic_fetch_sub_32", var_32);
+
+    OPAL_RESET_VAR(var_32);
+
+    gettimeofday(&start, NULL);
+    for (int32_t i = 0; i < ITERATIONS; ++i) {
+        opal_atomic_fetch_xor_32(&var_32, i);
+    }
+    OPAL_TEST_DONE("opal_atomic_fetch_xor_32", var_32);
+
+    OPAL_RESET_VAR(var_32);
+
+    gettimeofday(&start, NULL);
+    for (int32_t i = 0; i < ITERATIONS; ++i) {
+        opal_atomic_swap_32(&var_32, i);
+    }
+    OPAL_TEST_DONE("opal_atomic_swap_32", var_32);
+
+    OPAL_RESET_VAR(var_32);
+
+#if OPAL_HAVE_ATOMIC_LLSC_32
+    /* Store-conditional on its own; no load-linked is issued before it. */
+    gettimeofday(&start, NULL);
+    for (int32_t i = 0; i < ITERATIONS; ++i) {
+        int ret;
+        opal_atomic_sc_32(&var_32, i, ret);
+        (void) ret;
+    }
+    OPAL_TEST_DONE("opal_atomic_sc_32", var_32);
+
+    OPAL_RESET_VAR(var_32);
+
+    /* Load-linked on its own; the loaded value is discarded. */
+    gettimeofday(&start, NULL);
+    for (int32_t i = 0; i < ITERATIONS; ++i) {
+        int32_t ret;
+        opal_atomic_ll_32(&var_32, ret);
+        (void) ret;
+    }
+    OPAL_TEST_DONE("opal_atomic_ll_32", var_32);
+
+    OPAL_RESET_VAR(var_32);
+#endif
+
+    return NULL;
+}
+
+/*
+ * Start OPAL_TEST_THREAD_COUNT benchmark threads and wait for all of them.
+ *
+ * Returns 0 on success and 1 if the barrier or a thread could not be
+ * created, or if a thread could not be joined.
+ */
+int main(void)
+{
+    pthread_t ts[OPAL_TEST_THREAD_COUNT];
+    int status = 0;
+    int rc;
+
+    rc = pthread_barrier_init(&barrier, NULL, OPAL_TEST_THREAD_COUNT);
+    if (rc != 0) {
+        fprintf(stderr, "pthread_barrier_init failed: %s\n", strerror(rc));
+        return 1;
+    }
+
+    for (int i = 0; i < OPAL_TEST_THREAD_COUNT; i++) {
+        rc = pthread_create(&ts[i], NULL, thread_test, NULL);
+        if (rc != 0) {
+            /*
+             * The barrier expects every thread, so the ones already started
+             * could never pass it.  Returning from main ends the process
+             * and those threads with it.
+             */
+            fprintf(stderr, "pthread_create failed: %s\n", strerror(rc));
+            return 1;
+        }
+    }
+
+    for (int i = 0; i < OPAL_TEST_THREAD_COUNT; i++) {
+        rc = pthread_join(ts[i], NULL);
+        if (rc != 0) {
+            fprintf(stderr, "pthread_join failed: %s\n", strerror(rc));
+            status = 1;
+        }
+    }
+
+    pthread_barrier_destroy(&barrier);
+    return status;
+}