77namespace stan {
88namespace math {
99
10- // Internal macro used to modify global pointer definition to the
11- // global AD instance.
12- #ifdef STAN_THREADS
13- // Whenever STAN_THREADS is set a TLS keyword is used. For reasons
14- // explained below we use the GNU compiler extension __thread if
15- // supported by the compiler while the generic thread_local C++11
16- // keyword is used otherwise.
17- #ifdef __GNUC__
18- #define STAN_THREADS_DEF __thread
19- #else
20- #define STAN_THREADS_DEF thread_local
21- #endif
22- #else
23- // In case STAN_THREADS is not set, then no modifier is needed.
24- #define STAN_THREADS_DEF
25- #endif
26-
2710/* *
28- * This struct always provides access to the autodiff stack using
29- * the singleton pattern. Read warnings below!
30- *
31- * The singleton <code>instance_</code> is a global static pointer,
32- * which is thread local (TLS) if the STAN_THREADS preprocessor variable
33- * is defined.
11+ * Provides a thread_local singleton if needed. Read warnings below!
12+ * For performance reasons the singleton is a global static for the
13+ * case of no threading which is returned by a function. This design
14+ * should allow the compiler to apply necessary inlining to get
15+ * maximal performance. However, this design suffers from "the static
16+ * init order fiasco"[0]. Anywhere this is used, we must be
17+ * absolutely positive that it doesn't matter when the singleton will
18+ * get initialized relative to other static variables. In exchange,
19+ * we get a more performant singleton pattern for the non-threading
20+ * case. In the threading case we use the de facto standard C++11
21+ * singleton pattern relying on a function wrapping a static local
22+ * variable. This standard pattern is expected to be well supported
23+ * by the major compilers (as it is standard), but it does incur some
24+ * performance penalty. There has been some discussion on this; see
25+ * [1] and [2] and the discussions those PRs link to as well.
3426 *
35- * The use of a pointer is motivated by performance reasons for the
36- * threading case. When a TLS is used, initialization with a constant
37- * expression at compile time is required for fast access to the
38- * TLS. As the autodiff storage struct is non-POD, its initialization
39- * is a dynamic expression at compile time. These dynamic expressions
40- * are wrapped, in the TLS case, by a TLS wrapper function which slows
41- * down its access. Using a pointer instead allows initialization at
42- * compile time to <code>nullptr</code>, which is a compile time
43- * constant. In this case, the compiler avoids the use of a TLS
44- * wrapper function.
45- *
46- * For performance reasons we use the __thread keyword on compilers
47- * which support it. The __thread keyword is a GNU compiler-specific
48- * (gcc, clang, Intel) extension which requires initialization with a
49- * compile time constant expression. The C++11 keyword thread_local
50- * does allow for constant and dynamic initialization of the
51- * TLS. Thus, only the __thread keyword guarantees that constant
52- * initialization, and its implied speedup, is used.
53- *
54- * The initialization of the AD instance at run-time is handled by the
55- * lifetime of an AutodiffStackSingleton object. More specifically, the
56- * first instance of the AutodiffStackSingleton object will initialize
57- * the AD instance and take ownership (it is the only instance
58- * with the private member own_instance_ being true). Thus, whenever
59- * the first instance of the AutodiffStackSingleton object gets
60- * destructed, the AD tape will be destructed as well. Within
61- * stan-math the initialization of the AD instance for the main thread
62- * of the program is handled by instantiating the singleton once in
63- * the init_chainablestack.hpp file. Whenever STAN_THREADS is defined
64- * then all created child threads must instantiate an
65- * AutodiffStackSingleton object within the child thread before
66- * accessing the AD system in order to initialize the TLS AD tape
67- * within the child thread.
68- *
69- * The design of a globally held (optionally TLS) pointer, which is
70- * globally initialized, allows the compiler to apply necessary
71- * inlining to get maximal performance. However, the design suffers
72- * from "the static init order fiasco"[0]. Whenever the static init
73- * order fiasco occurs, the C++ client of the library may instantiate
74- * an AutodiffStackSingleton object at the adequate code position prior
75- * to any AD tape access to ensure proper initialization order. In
76- * exchange, we get a more performant singleton pattern with automatic
77- * initialization of the AD stack for the main thread. There has been
78- * some discussion on earlier designs using the Meyers singleton
79- * approach; see [1] and [2] and the discussions those PRs link to as
80- * well.
27+ * These are thread_local only if the user asks for it with
28+ * -DSTAN_THREADS. This is primarily because Apple clang compilers
29+ * before 2016 don't support thread_local, and because of the additional
30+ * performance cost. We have proposed removing support for those[3],
31+ * and at that time we should evaluate the performance of a switch to
32+ * thread_local. If there is no loss in performance, we can remove
33+ * this ifdef.
8134 *
8235 * [0] https://isocpp.org/wiki/faq/ctors#static-init-order
8336 * [1] https://github.com/stan-dev/math/pull/840
8437 * [2] https://github.com/stan-dev/math/pull/826
8538 * [3]
8639 * http://discourse.mc-stan.org/t/potentially-dropping-support-for-older-versions-of-apples-version-of-clang/3780/
87- * [4] https://github.com/stan-dev/math/pull/1135
8840 */
8941template <typename ChainableT, typename ChainableAllocT>
9042struct AutodiffStackSingleton {
9143 typedef AutodiffStackSingleton<ChainableT, ChainableAllocT>
9244 AutodiffStackSingleton_t;
9345
94- AutodiffStackSingleton () : own_instance_(init()) {}
95- ~AutodiffStackSingleton () {
96- if (own_instance_) {
97- delete instance_;
98- instance_ = nullptr ;
99- }
100- }
101-
10246 struct AutodiffStackStorage {
10347 AutodiffStackStorage &operator =(const AutodiffStackStorage &) = delete ;
10448
@@ -113,32 +57,30 @@ struct AutodiffStackSingleton {
11357 std::vector<size_t > nested_var_alloc_stack_starts_;
11458 };
11559
60+ AutodiffStackSingleton () = delete ;
11661 explicit AutodiffStackSingleton (AutodiffStackSingleton_t const &) = delete;
11762 AutodiffStackSingleton &operator =(const AutodiffStackSingleton_t &) = delete ;
11863
119- static inline constexpr AutodiffStackStorage &instance () {
120- return *instance_;
64+ static inline AutodiffStackStorage &instance () {
65+ #ifdef STAN_THREADS
66+ thread_local static AutodiffStackStorage instance_;
67+ #endif
68+ return instance_;
12169 }
12270
123- private:
124- static bool init () {
125- if (!instance_) {
126- instance_ = new AutodiffStackStorage ();
127- return true ;
128- }
129- return false ;
130- }
71+ #ifndef STAN_THREADS
13172
132- static STAN_THREADS_DEF AutodiffStackStorage *instance_;
133- const bool own_instance_;
73+ private:
74+ static AutodiffStackStorage instance_;
75+ #endif
13476};
13577
78+ #ifndef STAN_THREADS
13679template <typename ChainableT, typename ChainableAllocT>
137- STAN_THREADS_DEF
138- typename AutodiffStackSingleton<ChainableT,
139- ChainableAllocT>::AutodiffStackStorage
140- *AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_
141- = nullptr ;
80+ typename AutodiffStackSingleton<ChainableT,
81+ ChainableAllocT>::AutodiffStackStorage
82+ AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_;
83+ #endif
14284
14385} // namespace math
14486} // namespace stan
0 commit comments