[libc++] fix `counting_semaphore` lost wakeups #79265

huixie90 · 2024-01-24T08:42:47Z

Fixes #77659
Fixes #46357

Picked up from https://reviews.llvm.org/D114119

llvmbot · 2024-01-24T08:43:19Z

@llvm/pr-subscribers-libcxx

Author: Hui (huixie90)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/79265.diff

3 Files Affected:

(modified) libcxx/include/__atomic/atomic_sync.h (+12-4)
(modified) libcxx/include/semaphore (+15-6)
(added) libcxx/test/std/thread/thread.semaphore/lost_wakeup.pass.cpp (+65)

diff --git a/libcxx/include/__atomic/atomic_sync.h b/libcxx/include/__atomic/atomic_sync.h
index 3d20d6a8ce2514..cd500596a73a9c 100644
--- a/libcxx/include/__atomic/atomic_sync.h
+++ b/libcxx/include/__atomic/atomic_sync.h
@@ -46,12 +46,12 @@ __libcpp_atomic_wait(__cxx_atomic_contention_t const volatile*, __cxx_contention
 template <class _Atp, class _Fn>
 struct __libcpp_atomic_wait_backoff_impl {
   _Atp* __a;
-  _Fn __test_fn;
+  _Fn __test_with_old;
   _LIBCPP_AVAILABILITY_SYNC
   _LIBCPP_HIDE_FROM_ABI bool operator()(chrono::nanoseconds __elapsed) const {
     if (__elapsed > chrono::microseconds(64)) {
-      auto const __monitor = std::__libcpp_atomic_monitor(__a);
-      if (__test_fn())
+      auto __monitor = std::__libcpp_atomic_monitor(__a);
+      if (__test_with_old(__monitor))
         return true;
       std::__libcpp_atomic_wait(__a, __monitor);
     } else if (__elapsed > chrono::microseconds(4))
@@ -62,9 +62,17 @@ struct __libcpp_atomic_wait_backoff_impl {
   }
 };
 
+template <class _Atp, class _Fn, class _Fn2>
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI bool
+__cxx_atomic_wait(_Atp* __a, _Fn&& __test_fn, _Fn2&& __test_with_old) {
+  __libcpp_atomic_wait_backoff_impl<_Atp, __decay_t<_Fn2> > __backoff_fn = {__a, __test_with_old};
+  return std::__libcpp_thread_poll_with_backoff(__test_fn, __backoff_fn);
+}
+
 template <class _Atp, class _Fn>
 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI bool __cxx_atomic_wait(_Atp* __a, _Fn&& __test_fn) {
-  __libcpp_atomic_wait_backoff_impl<_Atp, __decay_t<_Fn> > __backoff_fn = {__a, __test_fn};
+  auto __test_with_old = [&](auto&) { return __test_fn(); };
+  __libcpp_atomic_wait_backoff_impl<_Atp, decltype(__test_with_old)> __backoff_fn{__a, __test_with_old};
   return std::__libcpp_thread_poll_with_backoff(__test_fn, __backoff_fn);
 }
 
diff --git a/libcxx/include/semaphore b/libcxx/include/semaphore
index ac3d2d7fe02e88..9c8ca5a6c7fe5a 100644
--- a/libcxx/include/semaphore
+++ b/libcxx/include/semaphore
@@ -54,6 +54,7 @@ using binary_semaphore = counting_semaphore<1>;
 #include <__assert> // all public C++ headers provide the assertion handler
 #include <__atomic/atomic_base.h>
 #include <__atomic/atomic_sync.h>
+#include <__atomic/contention_t.h>
 #include <__atomic/memory_order.h>
 #include <__availability>
 #include <__chrono/time_point.h>
@@ -95,19 +96,22 @@ public:
     _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
         __update <= _LIBCPP_SEMAPHORE_MAX - __old, "update is greater than the expected value");
 
-    if (__old > 0) {
-      // Nothing to do
-    } else if (__update > 1)
+    if (__old == 0) {
       __a_.notify_all();
-    else
-      __a_.notify_one();
+    }
   }
   _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void acquire() {
     auto const __test_fn = [this]() -> bool {
       auto __old = __a_.load(memory_order_relaxed);
       return (__old != 0) && __a_.compare_exchange_strong(__old, __old - 1, memory_order_acquire, memory_order_relaxed);
     };
-    __cxx_atomic_wait(&__a_.__a_, __test_fn);
+    auto const __test_with_old = [this](__cxx_contention_t& __monitor) -> bool {
+      ptrdiff_t __old = __monitor;
+      bool __r        = __try_acquire_impl(__old);
+      __monitor       = static_cast<__cxx_contention_t>(__old);
+      return __r;
+    };
+    __cxx_atomic_wait(&__a_.__a_, __test_fn, __test_with_old);
   }
   template <class _Rep, class _Period>
   _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI bool
@@ -119,6 +123,11 @@ public:
   }
   _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI bool try_acquire() {
     auto __old = __a_.load(memory_order_acquire);
+    return __try_acquire_impl(__old);
+  }
+
+private:
+  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY bool __try_acquire_impl(ptrdiff_t& __old) {
     while (true) {
       if (__old == 0)
         return false;
diff --git a/libcxx/test/std/thread/thread.semaphore/lost_wakeup.pass.cpp b/libcxx/test/std/thread/thread.semaphore/lost_wakeup.pass.cpp
new file mode 100644
index 00000000000000..aa21e93221b311
--- /dev/null
+++ b/libcxx/test/std/thread/thread.semaphore/lost_wakeup.pass.cpp
@@ -0,0 +1,65 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: libcpp-has-no-threads
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// This test requires the dylib support introduced in D68480, which shipped in macOS 11.0.
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{9|10|11|12|13|14|15}}
+
+// This is a regression test for https://llvm.org/PR47013.
+
+// <semaphore>
+
+#include <barrier>
+#include <semaphore>
+#include <thread>
+#include <vector>
+
+#include "make_test_thread.h"
+
+static std::counting_semaphore<> s(0);
+static std::barrier<> b(8 + 1);
+
+void acquire() {
+  for (int i = 0; i < 10'000; ++i) {
+    s.acquire();
+    b.arrive_and_wait();
+  }
+}
+
+void release() {
+  for (int i = 0; i < 10'000; ++i) {
+    s.release(1);
+    s.release(1);
+    s.release(1);
+    s.release(1);
+
+    s.release(1);
+    s.release(1);
+    s.release(1);
+    s.release(1);
+
+    b.arrive_and_wait();
+  }
+}
+
+int main(int, char**) {
+  for (int run = 0; run < 20; ++run) {
+    std::vector<std::thread> threads;
+    for (int i = 0; i < 8; ++i)
+      threads.push_back(support::make_test_thread(acquire));
+
+    threads.push_back(support::make_test_thread(release));
+
+    for (auto& thread : threads)
+      thread.join();
+  }
+
+  return 0;
+}

ldionne

CC @jiixyj since you did an amazing analysis of this problem a while back, you might be interested.

This seems reasonable to me. I just went over it with @huixie90 and it looks like this should solve two problems described in https://reviews.llvm.org/D114119, but not the last problem described in https://reviews.llvm.org/D114119#3193088 specifically. It's still an improvement over the status quo.

I find the code for __atomic_wait to be really confusing and I wonder whether the __libcpp_thread_poll_with_backoff abstraction actually helps or simply makes things harder to understand. I'd welcome if someone has an appetite for refactoring this part of the code.

libcxx/include/semaphore

libcxx/include/__atomic/atomic_sync.h

jiixyj · 2024-01-31T16:43:45Z

libcxx/include/semaphore

  }
  _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void acquire() {
    auto const __test_fn = [this]() -> bool {
      auto __old = __a_.load(memory_order_relaxed);
      return (__old != 0) && __a_.compare_exchange_strong(__old, __old - 1, memory_order_acquire, memory_order_relaxed);
    };
-    __cxx_atomic_wait(&__a_.__a_, __test_fn);
+    auto const __test_with_old = [this](__cxx_contention_t& __monitor) -> bool {


It's a bit unfortunate that with this design, you need two functions (__test_fn and __test_with_old). This is unneeded duplication and could lead to confusion about which function is used. I think it would be better to always give to __test_fn the current value of the atomic. __cxx_atomic_wait then would take another memory_order parameter which would be used when the atomic is read inside the __cxx_atomic_wait machinery. It would look a bit like this:

- auto const __test_fn = [this]() -> bool { - auto __old = __a.load(memory_order_relaxed); - return (__old != 0) && __a.compare_exchange_strong(__old, __old - 1, memory_order_acquire, memory_order_relaxed); + auto const __test_fn = [this](ptrdiff_t& __old) -> bool { + while (true) { + if (__old == 0) + return false; + if (__a.compare_exchange_weak(__old, __old - 1, memory_order_acquire, memory_order_relaxed)) + return true; + } }; - __cxx_atomic_wait(&__a.__a_, __test_fn); + __cxx_atomic_wait_fn(&__a.__a_, __test_fn, memory_order_relaxed);

...so __test_fn always takes the ptrdiff_t (current atomic value) as in-out parameter, and the memory_order_relaxed parameter previously used to read __old is given to __cxx_atomic_wait. The advantage is that if the code is refactored like this, it would immediately become apparent that latch suffers from the same race, which could be fixed like this:

inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY void wait() const { - auto const __test_fn = [=]() -> bool { - return try_wait(); + auto const __test_fn = [=](ptrdiff_t& __val) -> bool { + return __val == 0; }; - __cxx_atomic_wait(&__a.__a_, __test_fn); + __cxx_atomic_wait_fn(&__a.__a_, __test_fn, memory_order_acquire); }

I played around a bit with this approach back in the day if you are interested: jiixyj@bc11102

it would immediately become apparent that latch suffers from the same race

I don't think latch has the same problem as the none of the waiters are going to change anything. In fact, I don't understand why latch needs to use this complicated internal functions at all. can't it just do

__a_.wait(0, memory_order_acquire);

I played around a bit with this approach back in the day if you are interested

Thanks for showing this. To be honest, I am confused with these internal functions and I am trying to avoid making it even more complex. I will have more thought and see if it could be simplified a bit

I don't think latch has the same problem as the none of the waiters are going to change anything. In fact, I don't understand why latch needs to use this complicated internal functions at all. can't it just do

__a_.wait(0, memory_order_acquire);

I think it's written in terms of try_wait() and __cxx_atomic_wait to express the separation of concerns -- the latch only has to implement the non-blocking logic in try_wait(), and __cxx_atomic_wait "generically" turns the try_wait() into a blocking algorithm. To do this, __cxx_atomic_wait implements the "event count" pattern:

https://github.com/facebook/folly/blob/dd5e8b4fcef37193cd9676e1122d1621a1d90c49/folly/experimental/EventCount.h

https://cbloomrants.blogspot.com/2011/07/07-08-11-who-ordered-event-count.html

What complicates the internal __cxx_atomic_wait eventcount logic a lot is that sometimes the eventcount futex is from the "parking lot" (i.e. __libcpp_contention_table) and other times it is the atomic variable itself if it's directly usable as a platform futex (leading to those ABA style issues).

https://github.com/facebook/folly/blob/dd5e8b4fcef37193cd9676e1122d1621a1d90c49/folly/experimental/EventCount.h

but I still think

__a_.wait(0, memory_order_acquire);

is simpler and easier to understand.

w.r.t to the latch, do you agree that in the current form, there is no issues like the one in semaphore? All the modification of a latch is going one direction so there should not be ABA problem IIUC

w.r.t to the latch, do you agree that in the current form, there is no issues like the one in semaphore? All the modification of a latch is going one direction so there should not be ABA problem IIUC

Ah yes, you are right! The latch is a downward counter, so the __test_fn could only "err" in the direction of not-blocking. So if the monitor read "1" for example, the __test_fn could only read either "1" as well (and then the algorithm would proceed to block), or read "0" and exit the loop.

I still think it's worth to refactor it so that the __test_fn always takes the current value of the atomic as an in-out-argument, as this would just make this class of ABA issue go away completely. More synchronization primitives might be implemented in terms of __cxx_atomic_wait in the future.

I played around a bit with this approach back in the day if you are interested: jiixyj@bc11102

Just FYI, I'll try and rebase this on current main later today. That way we can better compare with this current PR.

I think we are coming close to an agreement now. yes we both agree to work on the higher level abstraction instead of trying fighting the backoff functions. yes having another function that takes a predicate can be useful too. maybe call it wait_if_not (like find vs find_if).

For semaphore case, I don't think we have to reuse try_acquire in acquire as I think they are a bit different.

We don't need a loop in try_acquire. can we just change it to oneoff thing?

we need compare_exchange_strong in try_acquire and compare_exchange_weak in acquire

something like

bool try_acquire() { auto __old = __a_.load(memory_order_acquire); return __old != 0 && __a_.compare_exchange_strong(__old, __old - 1, memory_order_acquire, memory_order_relaxed); } void acquire() { auto __old = __a_.load(std::memory_order_relaxed); while (true) { if (__old == 0) { __a_.wait(__old, std::memory_order_relaxed); __old = __a_.load(std::memory_order_relaxed); } else if (__a_.compare_exchange_weak(__old, __old - 1, std::memory_order_acquire, std::memory_order_relaxed)) { break; } }

here is just a small refactoring of semaphore latch barrier to just use existing atomic's API
huixie90@8a4b98a

Going back to the original bug. I don't think the refactoring of all this solved anything. The original lost wake up bug was not in semaphore, or latch etc... It in the atomic::wait's backoff function:

auto const __monitor = std::__libcpp_atomic_monitor(__a); if (__test_fn()) return true; std::__libcpp_atomic_wait(__a, __monitor);

when this function is called, it is possible that the atomic variable has already been changed and __monitor is the new value. so we are waiting for a different value now. (and also there is no happens-before so __test_fn could be evaluated before taking the monitor). so basically we need to fix this function.
(also we need to careful about __libcpp_atomic_monitor returns different thing for atomic<int64t> and everything else

For semaphore case, I don't think we have to reuse try_acquire in acquire as I think they are a bit different.

1. We don't need a loop in `try_acquire`. can we just change it to oneoff thing? 2. we need `compare_exchange_strong` in `try_acquire` and `compare_exchange_weak` in `acquire`

Yes, you don't need a loop in try_acquire(). But then, you don't need compare_exchange_strong either, as weak will do. Imagine this sequence where a consumer calls this version of try_acquire() (which doesn't use a loop and uses compare_exchange_strong), and there is a concurrent producer:

producer consumer sem ------------------------------------- 42 ------------------------------------- old=42 42 ------------------------------------- release(1) 43 ------------------------------------- tries strong 43 cas: fails! ------------------------------------- returns false

...so you might then as well have used compare_exchange_weak as this is allowed to spuriously fail.

edit: Scratch the next part -- I'm not 100% sure, but it could well be that the condition for the cxx_atomic_wait has to be "strong". So if one has a "weak" try_acquire(), one has to implement the acquire() seperately, just as you wrote. I have to think about this some more :)

~~So you can implement acquire() even in terms of this "weaker" try_acquire() -- you just have to make sure that any waiting happens based on the value "0" of the semaphore like this:~~

struct semaphore { std::atomic<std::ptrdiff_t> a_{}; bool try_acquire_impl(std::ptrdiff_t &value) { // if `try_acquire_impl` returns `false`, `value` must be 0! return (value != 0 && a_.compare_exchange_weak(value, value - 1, // std::memory_order_acquire, std::memory_order_relaxed)) || (value = 0, false); } static constexpr auto acquire_initial_load_mo = std::memory_order_relaxed; bool try_acquire() { auto old = a_.load(acquire_initial_load_mo); return try_acquire_impl(old); } void acquire() { using namespace std::placeholders; cxx_atomic_wait(&a_, std::bind(&semaphore::try_acquire_impl, this, _1), acquire_initial_load_mo); } };

when this function is called, it is possible that the atomic variable has already been changed and __monitor is the new value. so we are waiting for a different value now. (and also there is no happens-before so __test_fn could be evaluated before taking the monitor). so basically we need to fix this function.

Agreed! But I don't think the __test_fn can be reordered before taking the monitor, as taking the monitor is a "acquire" operation?

edit: Scratch the next part -- I'm not 100% sure, but it could well be that the condition for the cxx_atomic_wait has to be "strong". So if one has a "weak" try_acquire(), one has to implement the acquire() seperately, just as you wrote. I have to think about this some more :)

I think I got it! You basically can implement try_acquire() in three different ways, and the way you would implement acquire() in terms of that differs significantly:

"strong" try_acquire(): this only ever returns false if it has actually observed the semaphore value to be "0". You can implement acquire() in terms of that if you notify the eventcount if old == 0 in release().

"weak" try_acquire() with strong CAS (return value != 0 && cas_strong(...);): You could implement acquire() in terms of that, but then you would have to notify in any case, not only if old == 0! Example execution:

producer consumer sem evc (eventcount) ------------------------------------------- 42 0 ------------------------------------------- monitor=0 42 0 ------------------------------------------- old=42 42 0 ------------------------------------------- release(1) 43 0 ------------------------------------------- strong CAS(42->41) 43 0 fails ------------------------------------------- evc.notify() 43 1 ^ this is important! ------------------------------------------- evc.wait() 43 1 ends as monitor doesn't match (0!=1) ------------------------------------------- loop restarts and strong CAS 42 1 will succeed this time

"weak" try_acquire(), with weak CAS (return value != 0 && cas_weak(...);): You cannot implement acquire() in terms of that, because the weak CAS could just randomly decide to spuriously fail, and any notifications would get lost.

Sorry for the confused rambling, but I hope this kinda makes sense...

ldionne

I just went through the whole thread. I think this is an amazing discussion, and one that greatly needs to happen since our synchronization library is way too complicated right now.

Personally, I really like the idea that we're able to implement e.g. latch and barrier using std::atomic public APIs exclusively, but I'm also not blindly attached to it.

From a tactical perspective, I would suggest moving forward with this patch even though I know both of you are not satisfied with the way the code looks. At least we are fixing two real bugs and making the code "better" in terms of correctness.

I would then greatly encourage refactorings both in the implementation of barrier, latch & al, and also in the implementation of __cxx_atomic_wait, which is really brittle right now. But I would suggest tackling those as separate patches that would aim not to change the functionality in major ways.

I am fine with having the code in an intermediate state that we're not happy refactoring-wise, since these issues are complex enough that it's worth compartmenting how we tackle them (functional change + follow-up refactor).

I'm approving this patch w/ the __old == 0 optimization and I encourage the discussion about refactoring to keep going either here or on a follow-up PR/issue.

libcxx/test/std/thread/thread.semaphore/lost_wakeup.pass.cpp

jiixyj · 2024-02-04T13:48:18Z

I would then greatly encourage refactorings both in the implementation of barrier, latch & al, and also in the implementation of __cxx_atomic_wait, which is really brittle right now. But I would suggest tackling those as separate patches that would aim not to change the functionality in major ways.

I am fine with having the code in an intermediate state that we're not happy refactoring-wise, since these issues are complex enough that it's worth compartmenting how we tackle them (functional change + follow-up refactor).

Sounds good! For __cxx_atomic_wait, I just opened a follow up PR: #80596

Even though the __cxx_atomic_wait function might lose its importance after the refactorings to semaphore and latch suggested by @huixie90 , I think it's still valueable to have its semantics made clearer and its implementation made more correct.

huixie90 · 2024-02-05T13:59:40Z

CI is green and I am merging this PR now. We will continue our discussion in the follow up PRs

Fixes llvm#77659 Fixes llvm#46357 Picked up from https://reviews.llvm.org/D114119

…#80596) This is a follow-up PR to <#79265>. It aims to be a gentle refactoring of the `__cxx_atomic_wait` function that takes a predicate. The key idea here is that this function's signature is changed to look like this (`std::function` used just for clarity): ```c++ __cxx_atomic_wait_fn(Atp*, std::function<bool(Tp &)> poll, memory_order __order); ``` ...where `Tp` is the corresponding `value_type` to the atomic variable type `Atp`. The function's semantics are similar to `atomic`s `.wait()`, but instead of having a hardcoded predicate (is the loaded value unequal to `old`?) the predicate is specified explicitly. The `poll` function may change its argument, and it is very important that if it returns `false`, it leaves its current understanding of the atomic's value in the argument. Internally, `__cxx_atomic_wait_fn` dispatches to two waiting mechanisms, depending on the type of the atomic variable: 1. If the atomic variable can be waited on directly (for example, Linux's futex mechanism only supports waiting on 32 bit long variables), the value of the atomic variable (which `poll` made its decision on) is then given to the underlying system wait function (e.g. futex). 2. If the atomic variable can not be waited on directly, there is a global pool of atomics that are used for this task. The ["eventcount" pattern](<https://gist.github.com/mratsim/04a29bdd98d6295acda4d0677c4d0041>) is employed to make this possible. The eventcount pattern needs a "monitor" variable which is read before the condition is checked another time. libcxx has the `__libcpp_atomic_monitor` function for this. However, this function only has to be called in case "2", i.e. when the eventcount is actually used. In case "1", the futex is used directly, so the monitor must be the value of the atomic variable that the `poll` function made its decision on to continue blocking. Previously, `__libcpp_atomic_monitor` was _also_ used in case "1". This was the source of the ABA style bug that PR#79265 fixed. However, the solution in PR#79265 has some disadvantages: - It exposes internals such as `cxx_contention_t` or the fact that `__libcpp_thread_poll_with_backoff` needs two functions to higher level constructs such as `semaphore`. - It doesn't prevent consumers calling `__cxx_atomic_wait` in an error prone way, i.e. by providing to it a predicate that doesn't take an argument. This makes ABA style issues more likely to appear. Now, `__cxx_atomic_wait_fn` takes just _one_ function, which is then transformed into the `poll` and `backoff` callables needed by `__libcpp_thread_poll_with_backoff`. Aside from the `__cxx_atomic_wait` changes, the only other change is the weakening of the initial atomic load of `semaphore`'s `try_acquire` into `memory_order_relaxed` and the CAS inside the loop is changed from `strong` to `weak`. Both weakenings should be fine, since the CAS is called in a loop, and the "acquire" semantics of `try_acquire` come from the CAS, not from the initial load.

… (#80596) This is a follow-up PR to <llvm/llvm-project#79265>. It aims to be a gentle refactoring of the `__cxx_atomic_wait` function that takes a predicate. The key idea here is that this function's signature is changed to look like this (`std::function` used just for clarity): ```c++ __cxx_atomic_wait_fn(Atp*, std::function<bool(Tp &)> poll, memory_order __order); ``` ...where `Tp` is the corresponding `value_type` to the atomic variable type `Atp`. The function's semantics are similar to `atomic`s `.wait()`, but instead of having a hardcoded predicate (is the loaded value unequal to `old`?) the predicate is specified explicitly. The `poll` function may change its argument, and it is very important that if it returns `false`, it leaves its current understanding of the atomic's value in the argument. Internally, `__cxx_atomic_wait_fn` dispatches to two waiting mechanisms, depending on the type of the atomic variable: 1. If the atomic variable can be waited on directly (for example, Linux's futex mechanism only supports waiting on 32 bit long variables), the value of the atomic variable (which `poll` made its decision on) is then given to the underlying system wait function (e.g. futex). 2. If the atomic variable can not be waited on directly, there is a global pool of atomics that are used for this task. The ["eventcount" pattern](<https://gist.github.com/mratsim/04a29bdd98d6295acda4d0677c4d0041>) is employed to make this possible. The eventcount pattern needs a "monitor" variable which is read before the condition is checked another time. libcxx has the `__libcpp_atomic_monitor` function for this. However, this function only has to be called in case "2", i.e. when the eventcount is actually used. In case "1", the futex is used directly, so the monitor must be the value of the atomic variable that the `poll` function made its decision on to continue blocking. Previously, `__libcpp_atomic_monitor` was _also_ used in case "1". This was the source of the ABA style bug that PR#79265 fixed. However, the solution in PR#79265 has some disadvantages: - It exposes internals such as `cxx_contention_t` or the fact that `__libcpp_thread_poll_with_backoff` needs two functions to higher level constructs such as `semaphore`. - It doesn't prevent consumers calling `__cxx_atomic_wait` in an error prone way, i.e. by providing to it a predicate that doesn't take an argument. This makes ABA style issues more likely to appear. Now, `__cxx_atomic_wait_fn` takes just _one_ function, which is then transformed into the `poll` and `backoff` callables needed by `__libcpp_thread_poll_with_backoff`. Aside from the `__cxx_atomic_wait` changes, the only other change is the weakening of the initial atomic load of `semaphore`'s `try_acquire` into `memory_order_relaxed` and the CAS inside the loop is changed from `strong` to `weak`. Both weakenings should be fine, since the CAS is called in a loop, and the "acquire" semantics of `try_acquire` come from the CAS, not from the initial load. NOKEYCHECK=True GitOrigin-RevId: 95ebf2be0e6600465a4d0f4e7d81c7ded0559fba

huixie90 requested a review from a team as a code owner January 24, 2024 08:42

llvmbot added the libc++ libc++ C++ Standard Library. Not GNU libstdc++. Not libc++abi. label Jan 24, 2024

ldionne self-assigned this Jan 24, 2024

ldionne approved these changes Jan 26, 2024

View reviewed changes

libcxx/include/semaphore Show resolved Hide resolved

libcxx/include/__atomic/atomic_sync.h Outdated Show resolved Hide resolved

huixie90 added 2 commits January 31, 2024 13:25

[libc++] fix counting_semaphore lost wakeups

d03e134

address feedback

7f731e8

huixie90 force-pushed the fix_semaphore branch from a2306a7 to 7f731e8 Compare January 31, 2024 16:17

jiixyj reviewed Jan 31, 2024

View reviewed changes

rename

280b900

ldionne approved these changes Feb 2, 2024

View reviewed changes

ldionne reviewed Feb 2, 2024

View reviewed changes

libcxx/test/std/thread/thread.semaphore/lost_wakeup.pass.cpp Outdated Show resolved Hide resolved

CI

3dbe232

jiixyj mentioned this pull request Feb 4, 2024

[libc++] Refactor the predicate taking variant of __cxx_atomic_wait #80596

Merged

huixie90 merged commit 8256588 into llvm:main Feb 5, 2024

agozillon pushed a commit to agozillon/llvm-project that referenced this pull request Feb 5, 2024

[libc++] fix counting_semaphore lost wakeups (llvm#79265)

19e7b3b

Fixes llvm#77659 Fixes llvm#46357 Picked up from https://reviews.llvm.org/D114119

[libc++] fix counting_semaphore lost wakeups #79265

[libc++] fix counting_semaphore lost wakeups #79265

Uh oh!

Conversation

huixie90 commented Jan 24, 2024 • edited by ldionne Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Jan 24, 2024

Uh oh!

ldionne left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

jiixyj Jan 31, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

huixie90 Jan 31, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

jiixyj Jan 31, 2024

Choose a reason for hiding this comment

Uh oh!

huixie90 Jan 31, 2024

Choose a reason for hiding this comment

Uh oh!

jiixyj Feb 1, 2024

Choose a reason for hiding this comment

Uh oh!

jiixyj Feb 2, 2024

Choose a reason for hiding this comment

Uh oh!

huixie90 Feb 2, 2024

Choose a reason for hiding this comment

Uh oh!

jiixyj Feb 3, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

jiixyj Feb 3, 2024

Choose a reason for hiding this comment

Uh oh!

jiixyj Feb 3, 2024

Choose a reason for hiding this comment

Uh oh!

ldionne left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

jiixyj commented Feb 4, 2024

Uh oh!

huixie90 commented Feb 5, 2024

Uh oh!

Uh oh!

[libc++] fix `counting_semaphore` lost wakeups #79265

[libc++] fix `counting_semaphore` lost wakeups #79265

huixie90 commented Jan 24, 2024 •

edited by ldionne

Loading

jiixyj Jan 31, 2024 •

edited

Loading

huixie90 Jan 31, 2024 •

edited

Loading

jiixyj Feb 3, 2024 •

edited

Loading