diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 1a4b3e9a2145c..5288c808ca6bb 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -177,6 +177,15 @@ add_header_library(
     libc.src.__support.CPP.array
 )
 
+add_header_library(
+  fixedstack
+  HDRS
+    fixedstack.h
+  DEPENDS
+    libc.src.__support.CPP.array
+    libc.src.__support.CPP.atomic
+)
+
 add_header_library(
   char_vector
   HDRS
diff --git a/libc/src/__support/fixedstack.h b/libc/src/__support/fixedstack.h
new file mode 100644
index 0000000000000..ef400bb6c5a42
--- /dev/null
+++ b/libc/src/__support/fixedstack.h
@@ -0,0 +1,143 @@
+//===-- A lock-free data structure for a fixed capacity stack ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDSTACK_H
+#define LLVM_LIBC_SRC___SUPPORT_FIXEDSTACK_H
+
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/atomic.h"
+#include "src/__support/threads/sleep.h"
+
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+
+// A lock-free fixed size stack backed by an underlying cpp::array data
+// structure. It supports push and pop operations in a thread safe manner.
+template <typename T, uint32_t CAPACITY> class alignas(16) FixedStack {
+  // The index is stored as a 20-bit value and cannot index into any more.
+  static_assert(CAPACITY < 1024 * 1024, "Invalid buffer size");
+
+  // The head of the free and used stacks. Represented as a 20-bit index combined
+  // with a 44-bit ABA tag that is updated in a single atomic operation.
+  uint64_t free;
+  uint64_t used;
+
+  // The stack is a linked list of indices into the underlying data
+  cpp::array<uint32_t, CAPACITY> next;
+  cpp::array<T, CAPACITY> data;
+
+  // Get the 20-bit index into the underlying array from the head.
+  static constexpr uint32_t get_node(uint64_t head) {
+    return static_cast<uint32_t>(head & 0xfffff);
+  }
+
+  // Increment the old ABA tag and merge it into the new index.
+  static constexpr uint64_t make_new_head(uint64_t orig, uint32_t node) {
+    return static_cast<uint64_t>(node) | (((orig >> 20ul) + 1ul) << 20ul);
+  }
+
+  // Helper macros for the atomic operations. We cannot use the standard
+  // cpp::atomic helpers because the initializer will no longer be constexpr and
+  // the NVPTX backend cannot currently support all of the atomics.
+#define atomic_load(val, mem_order) __atomic_load_n(val, (int)mem_order)
+#define atomic_cas(val, expected, desired, success_order, failure_order)       \
+  __atomic_compare_exchange_n(val, expected, desired, /*weak=*/true,           \
+                              (int)success_order, (int)failure_order)
+
+  // Attempts to pop data from the given stack by making it point to the next
+  // node. We repeatedly attempt to write to the head using compare-and-swap,
+  // expecting that it has not been changed by any other thread.
+  // Returns the index of the popped node, or CAPACITY if the stack was empty.
+  uint32_t pop_impl(uint64_t *head) {
+    uint64_t orig = atomic_load(head, cpp::MemoryOrder::RELAXED);
+
+    for (;;) {
+      if (get_node(orig) == CAPACITY)
+        return CAPACITY;
+
+      uint32_t node =
+          atomic_load(&next[get_node(orig)], cpp::MemoryOrder::RELAXED);
+      if (atomic_cas(head, &orig, make_new_head(orig, node),
+                     cpp::MemoryOrder::ACQUIRE, cpp::MemoryOrder::RELAXED))
+        break;
+      sleep_briefly();
+    }
+    return get_node(orig);
+  }
+
+  // Attempts to push data to the given stack by making it point to the new
+  // node. We repeatedly attempt to write to the head using compare-and-swap,
+  // expecting that it has not been changed by any other thread.
+  // Returns the node index found at the head when it is re-read after the
+  // push has succeeded.
+  uint32_t push_impl(uint64_t *head, uint32_t node) {
+    uint64_t orig = atomic_load(head, cpp::MemoryOrder::RELAXED);
+    for (;;) {
+      next[node] = get_node(orig);
+      if (atomic_cas(head, &orig, make_new_head(orig, node),
+                     cpp::MemoryOrder::RELEASE, cpp::MemoryOrder::RELAXED))
+        break;
+      sleep_briefly();
+    }
+    // Re-read the head with an atomic load. Other threads may already be
+    // updating it, so a plain dereference here would be a data race.
+    return get_node(atomic_load(head, cpp::MemoryOrder::RELAXED));
+  }
+
+public:
+  // Initialize the free stack to be full and the used stack to be empty. We use
+  // the capacity of the stack as a sentinel value.
+  constexpr FixedStack() : free(0), used(CAPACITY), data{} {
+    for (uint32_t i = 0; i < CAPACITY; ++i)
+      next[i] = i + 1;
+  }
+
+  // Copies 'val' into a free node and links that node onto the used stack.
+  // Returns false, storing nothing, if no free node is available.
+  bool push(const T &val) {
+    uint32_t node = pop_impl(&free);
+    if (node == CAPACITY)
+      return false;
+
+    data[node] = val;
+    push_impl(&used, node);
+    return true;
+  }
+
+  // Unlinks the node at the head of the used stack, copies its value into
+  // 'val' and recycles the node. Returns false if there is no used node.
+  bool pop(T &val) {
+    uint32_t node = pop_impl(&used);
+    if (node == CAPACITY)
+      return false;
+
+    val = data[node];
+    push_impl(&free, node);
+    return true;
+  }
+
+  // Returns true if the used stack held the sentinel index when it was read.
+  // The head is read with a single relaxed atomic load.
+  bool empty() const {
+    return get_node(atomic_load(&used, cpp::MemoryOrder::RELAXED)) == CAPACITY;
+  }
+
+  // Returns true if the free stack held the sentinel index when it was read.
+  // The head is read with a single relaxed atomic load.
+  bool full() const {
+    return get_node(atomic_load(&free, cpp::MemoryOrder::RELAXED)) == CAPACITY;
+  }
+
+#undef atomic_load
+#undef atomic_cas
+};
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC___SUPPORT_FIXEDSTACK_H
diff --git a/libc/test/integration/src/__support/CMakeLists.txt b/libc/test/integration/src/__support/CMakeLists.txt
index 7c853ff10259f..4eaa8f5026981 100644
--- a/libc/test/integration/src/__support/CMakeLists.txt
+++ b/libc/test/integration/src/__support/CMakeLists.txt
@@ -1 +1,5 @@
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
+  add_subdirectory(${LIBC_TARGET_OS})
+endif()
+
 add_subdirectory(threads)
diff --git a/libc/test/integration/src/__support/gpu/CMakeLists.txt b/libc/test/integration/src/__support/gpu/CMakeLists.txt
new file mode 100644
index 0000000000000..d83a762cacbd8
--- /dev/null
+++ b/libc/test/integration/src/__support/gpu/CMakeLists.txt
@@ -0,0 +1,19 @@
+add_custom_target(support-gpu-integration-tests)
+add_dependencies(libc-integration-tests support-gpu-integration-tests)
+
+add_integration_test(
+  support_fixed_stack_test
+  SUITE support-gpu-integration-tests
+  SRCS
+    fixed_stack_test.cpp
+  DEPENDS
+    libc.src.__support.GPU.utils
+    libc.src.__support.fixedstack
+  LOADER_ARGS
+    --blocks-x 2
+    --blocks-y 2
+    --blocks-z 2
+    --threads-x 4
+    --threads-y 4
+    --threads-z 4
+)
diff --git a/libc/test/integration/src/__support/gpu/fixed_stack_test.cpp b/libc/test/integration/src/__support/gpu/fixed_stack_test.cpp
new file mode 100644
index 0000000000000..52fad14802360
--- /dev/null
+++ b/libc/test/integration/src/__support/gpu/fixed_stack_test.cpp
@@ -0,0 +1,80 @@
+//===-- Integration test for the lock-free stack --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/GPU/utils.h"
+#include "src/__support/fixedstack.h"
+#include "test/IntegrationTest/test.h"
+
+using namespace LIBC_NAMESPACE;
+
+// Fills and then drains a function-local stack, checking that the values come
+// back in last-in first-out order.
+void single_thread() {
+  // FIXME: The NVPTX backend cannot handle atomic CAS on a local address space.
+#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+  FixedStack<int, 16> local_stack;
+
+  for (int i = 0; i < 16; ++i)
+    EXPECT_TRUE(local_stack.push(i));
+  ASSERT_TRUE(local_stack.full());
+
+  int val;
+  for (int i = 0; i < 16; ++i) {
+    EXPECT_TRUE(local_stack.pop(val));
+    EXPECT_EQ(val, 16 - 1 - i);
+  }
+  ASSERT_TRUE(local_stack.empty());
+#endif
+}
+
+// Stack used by multiple_threads() and inspected again by check_stack().
+// NOTE(review): the capacity of 2048 is assumed for 512 threads - confirm.
+static FixedStack<uint32_t, 2048> global_stack;
+// Pushes to and pops from the global stack, validating every popped value.
+void multiple_threads() {
+  // We need enough space in the stack as threads in flight can temporarily
+  // consume memory before they finish committing it back to the stack.
+  ASSERT_EQ(gpu::get_num_blocks() * gpu::get_num_threads(), 512);
+
+  uint32_t val;
+  uint32_t num_threads = static_cast<uint32_t>(gpu::get_num_threads());
+  for (int i = 0; i < 256; ++i) {
+    EXPECT_TRUE(global_stack.push(UINT32_MAX));
+    EXPECT_TRUE(global_stack.pop(val));
+    ASSERT_TRUE(val < num_threads || val == UINT32_MAX);
+  }
+
+  EXPECT_TRUE(global_stack.push(static_cast<uint32_t>(gpu::get_thread_id())));
+  EXPECT_TRUE(global_stack.push(static_cast<uint32_t>(gpu::get_thread_id())));
+  EXPECT_TRUE(global_stack.pop(val));
+  ASSERT_TRUE(val < num_threads || val == UINT32_MAX);
+
+  // Fill the rest of the stack with the default value.
+  while (!global_stack.push(UINT32_MAX))
+    ;
+}
+
+// Once all the threads have finished executing check the final state of the
+// stack. Destructors are always run with a single thread on the GPU.
+[[gnu::destructor]] void check_stack() {
+  ASSERT_FALSE(global_stack.empty());
+
+  while (!global_stack.empty()) {
+    uint32_t val;
+    ASSERT_TRUE(global_stack.pop(val));
+    ASSERT_TRUE(val < 64 || val == UINT32_MAX);
+  }
+}
+
+TEST_MAIN(int argc, char **argv, char **envp) {
+  single_thread();
+
+  multiple_threads();
+
+  return 0;
+}
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 7200ac276fe50..4810aa5a31c5a 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -117,6 +117,16 @@ add_libc_test(
     libc.src.__support.fixedvector
 )
 
+add_libc_test(
+  fixedstack_test
+  SUITE
+    libc-support-tests
+  SRCS
+    fixedstack_test.cpp
+  DEPENDS
+    libc.src.__support.fixedstack
+)
+
 add_libc_test(
   char_vector_test
   SUITE
diff --git a/libc/test/src/__support/fixedstack_test.cpp b/libc/test/src/__support/fixedstack_test.cpp
new file mode 100644
index 0000000000000..cbbffb91311a2
--- /dev/null
+++ b/libc/test/src/__support/fixedstack_test.cpp
@@ -0,0 +1,28 @@
+//===-- Unittests for FixedStack ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/fixedstack.h"
+#include "test/UnitTest/Test.h"
+
+// Fills the stack to capacity, checks that one more push is rejected, then
+// drains it and checks the values come back in reverse order.
+TEST(LlvmLibcFixedStackTest, PushAndPop) {
+  static LIBC_NAMESPACE::FixedStack<int, 20> fixed_stack;
+  ASSERT_TRUE(fixed_stack.empty());
+  for (int i = 0; i < 20; i++)
+    ASSERT_TRUE(fixed_stack.push(i));
+  ASSERT_FALSE(fixed_stack.empty());
+  ASSERT_FALSE(fixed_stack.push(123));
+  int val;
+  for (int i = 20; i > 0; --i) {
+    ASSERT_TRUE(fixed_stack.pop(val));
+    ASSERT_EQ(val, i - 1);
+  }
+  ASSERT_FALSE(fixed_stack.pop(val));
+  ASSERT_TRUE(fixed_stack.empty());
+}