-
Notifications
You must be signed in to change notification settings - Fork 14.6k
Description
Here is a small example of a lazy-start coroutine promise, with a coroutine `Bar` that calls another coroutine (`Baz`) that can be inlined, where the inlined coroutine calls yet another one, `Qux`, that cannot be inlined.
#include <coroutine>
// A lazily-started coroutine task type. A coroutine returning MyTask suspends
// at its initial suspend point; its body runs only when its handle is resumed,
// which in this example happens when another MyTask coroutine co_awaits it.
// MyTask declares no destructor, so the task object itself never destroys the
// coroutine frame; that is done in the await_transform awaiter's await_resume.
struct MyTask{
struct promise_type {
// Builds the MyTask returned to the caller, holding the handle of the
// coroutine that owns this promise.
MyTask get_return_object() { return {std::coroutine_handle<promise_type>::from_promise(*this)}; }
// Always suspend before running the body (lazy start).
std::suspend_always initial_suspend() { return {}; }
// Declared only; no definition is provided in this example.
void unhandled_exception();
void return_void() {}
// Called for `co_await task` inside a MyTask coroutine. `*this` is the promise
// of the awaiting coroutine (the caller); `task.handle.promise()` is the
// promise of the awaited coroutine (the callee).
auto await_transform(MyTask task) {
struct Awaiter {
// Never ready: co_await always suspends the caller.
bool await_ready() { return false; }
// `h` is the handle of the coroutine being suspended. Returning the callee's
// handle transfers control to the callee (symmetric transfer).
// NOTE(review): this assigns `h` to the *caller's* own resume_when_done; the
// callee's resume_when_done is never assigned in this snippet, so the callee's
// final_suspend would return a default-constructed (null) handle. It looks
// like `callee.resume_when_done = h` was intended -- confirm. Left unchanged
// because the assembly listings in this report were generated from this code.
std::coroutine_handle<promise_type> await_suspend(std::coroutine_handle<promise_type> h) {
caller.resume_when_done = h;
return std::coroutine_handle<promise_type>::from_promise(callee);
}
// Runs when the caller is resumed: destroys the callee's coroutine frame
// right away instead of leaving that to a MyTask destructor.
void await_resume() {
std::coroutine_handle<promise_type>::from_promise(callee).destroy();
}
promise_type& caller;
promise_type& callee;
};
return Awaiter{*this, task.handle.promise()};
}
// At the final suspend point, hand control to the coroutine stored in
// resume_when_done.
auto final_suspend() noexcept {
struct Awaiter {
// Never ready: always suspend at the final suspend point.
bool await_ready() noexcept { return false; }
// Ignores `h` (the finishing coroutine) and returns the stored handle, so
// that coroutine is resumed next.
std::coroutine_handle<promise_type> await_suspend(std::coroutine_handle<promise_type> h) noexcept {
return to_resume;
}
// Declared only; no definition is provided in this example.
void await_resume() noexcept;
std::coroutine_handle<promise_type> to_resume;
};
return Awaiter{resume_when_done};
}
// The coroutine to resume when we're done.
std::coroutine_handle<promise_type> resume_when_done;
};
// A handle for the coroutine that returned this task.
std::coroutine_handle<promise_type> handle;
};
// Innermost coroutine: its body is just `co_return`. Marked noinline so that
// calls to it are not inlined.
MyTask __attribute__((noinline)) Qux() { co_return; }
// Middle coroutine: awaits Qux(). It carries no noinline attribute, so the
// compiler may inline it into its caller.
MyTask Baz() { co_await Qux(); }
// Outermost coroutine: awaits Baz(). Marked noinline; the assembly listings in
// this report are for this function's resume and destroy clones.
MyTask __attribute__((noinline)) Bar() { co_await Baz(); }
The awaited task's coroutine handle is destroyed immediately upon resumption, in `await_resume`, so it should be possible to apply HALO (heap allocation elision) and elide the allocation of the coroutine frame for `Baz` in `Bar`: the frame is both created and destroyed within `Bar`. But when compiled with `-std=c++20 -O2 -fno-exceptions` (compiler explorer), clang fails to do this:
Bar() [clone .resume]: # @Bar() [clone .resume]
push r14
push rbx
push rax
mov rbx, rdi
cmp byte ptr [rdi + 48], 0
je .LBB7_1
mov rdi, qword ptr [rbx + 32]
call operator delete(void*)@PLT
mov rdi, qword ptr [rbx + 16]
mov qword ptr [rbx + 40], rdi
mov qword ptr [rbx + 24], rdi
mov qword ptr [rbx], 0
add rsp, 8
pop rbx
pop r14
jmp qword ptr [rdi] # TAILCALL
.LBB7_1:
mov edi, 56
call operator new(unsigned long)@PLT
mov r14, rax
mov qword ptr [rbx + 32], rax
lea rax, [rip + Baz() [clone .resume]]
mov qword ptr [r14], rax
lea rax, [rip + Baz() [clone .destroy]]
mov qword ptr [r14 + 8], rax
mov qword ptr [r14 + 16], 0
mov byte ptr [r14 + 48], 0
mov byte ptr [rbx + 48], 1
mov qword ptr [rbx + 16], rbx
call Qux()
mov qword ptr [r14 + 32], rax
mov byte ptr [r14 + 48], 1
mov qword ptr [r14 + 16], r14
mov rdi, rax
add rsp, 8
pop rbx
pop r14
jmp qword ptr [rax] # TAILCALL
If, on the other hand, we make the minimal change so that the callee's handle is destroyed in `~MyTask` (compiler explorer), HALO is applied and there are no allocations within `Bar.resume`.
Can clang be taught to apply HALO when the awaited coroutine's frame is destroyed in the awaiter's `await_resume` method?
If you are interested in why I don't want to destroy the handle in `~MyTask`, it's because it causes much worse code to be generated for the frame destroy function. For example, here's what `Bar.destroy` looks like in my original example, where I destroy the handle in `await_resume`:
jmp operator delete(void*)@PLT # TAILCALL
But when the handle is destroyed in `~MyTask`, there is a ton of code generated:
push rbx
mov rbx, rdi
cmp qword ptr [rdi], 0
je .LBB9_5
cmp byte ptr [rbx + 96], 0
je .LBB9_5
cmp qword ptr [rbx + 24], 0
je .LBB9_5
cmp byte ptr [rbx + 72], 0
je .LBB9_5
mov rdi, qword ptr [rbx + 56]
call qword ptr [rdi + 8]
.LBB9_5:
mov rdi, rbx
pop rbx
jmp operator delete(void*)@PLT # TAILCALL
I believe clang is forced to do this, since it must do different cleanup depending on where `Bar` is suspended when it is destroyed. But my library doesn't support destroying it anywhere but the final suspend point, so all of this code is unnecessary and we really do just want a tail call to `delete`.
Metadata
Metadata
Assignees
Labels
Type
Projects
Status