
Commit d08f436

Refactor graceful manager, fix misused WaitGroup (#29738)
Follow #29629
1 parent: 7a6260f

File tree: 4 files changed, +55 −51 lines changed

modules/graceful/manager.go

Lines changed: 4 additions & 1 deletion

@@ -233,7 +233,10 @@ func (g *Manager) setStateTransition(old, new state) bool {
 // At the moment the total number of servers (numberOfServersToCreate) are pre-defined as a const before global init,
 // so this function MUST be called if a server is not used.
 func (g *Manager) InformCleanup() {
-	g.createServerWaitGroup.Done()
+	g.createServerCond.L.Lock()
+	defer g.createServerCond.L.Unlock()
+	g.createdServer++
+	g.createServerCond.Signal()
 }

 // Done allows the manager to be viewed as a context.Context, it returns a channel that is closed when the server is finished terminating
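The change above replaces the misused createServerWaitGroup with a sync.Cond guarding a plain counter: every expected server, whether started or deliberately skipped, increments createdServer and signals the condition, and waiters loop until the counter reaches numberOfServersToCreate. A minimal, self-contained sketch of that pattern follows; the field and constant names mirror the diff, but the surrounding scaffolding is illustrative and not Gitea code.

package main

import (
	"fmt"
	"sync"
)

const numberOfServersToCreate = 3 // pre-defined before global init, as in the real manager

type Manager struct {
	createServerCond sync.Cond
	createdServer    int
}

// InformCleanup marks one expected server as handled, whether it was started or skipped.
func (g *Manager) InformCleanup() {
	g.createServerCond.L.Lock()
	defer g.createServerCond.L.Unlock()
	g.createdServer++
	g.createServerCond.Signal()
}

// waitForAllServers blocks until every expected server has reported in.
func (g *Manager) waitForAllServers() {
	g.createServerCond.L.Lock()
	defer g.createServerCond.L.Unlock()
	for g.createdServer < numberOfServersToCreate {
		g.createServerCond.Wait()
	}
}

func main() {
	m := &Manager{}
	m.createServerCond.L = &sync.Mutex{} // the cond needs a locker, as newGracefulManager now does
	for i := 0; i < numberOfServersToCreate; i++ {
		go m.InformCleanup()
	}
	m.waitForAllServers()
	fmt.Println("servers created:", m.createdServer)
}

Unlike the old WaitGroup, reusing the counter or decrementing it past zero cannot panic here, which is exactly what the removed recover() blocks were guarding against.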

modules/graceful/manager_common.go

Lines changed: 3 additions & 2 deletions

@@ -42,8 +42,9 @@ type Manager struct {
 	terminateCtxCancel context.CancelFunc
 	managerCtxCancel   context.CancelFunc
 	runningServerWaitGroup sync.WaitGroup
-	createServerWaitGroup  sync.WaitGroup
 	terminateWaitGroup     sync.WaitGroup
+	createServerCond       sync.Cond
+	createdServer          int
 	shutdownRequested chan struct{}

 	toRunAtShutdown []func()
@@ -52,7 +53,7 @@ type Manager struct {

 func newGracefulManager(ctx context.Context) *Manager {
 	manager := &Manager{ctx: ctx, shutdownRequested: make(chan struct{})}
-	manager.createServerWaitGroup.Add(numberOfServersToCreate)
+	manager.createServerCond.L = &sync.Mutex{}
 	manager.prepare(ctx)
 	manager.start()
 	return manager
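Because createServerCond is kept as a value field rather than a pointer, newGracefulManager only has to hand it a locker before first use. For reference, this is equivalent to building the condition variable with the standard constructor; a small illustration (the variable names here are hypothetical):

package main

import "sync"

func main() {
	// What newGracefulManager does: keep sync.Cond as a value and set its locker.
	var c1 sync.Cond
	c1.L = &sync.Mutex{}

	// The constructor form, which returns *sync.Cond instead.
	c2 := sync.NewCond(&sync.Mutex{})

	_, _ = c1.L, c2.L // both are now usable with Wait/Signal/Broadcast
}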

modules/graceful/manager_unix.go

Lines changed: 21 additions & 23 deletions

@@ -57,37 +57,35 @@ func (g *Manager) start() {
 	// Handle clean up of unused provided listeners and delayed start-up
 	startupDone := make(chan struct{})
 	go func() {
-		defer close(startupDone)
-		// Wait till we're done getting all the listeners and then close the unused ones
-		func() {
-			// FIXME: there is a fundamental design problem of the "manager" and the "wait group".
-			// If nothing has started, the "Wait" just panics: sync: WaitGroup is reused before previous Wait has returned
-			// There is no clear solution besides a complete rewriting of the "manager"
-			defer func() {
-				_ = recover()
-			}()
-			g.createServerWaitGroup.Wait()
+		defer func() {
+			close(startupDone)
+			// Close the unused listeners and ignore the error here there's not much we can do with it, they're logged in the CloseProvidedListeners function
+			_ = CloseProvidedListeners()
 		}()
-		// Ignore the error here there's not much we can do with it, they're logged in the CloseProvidedListeners function
-		_ = CloseProvidedListeners()
-		g.notify(readyMsg)
+		// Wait for all servers to be created
+		g.createServerCond.L.Lock()
+		for {
+			if g.createdServer >= numberOfServersToCreate {
+				g.createServerCond.L.Unlock()
+				g.notify(readyMsg)
+				return
+			}
+			select {
+			case <-g.IsShutdown():
+				g.createServerCond.L.Unlock()
+				return
+			default:
+			}
+			g.createServerCond.Wait()
+		}
 	}()
 	if setting.StartupTimeout > 0 {
 		go func() {
 			select {
 			case <-startupDone:
 				return
 			case <-g.IsShutdown():
-				func() {
-					// When WaitGroup counter goes negative it will panic - we don't care about this so we can just ignore it.
-					defer func() {
-						_ = recover()
-					}()
-					// Ensure that the createServerWaitGroup stops waiting
-					for {
-						g.createServerWaitGroup.Done()
-					}
-				}()
+				g.createServerCond.Signal()
 				return
 			case <-time.After(setting.StartupTimeout):
 				log.Error("Startup took too long! Shutting down")
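The new goroutine body is a standard condition-variable loop with an escape hatch: each time it is woken it re-checks both the counter and the shutdown channel, and the shutdown path wakes it with Signal(). A compact, self-contained sketch of that loop under assumed names (the stop channel and the waitOrStop helper below stand in for g.IsShutdown() and the real manager fields):

package main

import (
	"fmt"
	"sync"
)

// waitOrStop mirrors the loop in start(): return true once created reaches want,
// or false if stop is closed first. Whoever closes stop must also call
// cond.Signal() so the waiter wakes up and notices the shutdown.
func waitOrStop(cond *sync.Cond, created *int, want int, stop <-chan struct{}) bool {
	cond.L.Lock()
	defer cond.L.Unlock()
	for {
		if *created >= want {
			return true // all servers created; the real code calls g.notify(readyMsg) here
		}
		select {
		case <-stop:
			return false // shutdown requested before startup finished
		default:
		}
		cond.Wait() // atomically unlocks, sleeps, and relocks when signalled
	}
}

func main() {
	cond := sync.NewCond(&sync.Mutex{})
	created, stop := 0, make(chan struct{})
	go func() {
		cond.L.Lock()
		created = 2 // pretend both expected servers reported in
		cond.L.Unlock()
		cond.Signal()
	}()
	fmt.Println(waitOrStop(cond, &created, 2, stop)) // prints true once the counter is reached
}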

modules/graceful/manager_windows.go

Lines changed: 27 additions & 25 deletions

@@ -149,33 +149,35 @@ hammerLoop:
 func (g *Manager) awaitServer(limit time.Duration) bool {
 	c := make(chan struct{})
 	go func() {
-		defer close(c)
-		func() {
-			// FIXME: there is a fundamental design problem of the "manager" and the "wait group".
-			// If nothing has started, the "Wait" just panics: sync: WaitGroup is reused before previous Wait has returned
-			// There is no clear solution besides a complete rewriting of the "manager"
-			defer func() {
-				_ = recover()
-			}()
-			g.createServerWaitGroup.Wait()
-		}()
+		g.createServerCond.L.Lock()
+		for {
+			if g.createdServer >= numberOfServersToCreate {
+				g.createServerCond.L.Unlock()
+				close(c)
+				return
+			}
+			select {
+			case <-g.IsShutdown():
+				g.createServerCond.L.Unlock()
+				return
+			default:
+			}
+			g.createServerCond.Wait()
+		}
 	}()
+
+	var tc <-chan time.Time
 	if limit > 0 {
-		select {
-		case <-c:
-			return true // completed normally
-		case <-time.After(limit):
-			return false // timed out
-		case <-g.IsShutdown():
-			return false
-		}
-	} else {
-		select {
-		case <-c:
-			return true // completed normally
-		case <-g.IsShutdown():
-			return false
-		}
+		tc = time.After(limit)
+	}
+	select {
+	case <-c:
+		return true // completed normally
+	case <-tc:
+		return false // timed out
+	case <-g.IsShutdown():
+		g.createServerCond.Signal()
+		return false
 	}
 }
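The Windows path also folds the two select statements into one by leaving the timeout channel nil when no limit is set: a receive from a nil channel blocks forever, so that case simply never fires. A short illustration of the idiom under assumed names (awaitDone is not part of Gitea):

package main

import (
	"fmt"
	"time"
)

// awaitDone mirrors the reshaped select in awaitServer: an optional timeout is
// expressed as a possibly-nil channel, because receiving from a nil channel
// blocks forever and therefore disables that case.
func awaitDone(done <-chan struct{}, limit time.Duration) bool {
	var tc <-chan time.Time
	if limit > 0 {
		tc = time.After(limit)
	}
	select {
	case <-done:
		return true // completed normally
	case <-tc:
		return false // timed out; only reachable when limit > 0
	}
}

func main() {
	done := make(chan struct{})
	go func() {
		time.Sleep(10 * time.Millisecond)
		close(done)
	}()
	fmt.Println(awaitDone(done, time.Second)) // prints true: done closes well before the limit
}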
