Skip to content

Commit f0863fa

Browse files
committed
tt: fix the race condition between start and stop
This patch fixes the signals handling strategy. Now there is only one handling loop for all signals. Added a field for controlling a state of the watchdog and a mutex for synchronizing the goroutines changing this field. Part of #325
1 parent 7f9c9aa commit f0863fa

File tree

3 files changed

+69
-50
lines changed

3 files changed

+69
-50
lines changed

cli/running/running.go

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import (
66
"io/ioutil"
77
"os"
88
"os/exec"
9-
"os/signal"
109
"path"
1110
"path/filepath"
1211
"strings"
@@ -20,7 +19,6 @@ import (
2019
"github.com/tarantool/tt/cli/process_utils"
2120
"github.com/tarantool/tt/cli/ttlog"
2221
"github.com/tarantool/tt/cli/util"
23-
"golang.org/x/net/context"
2422
"gopkg.in/yaml.v2"
2523
)
2624

@@ -516,32 +514,21 @@ func FillCtx(cliOpts *config.CliOpts, cmdCtx *cmdcontext.CmdCtx,
516514

517515
// Start an Instance.
518516
func Start(cmdCtx *cmdcontext.CmdCtx, run *InstanceCtx) error {
519-
// Register a context, related with a signal handler that
520-
// removes a pid file if interrupt signals were sent before starting
521-
// an instance and watchdog signal handlers.
522-
sigCtx, stop := signal.NotifyContext(context.Background(),
523-
syscall.SIGINT, syscall.SIGTERM)
524-
if err := process_utils.CreatePIDFile(run.PIDFile); err != nil {
525-
return err
526-
}
517+
logger := createLogger(run)
518+
provider := providerImpl{cmdCtx: cmdCtx, instanceCtx: run}
519+
wd := NewWatchdog(run.Restartable, 5*time.Second, logger, &provider)
527520

528521
defer func() {
529522
cleanup(run)
530-
stop()
531523
}()
532524

533-
logger := createLogger(run)
534-
provider := providerImpl{cmdCtx: cmdCtx, instanceCtx: run}
535-
wd := NewWatchdog(run.Restartable, 5*time.Second, logger, &provider)
536-
537-
// The Done() state of the context means that the process received
538-
// an interrupt signal and there is a need to clean up the resources.
539-
select {
540-
case <-sigCtx.Done():
525+
preStartAction := func() error {
526+
if err := process_utils.CreatePIDFile(run.PIDFile); err != nil {
527+
return err
528+
}
541529
return nil
542-
default:
543-
wd.Start()
544530
}
531+
wd.Start(preStartAction)
545532
return nil
546533
}
547534

cli/running/watchdog.go

Lines changed: 57 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -35,17 +35,17 @@ type Watchdog struct {
3535
// restartTimeout describes the timeout between
3636
// restarting the Instance.
3737
restartTimeout time.Duration
38-
// stopped indicates whether Watchdog was stopped.
39-
stopped bool
40-
// stopMutex used to avoid a race condition under stopped field.
41-
stopMutex sync.Mutex
4238
// done channel used to inform the signal handle goroutine
4339
// about termination of the Instance.
4440
done chan bool
4541
// provider provides Watchdog methods to get objects whose creation
4642
// and updating may depend on changing external parameters
4743
// (such as configuration file).
4844
provider Provider
45+
// stopMutex used to avoid a race condition under shouldStop field.
46+
stopMutex sync.Mutex
47+
// shouldStop indicates whether the Watchdog should be stopped.
48+
shouldStop bool
4949
}
5050

5151
// NewWatchdog creates a new instance of Watchdog.
@@ -55,33 +55,49 @@ func NewWatchdog(restartable bool, restartTimeout time.Duration, logger *ttlog.L
5555
provider: provider}
5656

5757
wd.done = make(chan bool, 1)
58-
wd.stopped = false
5958

6059
return &wd
6160
}
6261

6362
// Start starts the Instance and signal handling.
64-
func (wd *Watchdog) Start() {
63+
func (wd *Watchdog) Start(preStartAction func() error) error {
64+
var err error
65+
// Create Instance.
66+
if wd.Instance, err = wd.provider.CreateInstance(wd.logger); err != nil {
67+
wd.logger.Printf(`Watchdog(ERROR): "%v".`, err)
68+
return err
69+
}
70+
wd.logger = wd.Instance.logger
71+
// The signal handling loop must be started before the instance
72+
// get started for avoiding a race condition between tt start
73+
// and tt stop. This way we avoid a situation when we receive
74+
// a signal before starting a handler for it.
75+
wd.startSignalHandling()
76+
77+
if err = preStartAction(); err != nil {
78+
wd.logger.Printf(`Pre-start action error: %v`, err)
79+
// Finish the signal handling goroutine.
80+
wd.done <- true
81+
return err
82+
}
83+
6584
// The Instance must be restarted on completion if the "restartable"
6685
// parameter is set to "true".
6786
for {
6887
var err error
69-
// Create Instance.
70-
if wd.Instance, err = wd.provider.CreateInstance(wd.logger); err != nil {
88+
89+
wd.stopMutex.Lock()
90+
if wd.shouldStop {
91+
wd.logger.Printf(`Watchdog(ERROR): terminated before instance start.`)
92+
wd.stopMutex.Unlock()
93+
return nil
94+
}
95+
// Start the Instance.
96+
if err := wd.Instance.Start(); err != nil {
7197
wd.logger.Printf(`Watchdog(ERROR): "%v".`, err)
98+
wd.stopMutex.Unlock()
7299
break
73100
}
74-
wd.logger = wd.Instance.logger
75-
// Start the Instance and forwarding signals (except SIGINT and SIGTERM)
76-
wd.startSignalHandling()
77-
wd.stopMutex.Lock()
78-
if !wd.stopped {
79-
if err := wd.Instance.Start(); err != nil {
80-
wd.logger.Printf(`Watchdog(ERROR): "%v".`, err)
81-
wd.stopMutex.Unlock()
82-
break
83-
}
84-
}
85101
wd.stopMutex.Unlock()
86102

87103
// Wait while the Instance will be terminated.
@@ -100,7 +116,7 @@ func (wd *Watchdog) Start() {
100116
wd.logger.Println("Watchdog(ERROR): can't check if the instance is restartable.")
101117
break
102118
}
103-
if wd.stopped || !restartable {
119+
if wd.shouldStop || !restartable {
104120
wd.logger.Println("Watchdog(INFO): the Instance has shutdown.")
105121
break
106122
}
@@ -112,15 +128,26 @@ func (wd *Watchdog) Start() {
112128
wd.logger = logger
113129
}
114130
time.Sleep(wd.restartTimeout)
131+
132+
wd.shouldStop = false
133+
134+
// Recreate Instance.
135+
if wd.Instance, err = wd.provider.CreateInstance(wd.logger); err != nil {
136+
wd.logger.Printf(`Watchdog(ERROR): "%v".`, err)
137+
return err
138+
}
139+
wd.logger = wd.Instance.logger
140+
// Before the restart of an instance start a new signal handling loop.
141+
wd.startSignalHandling()
115142
}
143+
return nil
116144
}
117145

118146
// startSignalHandling starts signal handling in a separate goroutine.
119147
func (wd *Watchdog) startSignalHandling() {
120148
sigChan := make(chan os.Signal, 1)
121-
// Reset unregisters all previous handlers for interrupt signals.
122-
signal.Reset(syscall.SIGINT,
123-
syscall.SIGTERM, syscall.SIGHUP)
149+
// Reset the signal mask before starting of the new loop.
150+
signal.Reset()
124151
signal.Notify(sigChan)
125152

126153
// Set barrier to synchronize with the main loop when the Instance stops.
@@ -137,19 +164,22 @@ func (wd *Watchdog) startSignalHandling() {
137164
switch sig {
138165
case syscall.SIGINT, syscall.SIGTERM:
139166
wd.stopMutex.Lock()
140-
wd.Instance.Stop(30 * time.Second)
141167
// If we receive one of the "stop" signals, the
142168
// program should be terminated.
143-
wd.stopped = true
169+
wd.shouldStop = true
144170
wd.stopMutex.Unlock()
171+
if wd.Instance.IsAlive() {
172+
wd.Instance.Stop(30 * time.Second)
173+
}
145174
case syscall.SIGHUP:
146175
// Rotate the log files.
147176
wd.logger.Rotate()
148177
default:
149-
wd.Instance.SendSignal(sig)
178+
if wd.Instance.IsAlive() {
179+
wd.Instance.SendSignal(sig)
180+
}
150181
}
151182
case _ = <-wd.done:
152-
signal.Reset()
153183
return
154184
}
155185
}

cli/running/watchdog_test.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,9 @@ func TestWatchdogBase(t *testing.T) {
112112
t.Cleanup(func() { cleanupWatchdog(wd) })
113113

114114
wdDoneChan := make(chan bool, 1)
115+
testPreAction := func() error { return nil }
115116
go func() {
116-
wd.Start()
117+
wd.Start(testPreAction)
117118
wdDoneChan <- true
118119
}()
119120
waitProcessStart()
@@ -140,8 +141,9 @@ func TestWatchdogNotRestartable(t *testing.T) {
140141
t.Cleanup(func() { cleanupWatchdog(wd) })
141142

142143
wdDoneChan := make(chan bool, 1)
144+
testPreAction := func() error { return nil }
143145
go func() {
144-
wd.Start()
146+
wd.Start(testPreAction)
145147
wdDoneChan <- true
146148
}()
147149
waitProcessStart()

0 commit comments

Comments
 (0)