Skip to content

Commit 1dff11d

Browse files
committed
tt: fix the race condition between start and stop
This patch fixes the signals handling strategy. Now there is only one handling loop for all signals. Added two fields for controlling a state of the instance being watched and a mutex for synchronizing the goroutines changing them.
1 parent 7f9c9aa commit 1dff11d

File tree

2 files changed

+48
-43
lines changed

2 files changed

+48
-43
lines changed

cli/running/running.go

Lines changed: 6 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import (
66
"io/ioutil"
77
"os"
88
"os/exec"
9-
"os/signal"
109
"path"
1110
"path/filepath"
1211
"strings"
@@ -20,7 +19,6 @@ import (
2019
"github.com/tarantool/tt/cli/process_utils"
2120
"github.com/tarantool/tt/cli/ttlog"
2221
"github.com/tarantool/tt/cli/util"
23-
"golang.org/x/net/context"
2422
"gopkg.in/yaml.v2"
2523
)
2624

@@ -516,32 +514,20 @@ func FillCtx(cliOpts *config.CliOpts, cmdCtx *cmdcontext.CmdCtx,
516514

517515
// Start an Instance.
518516
func Start(cmdCtx *cmdcontext.CmdCtx, run *InstanceCtx) error {
519-
// Register a context, related with a signal handler that
520-
// removes a pid file if interrupt signals were sent before starting
521-
// an instance and watchdog signal handlers.
522-
sigCtx, stop := signal.NotifyContext(context.Background(),
523-
syscall.SIGINT, syscall.SIGTERM)
517+
logger := createLogger(run)
518+
provider := providerImpl{cmdCtx: cmdCtx, instanceCtx: run}
519+
wd := NewWatchdog(run.Restartable, 5*time.Second, logger, &provider)
520+
wd.StartSignalHandling()
521+
524522
if err := process_utils.CreatePIDFile(run.PIDFile); err != nil {
525523
return err
526524
}
527525

528526
defer func() {
529527
cleanup(run)
530-
stop()
531528
}()
532529

533-
logger := createLogger(run)
534-
provider := providerImpl{cmdCtx: cmdCtx, instanceCtx: run}
535-
wd := NewWatchdog(run.Restartable, 5*time.Second, logger, &provider)
536-
537-
// The Done() state of the context means that the process received
538-
// an interrupt signal and there is a need to clean up the resources.
539-
select {
540-
case <-sigCtx.Done():
541-
return nil
542-
default:
543-
wd.Start()
544-
}
530+
wd.Start()
545531
return nil
546532
}
547533

cli/running/watchdog.go

Lines changed: 42 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,12 @@ type Watchdog struct {
3535
// restartTimeout describes the timeout between
3636
// restarting the Instance.
3737
restartTimeout time.Duration
38-
// stopped indicates whether Watchdog was stopped.
39-
stopped bool
40-
// stopMutex used to avoid a race condition under stopped field.
41-
stopMutex sync.Mutex
38+
// watchdogStateMutex used to avoid a race condition under started and shouldStop fields.
39+
watchdogStateMutex sync.Mutex
40+
// started indicates whether Watchdog has started an Instance.
41+
started bool
42+
// shouldStop indicates whether Watchdog should stop the Instance.
43+
shouldStop bool
4244
// done channel used to inform the signal handle goroutine
4345
// about termination of the Instance.
4446
done chan bool
@@ -55,7 +57,7 @@ func NewWatchdog(restartable bool, restartTimeout time.Duration, logger *ttlog.L
5557
provider: provider}
5658

5759
wd.done = make(chan bool, 1)
58-
wd.stopped = false
60+
wd.shouldStop = false
5961

6062
return &wd
6163
}
@@ -72,23 +74,32 @@ func (wd *Watchdog) Start() {
7274
break
7375
}
7476
wd.logger = wd.Instance.logger
75-
// Start the Instance and forwarding signals (except SIGINT and SIGTERM)
76-
wd.startSignalHandling()
77-
wd.stopMutex.Lock()
78-
if !wd.stopped {
77+
78+
wd.watchdogStateMutex.Lock()
79+
if !wd.shouldStop {
80+
// Start the Instance and forwarding signals (except SIGINT and SIGTERM)
7981
if err := wd.Instance.Start(); err != nil {
8082
wd.logger.Printf(`Watchdog(ERROR): "%v".`, err)
81-
wd.stopMutex.Unlock()
83+
wd.watchdogStateMutex.Unlock()
8284
break
8385
}
86+
wd.started = true
87+
} else {
88+
wd.logger.Printf(`Watchdog(ERROR): terminated before start.`)
89+
wd.watchdogStateMutex.Unlock()
90+
return
8491
}
85-
wd.stopMutex.Unlock()
92+
wd.watchdogStateMutex.Unlock()
8693

8794
// Wait while the Instance will be terminated.
8895
if err := wd.Instance.Wait(); err != nil {
8996
wd.logger.Printf(`Watchdog(WARN): "%v".`, err)
9097
}
9198

99+
wd.watchdogStateMutex.Lock()
100+
wd.started = false
101+
wd.watchdogStateMutex.Unlock()
102+
92103
// Set Instance process completion indication.
93104
wd.done <- true
94105
// Wait for the signal processing goroutine to complete.
@@ -100,7 +111,7 @@ func (wd *Watchdog) Start() {
100111
wd.logger.Println("Watchdog(ERROR): can't check if the instance is restartable.")
101112
break
102113
}
103-
if wd.stopped || !restartable {
114+
if wd.shouldStop || !restartable {
104115
wd.logger.Println("Watchdog(INFO): the Instance has shutdown.")
105116
break
106117
}
@@ -112,15 +123,18 @@ func (wd *Watchdog) Start() {
112123
wd.logger = logger
113124
}
114125
time.Sleep(wd.restartTimeout)
126+
127+
wd.shouldStop = false
128+
// Before the restart of an instance start a new signal handling loop.
129+
wd.StartSignalHandling()
115130
}
116131
}
117132

118-
// startSignalHandling starts signal handling in a separate goroutine.
119-
func (wd *Watchdog) startSignalHandling() {
133+
// StartSignalHandling starts signal handling in a separate goroutine.
134+
func (wd *Watchdog) StartSignalHandling() {
120135
sigChan := make(chan os.Signal, 1)
121-
// Reset unregisters all previous handlers for interrupt signals.
122-
signal.Reset(syscall.SIGINT,
123-
syscall.SIGTERM, syscall.SIGHUP)
136+
// Reset the signal mask before starting of the new loop.
137+
signal.Reset()
124138
signal.Notify(sigChan)
125139

126140
// Set barrier to synchronize with the main loop when the Instance stops.
@@ -136,20 +150,25 @@ func (wd *Watchdog) startSignalHandling() {
136150
case sig := <-sigChan:
137151
switch sig {
138152
case syscall.SIGINT, syscall.SIGTERM:
139-
wd.stopMutex.Lock()
140-
wd.Instance.Stop(30 * time.Second)
153+
wd.watchdogStateMutex.Lock()
154+
if wd.started {
155+
wd.Instance.Stop(30 * time.Second)
156+
}
141157
// If we receive one of the "stop" signals, the
142158
// program should be terminated.
143-
wd.stopped = true
144-
wd.stopMutex.Unlock()
159+
wd.shouldStop = true
160+
wd.watchdogStateMutex.Unlock()
145161
case syscall.SIGHUP:
146162
// Rotate the log files.
147163
wd.logger.Rotate()
148164
default:
149-
wd.Instance.SendSignal(sig)
165+
wd.watchdogStateMutex.Lock()
166+
if wd.started {
167+
wd.Instance.SendSignal(sig)
168+
}
169+
wd.watchdogStateMutex.Unlock()
150170
}
151171
case _ = <-wd.done:
152-
signal.Reset()
153172
return
154173
}
155174
}

0 commit comments

Comments
 (0)