@@ -74,6 +74,7 @@ const (
74
74
firecrackerStartTimeout = 5 * time .Second
75
75
defaultStopVMTimeout = 5 * time .Second
76
76
defaultShutdownTimeout = 5 * time .Second
77
+ jailerStopTimeout = 3 * time .Second
77
78
78
79
// StartEventName is the topic published to when a VM starts
79
80
StartEventName = "/firecracker-vm/start"
@@ -1171,54 +1172,80 @@ func (s *service) Shutdown(requestCtx context.Context, req *taskAPI.ShutdownRequ
1171
1172
return & ptypes.Empty {}, nil
1172
1173
}
1173
1174
1174
- // shutdown will stop the VMM within the provided timeout. It attempts to shutdown gracefully by having
1175
- // agent stop (which is presumed to cause the VM to begin a reboot) and then waiting for the VMM process
1176
- // to exit (via the s.shimCtx.Done() channel). If that fails, StopVMM will be called to force a shutdown (currently
1177
- // via sending SIGTERM). If that fails, the VMM will still be killed via SIGKILL when the shimCtx is canceled.
1178
1175
func (s * service ) shutdown (
1179
1176
requestCtx context.Context ,
1180
1177
timeout time.Duration ,
1181
1178
req * taskAPI.ShutdownRequest ,
1182
1179
) error {
1183
- shutdownCtx , cancel := context . WithTimeout ( requestCtx , timeout )
1184
- defer cancel ()
1180
+ s . logger . Info ( "stopping the VM" )
1181
+
1185
1182
go func () {
1186
- // Once the shutdown procedure is done, the shim needs to shutdown too.
1187
- // This also ensures that if the VMM is still alive, it will receive a
1188
- // SIGKILL via exec.CommandContext
1189
- <- shutdownCtx .Done ()
1190
- s .shimCancel ()
1183
+ s .shutdownLoop (requestCtx , timeout , req )
1191
1184
}()
1192
1185
1193
- s .logger .Info ("stopping the VM" )
1194
-
1195
- // Try to tell agent to exit, causing the VM to begin a reboot. If that
1196
- // fails, try to forcibly stop the VMM. If that too fails, just cancel
1197
- // the shutdownCtx to fast-path to the VMM getting SIGKILL.
1198
- _ , shutdownErr := s .agentClient .Shutdown (shutdownCtx , req )
1199
- if shutdownErr != nil {
1200
- s .logger .WithError (shutdownErr ).Error ("failed to shutdown VM agent" )
1201
- stopVMMErr := s .machine .StopVMM ()
1202
- if stopVMMErr != nil {
1203
- s .logger .WithError (stopVMMErr ).Error ("failed to forcibly stop VMM" )
1204
- cancel ()
1205
- }
1186
+ err := s .machine .Wait (context .Background ())
1187
+ if err == nil {
1188
+ return nil
1206
1189
}
1190
+ return status .Error (codes .Internal , fmt .Sprintf ("the VMM was killed forcibly: %v" , err ))
1191
+ }
1207
1192
1208
- // wait for the shimCtx to be done, which means the VM has exited and we're ready
1209
- // to shutdown
1210
- <- s .shimCtx .Done ()
1211
- if shutdownCtx .Err () == context .DeadlineExceeded {
1212
- return status .Error (codes .DeadlineExceeded ,
1213
- "timed out waiting for VM shutdown, VMM was sent SIGKILL" )
1214
- }
1193
+ // shutdownLoop sends multiple different shutdown requests to stop the VMM.
1194
+ // 1) send a request to the in-VM agent, which is presumed to cause the VM to begin a reboot.
1195
+ // 2) stop the VM through jailer#Stop(). The signal should be visible from the VMM (e.g. SIGTERM)
1196
+ // 3) stop the VM through cancelling the associated context. The signal would not be visible from the VMM (e.g. SIGKILL)
1197
+ func (s * service ) shutdownLoop (
1198
+ requestCtx context.Context ,
1199
+ timeout time.Duration ,
1200
+ req * taskAPI.ShutdownRequest ,
1201
+ ) {
1202
+ actions := []struct {
1203
+ name string
1204
+ shutdown func () error
1205
+ timeout time.Duration
1206
+ }{
1207
+ {
1208
+ name : "send a request to the in-VM agent" ,
1209
+ shutdown : func () error {
1210
+ _ , err := s .agentClient .Shutdown (requestCtx , req )
1211
+ if err != nil {
1212
+ return err
1213
+ }
1214
+ return nil
1215
+ },
1216
+ timeout : timeout ,
1217
+ },
1218
+ {
1219
+ name : "stop the jailer" ,
1220
+ shutdown : func () error {
1221
+ return s .jailer .Stop ()
1222
+ },
1223
+ timeout : jailerStopTimeout ,
1224
+ },
1225
+ {
1226
+ name : "cancel the context" ,
1227
+ shutdown : func () error {
1228
+ s .shimCancel ()
1229
+ return nil
1230
+ },
1231
+ },
1232
+ }
1233
+
1234
+ for _ , action := range actions {
1235
+ pid , err := s .machine .PID ()
1236
+ if pid == 0 && err != nil {
1237
+ break // we have nothing to kill
1238
+ }
1215
1239
1216
- if s .vmExitErr != nil {
1217
- return status .Error (codes .Internal ,
1218
- fmt .Sprintf ("VMM exit errors: %v" , s .vmExitErr ))
1240
+ s .logger .Debug (action .name )
1241
+ err = action .shutdown ()
1242
+ if err != nil {
1243
+ // if sending an request doesn't succeed, don't wait and carry on.
1244
+ s .logger .WithError (err ).Errorf ("failed to %s" , action .name )
1245
+ } else {
1246
+ time .Sleep (action .timeout )
1247
+ }
1219
1248
}
1220
-
1221
- return nil
1222
1249
}
1223
1250
1224
1251
func (s * service ) Stats (requestCtx context.Context , req * taskAPI.StatsRequest ) (* taskAPI.StatsResponse , error ) {
0 commit comments