Skip to content

Commit cae3924

Browse files
authored
Merge pull request #324 from kzys/monitor-vm
Cleanup around Shutdown() and StopVM()
2 parents 6336eaa + 4c59982 commit cae3924

File tree

9 files changed

+136
-79
lines changed

9 files changed

+136
-79
lines changed

firecracker-control/local.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -369,9 +369,17 @@ func (s *local) newShim(ns, vmID, containerdAddress string, shimSocket *net.Unix
369369

370370
// make sure to wait after start
371371
go func() {
372-
logger.Debug("waiting on shim process")
373-
waitErr := cmd.Wait()
374-
logger.WithError(waitErr).Debug("completed waiting on shim process")
372+
if err := cmd.Wait(); err != nil {
373+
if exitErr, ok := err.(*exec.ExitError); ok {
374+
// shim is usually terminated by cancelling the context
375+
logger.WithError(exitErr).Debug("shim has been terminated")
376+
} else {
377+
logger.WithError(err).Error("shim has been unexpectedly terminated")
378+
}
379+
}
380+
if err := os.RemoveAll(shimDir.RootPath()); err != nil {
381+
logger.WithError(err).Errorf("failed to remove %q", shimDir.RootPath())
382+
}
375383
}()
376384

377385
err = setShimOOMScore(cmd.Process.Pid)

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ require (
1818
github.com/docker/go-events v0.0.0-20170721190031-9461782956ad // indirect
1919
github.com/docker/go-metrics v0.0.0-20181218153428-b84716841b82 // indirect
2020
github.com/docker/go-units v0.3.3
21-
github.com/firecracker-microvm/firecracker-go-sdk v0.17.1-0.20191029213755-dbf9a1e05f09
21+
github.com/firecracker-microvm/firecracker-go-sdk v0.19.0
2222
github.com/go-ole/go-ole v1.2.4 // indirect
2323
github.com/godbus/dbus v0.0.0-20181025153459-66d97aec3384 // indirect
2424
github.com/gofrs/uuid v3.2.0+incompatible

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ github.com/docker/go-metrics v0.0.0-20181218153428-b84716841b82 h1:X0fj836zx99zF
5959
github.com/docker/go-metrics v0.0.0-20181218153428-b84716841b82/go.mod h1:/u0gXw0Gay3ceNrsHubL3BtdOL2fHf93USgMTe0W5dI=
6060
github.com/docker/go-units v0.3.3 h1:Xk8S3Xj5sLGlG5g67hJmYMmUgXv5N4PhkjJHHqrwnTk=
6161
github.com/docker/go-units v0.3.3/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
62-
github.com/firecracker-microvm/firecracker-go-sdk v0.17.1-0.20191029213755-dbf9a1e05f09 h1:JDfRpK+V2J1Es+Xm6aMJjCqvA4xv1kuWnJfeSopyDwo=
63-
github.com/firecracker-microvm/firecracker-go-sdk v0.17.1-0.20191029213755-dbf9a1e05f09/go.mod h1:tVXziw7GjioCKVjI5/agymYxUaqJM6q7cp9e6kwjo8Q=
62+
github.com/firecracker-microvm/firecracker-go-sdk v0.19.0 h1:Fgb3WhB4q3J5e+ksMBtjVwBTvbDtPAkx7eQulb2BOq8=
63+
github.com/firecracker-microvm/firecracker-go-sdk v0.19.0/go.mod h1:kW0gxvPpPvMukUxxTO9DrpSlScrtrTDGY3VgjAj/Qwc=
6464
github.com/globalsign/mgo v0.0.0-20180905125535-1ca0a4f7cbcb h1:D4uzjWwKYQ5XnAvUbuvHW93esHg7F8N/OYeBBcJoTr0=
6565
github.com/globalsign/mgo v0.0.0-20180905125535-1ca0a4f7cbcb/go.mod h1:xkRDCp4j0OGD1HRkm4kmhM+pmpv3AKq5SU7GMg4oO/Q=
6666
github.com/go-ole/go-ole v1.2.4 h1:nNBDSCOigTSiarFpYE9J/KtEA1IOW4CNeqT9TQDqCxI=

runtime/service.go

Lines changed: 62 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"math"
2020
"net"
2121
"os"
22+
"os/exec"
2223
"runtime/debug"
2324
"strconv"
2425
"strings"
@@ -45,7 +46,6 @@ import (
4546
"github.com/gofrs/uuid"
4647
ptypes "github.com/gogo/protobuf/types"
4748
"github.com/golang/protobuf/ptypes/empty"
48-
"github.com/hashicorp/go-multierror"
4949
"github.com/pkg/errors"
5050
"github.com/sirupsen/logrus"
5151
"golang.org/x/sys/unix"
@@ -71,6 +71,7 @@ const (
7171
minVsockIOPort = uint32(11000)
7272
firecrackerStartTimeout = 5 * time.Second
7373
defaultStopVMTimeout = 5 * time.Second
74+
defaultShutdownTimeout = 5 * time.Second
7475

7576
// StartEventName is the topic published to when a VM starts
7677
StartEventName = "/firecracker-vm/start"
@@ -548,46 +549,31 @@ func (s *service) mountDrives(requestCtx context.Context, driveMounts []*proto.F
548549
return nil
549550
}
550551

551-
// StopVM will shutdown the firecracker VM and start this shim's shutdown procedure. If the VM has not been
552-
// created yet and the timeout is hit waiting for it to exist, an error will be returned but the shim will
553-
// continue to shutdown.
552+
// StopVM will shutdown the VMM. Unlike Shutdown, this method is exposed to containerd clients.
553+
// If the VM has not been created yet and the timeout is hit waiting for it to exist, an error will be returned
554+
// but the shim will continue to shutdown.
554555
func (s *service) StopVM(requestCtx context.Context, request *proto.StopVMRequest) (_ *empty.Empty, err error) {
556+
defer logPanicAndDie(s.logger)
557+
s.logger.WithFields(logrus.Fields{"timeout_seconds": request.TimeoutSeconds}).Debug("StopVM")
558+
555559
timeout := defaultStopVMTimeout
556560
if request.TimeoutSeconds > 0 {
557561
timeout = time.Duration(request.TimeoutSeconds) * time.Second
558562
}
559-
timer := time.NewTimer(timeout)
560-
defer logPanicAndDie(s.logger)
561-
// If something goes wrong here, just shut down ungracefully. This eliminates some scenarios that would result
562-
// in the user being unable to shut down the VM.
563-
defer func() {
564-
if err != nil {
565-
s.logger.WithError(err).Error("StopVM error, shim is shutting down ungracefully")
566-
s.shimCancel()
567-
}
568-
}()
569563

564+
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), timeout)
565+
defer shutdownCancel()
566+
567+
defer s.shimCancel()
570568
err = s.waitVMReady()
571569
if err != nil {
572570
return nil, err
573571
}
574572

575-
shutdownCh := make(chan error)
576-
go func() {
577-
defer close(shutdownCh)
578-
_, err := s.Shutdown(requestCtx, &taskAPI.ShutdownRequest{Now: true})
579-
shutdownCh <- err
580-
}()
581-
582-
select {
583-
case <-timer.C:
584-
return nil, status.Error(codes.DeadlineExceeded, "timed out waiting for VM shutdown")
585-
case err = <-shutdownCh:
586-
if err != nil {
587-
return nil, err
588-
}
589-
return &empty.Empty{}, nil
573+
if err = s.shutdown(requestCtx, shutdownCtx, &taskAPI.ShutdownRequest{Now: true}); err != nil {
574+
return nil, err
590575
}
576+
return &empty.Empty{}, nil
591577
}
592578

593579
// GetVMInfo returns metadata for the VM being managed by this shim. If the VM has not been created yet, this
@@ -1149,54 +1135,71 @@ func (s *service) Connect(requestCtx context.Context, req *taskAPI.ConnectReques
11491135
return nil, errdefs.ErrNotImplemented
11501136
}
11511137

1152-
// Shutdown will attempt a graceful shutdown of the shim+VM. The shutdown procedure will only actually take
1153-
// place if "Now" was set to true OR the VM started successfully, all tasks have been deleted and we were
1154-
// told to shutdown when all tasks were deleted. Otherwise the call is just ignored.
1138+
// Shutdown will shutdown of the VMM. Unlike StopVM, this method is only exposed to containerd itself.
11551139
//
1156-
// Shutdown can be called from a few code paths:
1157-
// * If StopVM is called by the user (in which case "Now" is set to true)
1158-
// * After any task is deleted via containerd's API (containerd calls on behalf of the user)
1159-
// * After any task Create call returns an error (containerd calls on behalf of the user)
1160-
// Shutdown is not directly exposed to containerd clients.
1140+
// The shutdown procedure will only actually take place if "Now" was set to true OR
1141+
// the VM started successfully, all tasks have been deleted and we were told to shutdown when all tasks were deleted.
1142+
// Otherwise the call is just ignored.
1143+
//
1144+
// containerd calls this API on behalf of the user in the following cases:
1145+
// * After any task is deleted via containerd's API
1146+
// * After any task Create call returns an error
11611147
func (s *service) Shutdown(requestCtx context.Context, req *taskAPI.ShutdownRequest) (*ptypes.Empty, error) {
11621148
defer logPanicAndDie(log.G(requestCtx))
1163-
s.logger.WithFields(logrus.Fields{"task_id": req.ID, "now": req.Now}).Debug("shutdown")
1149+
s.logger.WithFields(logrus.Fields{"task_id": req.ID, "now": req.Now}).Debug("Shutdown")
11641150

11651151
shouldShutdown := req.Now || s.exitAfterAllTasksDeleted && s.taskManager.ShutdownIfEmpty()
11661152
if !shouldShutdown {
11671153
return &ptypes.Empty{}, nil
11681154
}
11691155

1170-
// cancel the shim context no matter what, which will result in the VM getting a SIGKILL (if not already
1171-
// dead from graceful shutdown) and the shim process itself to begin exiting
1172-
defer s.shimCancel()
1156+
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), defaultShutdownTimeout*time.Second)
1157+
defer shutdownCancel()
11731158

1174-
s.logger.Info("stopping the VM")
1159+
if err := s.shutdown(requestCtx, shutdownCtx, req); err != nil {
1160+
return &ptypes.Empty{}, err
1161+
}
11751162

1176-
var shutdownErr error
1163+
return &ptypes.Empty{}, nil
1164+
}
11771165

1178-
_, err := s.agentClient.Shutdown(requestCtx, req)
1179-
if err != nil {
1180-
shutdownErr = multierror.Append(shutdownErr, errors.Wrap(err, "failed to shutdown VM Agent"))
1181-
}
1166+
// shutdown will shutdown the VMM. requestCtx is used to issue requests and shutdownCtx is used for waiting a graceful shutdown procedure.
1167+
// When shutdownCtx's Done channel is closed during the graceful shutdown procedure, the VMM is forcefully killed.
1168+
func (s *service) shutdown(requestCtx, shutdownCtx context.Context, req *taskAPI.ShutdownRequest) error {
1169+
// cancel the shim at the end, in case even the last StopVMM() doesn't work
1170+
defer s.shimCancel()
11821171

1183-
err = s.machine.StopVMM()
1184-
if err != nil {
1185-
shutdownErr = multierror.Append(shutdownErr, errors.Wrap(err, "failed to gracefully stop VM"))
1172+
s.logger.Info("stopping the VM")
1173+
if _, err := s.agentClient.Shutdown(requestCtx, req); err != nil {
1174+
return err
11861175
}
11871176

1188-
err = os.RemoveAll(s.shimDir.RootPath())
1189-
if err != nil {
1190-
shutdownErr = multierror.Append(shutdownErr, errors.Wrapf(err, "failed to remove VM dir %q during shutdown", s.shimDir.RootPath()))
1191-
}
1177+
shutdownCh := make(chan error)
1178+
go func() {
1179+
defer close(shutdownCh)
1180+
err := s.machine.Wait(shutdownCtx)
1181+
shutdownCh <- err
1182+
}()
11921183

1193-
if shutdownErr != nil {
1194-
s.logger.WithError(shutdownErr).Error()
1195-
return nil, shutdownErr
1184+
var shutdownErr error
1185+
select {
1186+
case shutdownErr = <-shutdownCh:
1187+
if shutdownErr == nil {
1188+
s.logger.Info("the VM has been stopped successfully")
1189+
return nil
1190+
} else if exitErr, ok := shutdownErr.(*exec.ExitError); ok {
1191+
s.logger.WithError(exitErr).Error("the VM has been stopped, but not successfully")
1192+
return exitErr
1193+
}
1194+
s.logger.WithError(shutdownErr).Error("the VM returns unknown error")
1195+
case <-shutdownCtx.Done():
1196+
s.logger.Error("the VM hasn't been stopped before the context's deadline")
11961197
}
11971198

1198-
s.logger.Info("successfully stopped the VM")
1199-
return &ptypes.Empty{}, nil
1199+
if err := s.machine.StopVMM(); err != nil {
1200+
return err
1201+
}
1202+
return shutdownErr
12001203
}
12011204

12021205
func (s *service) Stats(requestCtx context.Context, req *taskAPI.StatsRequest) (*taskAPI.StatsResponse, error) {

runtime/service_integ_test.go

Lines changed: 56 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -140,16 +140,7 @@ func TestShimExitsUponContainerDelete_Isolated(t *testing.T) {
140140
err = task.Start(testCtx)
141141
require.NoError(t, err, "failed to start task for container %s", containerName)
142142

143-
shimProcesses, err := internal.WaitForProcessToExist(testCtx, time.Second,
144-
func(ctx context.Context, p *process.Process) (bool, error) {
145-
processExecutable, err := p.ExeWithContext(ctx)
146-
if err != nil {
147-
return false, err
148-
}
149-
150-
return filepath.Base(processExecutable) == shimProcessName, nil
151-
},
152-
)
143+
shimProcesses, err := internal.WaitForProcessToExist(testCtx, time.Second, findShim)
153144
require.NoError(t, err, "failed waiting for expected shim process %q to come up", shimProcessName)
154145
require.Len(t, shimProcesses, 1, "expected only one shim process to exist")
155146
shimProcess := shimProcesses[0]
@@ -1228,3 +1219,58 @@ func TestRandomness_Isolated(t *testing.T) {
12281219
}
12291220
}
12301221
}
1222+
1223+
func TestStopVM_Isolated(t *testing.T) {
1224+
prepareIntegTest(t)
1225+
require := require.New(t)
1226+
1227+
client, err := containerd.New(containerdSockPath, containerd.WithDefaultRuntime(firecrackerRuntime))
1228+
require.NoError(err, "unable to create client to containerd service at %s, is containerd running?", containerdSockPath)
1229+
defer client.Close()
1230+
1231+
timeout, cancel := context.WithTimeout(context.Background(), 60*time.Second)
1232+
defer cancel()
1233+
ctx := namespaces.WithNamespace(timeout, "default")
1234+
1235+
image, err := alpineImage(ctx, client, defaultSnapshotterName())
1236+
require.NoError(err, "failed to get alpine image")
1237+
1238+
pluginClient, err := ttrpcutil.NewClient(containerdSockPath + ".ttrpc")
1239+
require.NoError(err, "failed to create ttrpc client")
1240+
1241+
vmID := testNameToVMID(t.Name())
1242+
1243+
fcClient := fccontrol.NewFirecrackerClient(pluginClient.Client())
1244+
_, err = fcClient.CreateVM(ctx, &proto.CreateVMRequest{VMID: vmID})
1245+
require.NoError(err)
1246+
1247+
c, err := client.NewContainer(ctx,
1248+
"container",
1249+
containerd.WithSnapshotter(defaultSnapshotterName()),
1250+
containerd.WithNewSnapshot("snapshot", image),
1251+
containerd.WithNewSpec(oci.WithProcessArgs("/bin/echo", "-n", "hello"), firecrackeroci.WithVMID(vmID)),
1252+
)
1253+
require.NoError(err)
1254+
1255+
stdout := startAndWaitTask(ctx, t, c)
1256+
require.Equal("hello", stdout)
1257+
1258+
shimProcesses, err := internal.WaitForProcessToExist(ctx, time.Second, findShim)
1259+
require.NoError(err, "failed waiting for expected shim process %q to come up", shimProcessName)
1260+
require.Len(shimProcesses, 1, "expected only one shim process to exist")
1261+
1262+
_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: vmID})
1263+
require.NoError(err)
1264+
1265+
err = internal.WaitForPidToExit(ctx, time.Second, shimProcesses[0].Pid)
1266+
require.NoError(err, "shim hasn't been terminated")
1267+
}
1268+
1269+
func findShim(ctx context.Context, p *process.Process) (bool, error) {
1270+
processExecutable, err := p.ExeWithContext(ctx)
1271+
if err != nil {
1272+
return false, err
1273+
}
1274+
1275+
return filepath.Base(processExecutable) == shimProcessName, nil
1276+
}

tools/image-builder/Dockerfile.debian-image

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
# express or implied. See the License for the specific language governing
1212
# permissions and limitations under the License.
1313

14-
FROM debian:stretch-slim
14+
FROM debian:buster-slim
1515

1616
RUN apt-get update && \
1717
DEBIAN_FRONTEND=noninteractive apt-get -y install \

tools/image-builder/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ endif
5656
debootstrap \
5757
--variant=minbase \
5858
--include=udev,systemd,systemd-sysv,procps,libseccomp2,haveged \
59-
stretch \
59+
buster \
6060
"$(WORKDIR)" $(DEBMIRROR)
6161
rm -rf "$(WORKDIR)/var/cache/apt/archives" \
6262
"$(WORKDIR)/usr/share/doc" \

tools/image-builder/files_debootstrap/etc/systemd/system/firecracker-agent.service

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@ Description=Firecracker VM agent
1515
StartLimitIntervalSec=2
1616
After=local-fs.target
1717
Requires=local-fs.target
18+
SuccessAction=reboot
1819

1920
[Service]
2021
Type=simple
2122
WorkingDirectory=/container
2223
ExecStart=/usr/local/bin/agent
23-
Restart=always

tools/image-builder/files_debootstrap/sbin/overlay-init

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,4 +59,4 @@ do_overlay
5959

6060
# invoke the actual system init program and procede with the boot
6161
# process.
62-
exec /sbin/init $@
62+
exec /usr/sbin/init $@

0 commit comments

Comments
 (0)