diff --git a/pkg/shim/v1/utils/BUILD b/pkg/shim/v1/utils/BUILD index a06802e076..b97921c3f4 100644 --- a/pkg/shim/v1/utils/BUILD +++ b/pkg/shim/v1/utils/BUILD @@ -17,6 +17,7 @@ go_library( ], deps = [ "//runsc/specutils", + "@com_github_containerd_log//:go_default_library", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", ], ) diff --git a/pkg/shim/v1/utils/volumes.go b/pkg/shim/v1/utils/volumes.go index cc575263a6..12f97e0ec9 100644 --- a/pkg/shim/v1/utils/volumes.go +++ b/pkg/shim/v1/utils/volumes.go @@ -16,9 +16,12 @@ package utils import ( "fmt" + "io" + "os" "path/filepath" "strings" + "github.com/containerd/log" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/runsc/specutils" ) @@ -81,6 +84,11 @@ func volumeSourceKey(volume string) string { return volumeKeyPrefix + volume + ".source" } +// volumeShareKey constructs the annotation key for volume share type. +func volumeShareKey(volume string) string { + return volumeKeyPrefix + volume + ".share" +} + // volumePath searches the volume path in the kubelet pod directory. func volumePath(volume, uid string) (string, error) { // TODO: Support subpath when gvisor supports pod volume bind mount. @@ -121,17 +129,12 @@ func isVolumePath(volume, path string) (bool, error) { // runsc should use these two setting to infer EmptyDir medium: // - tmpfs annotation type + tmpfs mount type = memory-backed EmptyDir // - tmpfs annotation type + bind mount type = disk-backed EmptyDir +// +// NOTE(b/416567832): Some CSI drivers (like GCS FUSE driver) use EmptyDirs to +// communicate with the Pod over a UDS. While not foolproof, we detect such +// EmptyDirs by checking if the host directory is not empty and turn off the +// EmptyDir optimization for them by configuring them as normal bind mounts. func UpdateVolumeAnnotations(s *specs.Spec) (bool, error) { - var uid string - if IsSandbox(s) { - var err error - uid, err = podUID(s) - if err != nil { - // Skip if we can't get pod UID, because this doesn't work - // for containerd 1.1. - return false, nil - } - } updated := false for k, v := range s.Annotations { if !isVolumeKey(k) { @@ -141,34 +144,51 @@ func UpdateVolumeAnnotations(s *specs.Spec) (bool, error) { continue } volume := volumeName(k) - if uid != "" { + if IsSandbox(s) { // This is the root (first) container. Mount annotations are only // consumed from this container's spec. So fix mount annotations by: // 1. Adding source annotation. // 2. Fixing type annotation. + uid, err := podUID(s) + if err != nil { + // Skip if we can't get pod UID, because this doesn't work + // for containerd 1.1. + return false, nil + } path, err := volumePath(volume, uid) if err != nil { return false, fmt.Errorf("get volume path for %q: %w", volume, err) } s.Annotations[volumeSourceKey(volume)] = path if strings.Contains(path, emptyDirVolumesDir) { - s.Annotations[k] = "tmpfs" // See note about EmptyDir. + if isDirEmpty(path) { + s.Annotations[k] = "tmpfs" // See note about EmptyDir. + } else { + // This is a non-empty EmptyDir volume. Configure it as a bind mount. + log.L.Infof("Non-empty EmptyDir volume %q, configuring bind mount annotations", volume) + s.Annotations[k] = "bind" + s.Annotations[volumeShareKey(volume)] = "shared" + } } updated = true } else { // This is a sub-container. Mount annotations are ignored. So no need to - // bother fixing those. + // bother fixing those. An error is returned for sandbox if source + // annotation is not successfully applied, so it is guaranteed that the + // source annotation for sandbox has already been successfully applied at + // this point. Update mount type in spec.Mounts if required. for i := range s.Mounts { - // An error is returned for sandbox if source annotation is not - // successfully applied, so it is guaranteed that the source annotation - // for sandbox has already been successfully applied at this point. - // // The volume name is unique inside a pod, so matching without podUID // is fine here. // // TODO: Pass podUID down to shim for containers to do more accurate // matching. if yes, _ := isVolumePath(volume, s.Mounts[i].Source); yes { + if strings.Contains(s.Mounts[i].Source, emptyDirVolumesDir) && !isDirEmpty(s.Mounts[i].Source) { + // This is a non-empty EmptyDir volume. Don't change the mount type. + log.L.Infof("Non-empty EmptyDir volume %q, not changing its mount type", volume) + continue + } // Container mount type must match the mount type specified by // admission controller. See note about EmptyDir. specutils.ChangeMountType(&s.Mounts[i], v) @@ -187,6 +207,21 @@ func UpdateVolumeAnnotations(s *specs.Spec) (bool, error) { return updated, nil } +func isDirEmpty(path string) bool { + f, err := os.Open(path) + if err != nil { + log.L.Warningf("failed to open %q to check if it is empty: %v", path, err) + return true + } + defer f.Close() + + _, err = f.Readdirnames(1) + if err == io.EOF { + return true + } + return false +} + // configureShm sets up annotations to mount /dev/shm as a pod shared tmpfs // mount inside containers. // diff --git a/pkg/shim/v1/utils/volumes_test.go b/pkg/shim/v1/utils/volumes_test.go index f499dbbf30..3799a048d0 100644 --- a/pkg/shim/v1/utils/volumes_test.go +++ b/pkg/shim/v1/utils/volumes_test.go @@ -32,17 +32,25 @@ func TestUpdateVolumeAnnotations(t *testing.T) { kubeletPodsDir = dir const ( - testPodUID = "testuid" - testVolumeName = "testvolume" - testLogDirPath = "/var/log/pods/testns_testname_" + testPodUID - testLegacyLogDirPath = "/var/log/pods/" + testPodUID + testPodUID = "testuid" + testVolumeName = "testvolume" + testNonEmptyVolumeName = "nonemptyvolume" + testLogDirPath = "/var/log/pods/testns_testname_" + testPodUID + testLegacyLogDirPath = "/var/log/pods/" + testPodUID ) testVolumePath := fmt.Sprintf("%s/%s/volumes/%s/%s", dir, testPodUID, emptyDirVolumesDir, testVolumeName) - if err := os.MkdirAll(testVolumePath, 0755); err != nil { t.Fatalf("Create test volume: %v", err) } + testNonEmptyVolumePath := fmt.Sprintf("%s/%s/volumes/%s/%s", dir, testPodUID, emptyDirVolumesDir, testNonEmptyVolumeName) + if err := os.MkdirAll(testNonEmptyVolumePath, 0755); err != nil { + t.Fatalf("Create test volume: %v", err) + } + if err := os.WriteFile(testNonEmptyVolumePath+"/file", []byte("hello"), 0644); err != nil { + t.Fatalf("Create test volume: %v", err) + } + for _, test := range []struct { name string spec *specs.Spec @@ -144,6 +152,87 @@ func TestUpdateVolumeAnnotations(t *testing.T) { }, expectUpdate: true, }, + { + name: "non-empty volume for sandbox", + spec: &specs.Spec{ + Annotations: map[string]string{ + sandboxLogDirAnnotation: testLogDirPath, + ContainerTypeAnnotation: containerTypeSandbox, + volumeKeyPrefix + testNonEmptyVolumeName + ".share": "pod", + volumeKeyPrefix + testNonEmptyVolumeName + ".type": "tmpfs", + volumeKeyPrefix + testNonEmptyVolumeName + ".options": "ro", + }, + }, + expected: &specs.Spec{ + Annotations: map[string]string{ + sandboxLogDirAnnotation: testLogDirPath, + ContainerTypeAnnotation: containerTypeSandbox, + volumeKeyPrefix + testNonEmptyVolumeName + ".share": "shared", + volumeKeyPrefix + testNonEmptyVolumeName + ".type": "bind", + volumeKeyPrefix + testNonEmptyVolumeName + ".options": "ro", + volumeKeyPrefix + testNonEmptyVolumeName + ".source": testNonEmptyVolumePath, + }, + }, + expectUpdate: true, + }, + { + name: "non-empty volume for container", + spec: &specs.Spec{ + Mounts: []specs.Mount{ + { + Destination: "/test", + Type: "bind", + Source: testNonEmptyVolumePath, + Options: []string{"ro"}, + }, + }, + Annotations: map[string]string{ + ContainerTypeAnnotation: ContainerTypeContainer, + volumeKeyPrefix + testNonEmptyVolumeName + ".share": "pod", + volumeKeyPrefix + testNonEmptyVolumeName + ".type": "tmpfs", + volumeKeyPrefix + testNonEmptyVolumeName + ".options": "ro", + }, + }, + expected: &specs.Spec{ + Mounts: []specs.Mount{ + { + Destination: "/test", + Type: "bind", + Source: testNonEmptyVolumePath, + Options: []string{"ro"}, + }, + }, + Annotations: map[string]string{ + ContainerTypeAnnotation: ContainerTypeContainer, + volumeKeyPrefix + testNonEmptyVolumeName + ".share": "pod", + volumeKeyPrefix + testNonEmptyVolumeName + ".type": "tmpfs", + volumeKeyPrefix + testNonEmptyVolumeName + ".options": "ro", + }, + }, + }, + { + name: "bind: volume annotations for sandbox", + spec: &specs.Spec{ + Annotations: map[string]string{ + sandboxLogDirAnnotation: testLogDirPath, + ContainerTypeAnnotation: containerTypeSandbox, + volumeKeyPrefix + testVolumeName + ".share": "container", + volumeKeyPrefix + testVolumeName + ".type": "bind", + volumeKeyPrefix + testVolumeName + ".options": "ro", + }, + }, + expected: &specs.Spec{ + Annotations: map[string]string{ + sandboxLogDirAnnotation: testLogDirPath, + ContainerTypeAnnotation: containerTypeSandbox, + volumeKeyPrefix + testVolumeName + ".share": "container", + volumeKeyPrefix + testVolumeName + ".type": "tmpfs", + volumeKeyPrefix + testVolumeName + ".options": "ro", + volumeKeyPrefix + testVolumeName + ".source": testVolumePath, + }, + }, + expectUpdate: true, + }, { name: "bind: volume annotations for container", spec: &specs.Spec{ diff --git a/runsc/boot/mount_hints.go b/runsc/boot/mount_hints.go index c8e428cdb5..3b1090e871 100644 --- a/runsc/boot/mount_hints.go +++ b/runsc/boot/mount_hints.go @@ -185,6 +185,12 @@ func (m *MountHint) ShouldShareMount() bool { (m.Share == container || m.Share == pod) } +// IsSandboxLocal returns true if this mount is only used by the sandbox and +// has no external observers. +func (m *MountHint) IsSandboxLocal() bool { + return m.Share == container || m.Share == pod +} + // checkCompatible verifies that shared mount is compatible with master. // Master options must be the same or less restrictive than the container mount, // e.g. master can be 'rw' while container mounts as 'ro'. diff --git a/runsc/container/container.go b/runsc/container/container.go index d480e1e511..093e9cf942 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -965,7 +965,7 @@ func (c *Container) initGoferConfs(ovlConf config.Overlay2, mountHints *boot.Pod if specutils.IsReadonlyMount(c.Spec.Mounts[i].Options) { overlayMedium = config.NoOverlay } - if hint := mountHints.FindMount(c.Spec.Mounts[i].Source); hint != nil { + if hint := mountHints.FindMount(c.Spec.Mounts[i].Source); hint != nil && hint.IsSandboxLocal() { // Note that we want overlayMedium=self even if this is a read-only mount so that // the shared mount is created correctly. Future containers may mount this writably. overlayMedium = config.SelfOverlay