From 797dcc5169dca1976bdcfd1f20d9fa8c6c0d36cc Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Wed, 14 May 2025 11:31:09 -0700 Subject: [PATCH] Handle non-empty EmptyDirs used by GCS Fuse CSI Driver. Some CSI drivers, like GCS Fuse CSI driver, inject EmptyDirs into sidecar containers and communicate with the container using files in the EmptyDir. In gVisor terminology, such an EmptyDir is being used as a shared bind (gofer) mount. It is not exclusive to the sandbox. This breaks a fundamental assumption gVisor makes about EmptyDirs; it assumes that they are exclusive to the sandbox and that it has no external observers. So as an optimization, gVisor converts EmptyDir volumes into gVisor-internal tmpfs filesystems that are mounted into all the containers that are using that EmptyDir. As a result: - Any files in the host EmptyDir directory is not reflected within the sandbox. - Any changes made by the sandbox in the EmptyDir are not reflcted on the host. This change uses the heuristic that if the EmptyDir volume's host directory is not empty at sandbox creation time, then it is being shared with some external component which is interacting with the sandbox. We have observed that the GCS Fuse CSI Driver populates the /gcsfuse-tmp EmptyDir with a UDS at path `.volumes/gcsfuse-mount/socket`. PiperOrigin-RevId: 758765841 --- pkg/shim/v1/utils/BUILD | 1 + pkg/shim/v1/utils/volumes.go | 69 +++++++++++++++------ pkg/shim/v1/utils/volumes_test.go | 99 +++++++++++++++++++++++++++++-- runsc/boot/mount_hints.go | 6 ++ runsc/container/container.go | 2 +- 5 files changed, 154 insertions(+), 23 deletions(-) diff --git a/pkg/shim/v1/utils/BUILD b/pkg/shim/v1/utils/BUILD index a06802e076..b97921c3f4 100644 --- a/pkg/shim/v1/utils/BUILD +++ b/pkg/shim/v1/utils/BUILD @@ -17,6 +17,7 @@ go_library( ], deps = [ "//runsc/specutils", + "@com_github_containerd_log//:go_default_library", "@com_github_opencontainers_runtime_spec//specs-go:go_default_library", ], ) diff --git a/pkg/shim/v1/utils/volumes.go b/pkg/shim/v1/utils/volumes.go index cc575263a6..12f97e0ec9 100644 --- a/pkg/shim/v1/utils/volumes.go +++ b/pkg/shim/v1/utils/volumes.go @@ -16,9 +16,12 @@ package utils import ( "fmt" + "io" + "os" "path/filepath" "strings" + "github.com/containerd/log" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/runsc/specutils" ) @@ -81,6 +84,11 @@ func volumeSourceKey(volume string) string { return volumeKeyPrefix + volume + ".source" } +// volumeShareKey constructs the annotation key for volume share type. +func volumeShareKey(volume string) string { + return volumeKeyPrefix + volume + ".share" +} + // volumePath searches the volume path in the kubelet pod directory. func volumePath(volume, uid string) (string, error) { // TODO: Support subpath when gvisor supports pod volume bind mount. @@ -121,17 +129,12 @@ func isVolumePath(volume, path string) (bool, error) { // runsc should use these two setting to infer EmptyDir medium: // - tmpfs annotation type + tmpfs mount type = memory-backed EmptyDir // - tmpfs annotation type + bind mount type = disk-backed EmptyDir +// +// NOTE(b/416567832): Some CSI drivers (like GCS FUSE driver) use EmptyDirs to +// communicate with the Pod over a UDS. While not foolproof, we detect such +// EmptyDirs by checking if the host directory is not empty and turn off the +// EmptyDir optimization for them by configuring them as normal bind mounts. func UpdateVolumeAnnotations(s *specs.Spec) (bool, error) { - var uid string - if IsSandbox(s) { - var err error - uid, err = podUID(s) - if err != nil { - // Skip if we can't get pod UID, because this doesn't work - // for containerd 1.1. - return false, nil - } - } updated := false for k, v := range s.Annotations { if !isVolumeKey(k) { @@ -141,34 +144,51 @@ func UpdateVolumeAnnotations(s *specs.Spec) (bool, error) { continue } volume := volumeName(k) - if uid != "" { + if IsSandbox(s) { // This is the root (first) container. Mount annotations are only // consumed from this container's spec. So fix mount annotations by: // 1. Adding source annotation. // 2. Fixing type annotation. + uid, err := podUID(s) + if err != nil { + // Skip if we can't get pod UID, because this doesn't work + // for containerd 1.1. + return false, nil + } path, err := volumePath(volume, uid) if err != nil { return false, fmt.Errorf("get volume path for %q: %w", volume, err) } s.Annotations[volumeSourceKey(volume)] = path if strings.Contains(path, emptyDirVolumesDir) { - s.Annotations[k] = "tmpfs" // See note about EmptyDir. + if isDirEmpty(path) { + s.Annotations[k] = "tmpfs" // See note about EmptyDir. + } else { + // This is a non-empty EmptyDir volume. Configure it as a bind mount. + log.L.Infof("Non-empty EmptyDir volume %q, configuring bind mount annotations", volume) + s.Annotations[k] = "bind" + s.Annotations[volumeShareKey(volume)] = "shared" + } } updated = true } else { // This is a sub-container. Mount annotations are ignored. So no need to - // bother fixing those. + // bother fixing those. An error is returned for sandbox if source + // annotation is not successfully applied, so it is guaranteed that the + // source annotation for sandbox has already been successfully applied at + // this point. Update mount type in spec.Mounts if required. for i := range s.Mounts { - // An error is returned for sandbox if source annotation is not - // successfully applied, so it is guaranteed that the source annotation - // for sandbox has already been successfully applied at this point. - // // The volume name is unique inside a pod, so matching without podUID // is fine here. // // TODO: Pass podUID down to shim for containers to do more accurate // matching. if yes, _ := isVolumePath(volume, s.Mounts[i].Source); yes { + if strings.Contains(s.Mounts[i].Source, emptyDirVolumesDir) && !isDirEmpty(s.Mounts[i].Source) { + // This is a non-empty EmptyDir volume. Don't change the mount type. + log.L.Infof("Non-empty EmptyDir volume %q, not changing its mount type", volume) + continue + } // Container mount type must match the mount type specified by // admission controller. See note about EmptyDir. specutils.ChangeMountType(&s.Mounts[i], v) @@ -187,6 +207,21 @@ func UpdateVolumeAnnotations(s *specs.Spec) (bool, error) { return updated, nil } +func isDirEmpty(path string) bool { + f, err := os.Open(path) + if err != nil { + log.L.Warningf("failed to open %q to check if it is empty: %v", path, err) + return true + } + defer f.Close() + + _, err = f.Readdirnames(1) + if err == io.EOF { + return true + } + return false +} + // configureShm sets up annotations to mount /dev/shm as a pod shared tmpfs // mount inside containers. // diff --git a/pkg/shim/v1/utils/volumes_test.go b/pkg/shim/v1/utils/volumes_test.go index f499dbbf30..3799a048d0 100644 --- a/pkg/shim/v1/utils/volumes_test.go +++ b/pkg/shim/v1/utils/volumes_test.go @@ -32,17 +32,25 @@ func TestUpdateVolumeAnnotations(t *testing.T) { kubeletPodsDir = dir const ( - testPodUID = "testuid" - testVolumeName = "testvolume" - testLogDirPath = "/var/log/pods/testns_testname_" + testPodUID - testLegacyLogDirPath = "/var/log/pods/" + testPodUID + testPodUID = "testuid" + testVolumeName = "testvolume" + testNonEmptyVolumeName = "nonemptyvolume" + testLogDirPath = "/var/log/pods/testns_testname_" + testPodUID + testLegacyLogDirPath = "/var/log/pods/" + testPodUID ) testVolumePath := fmt.Sprintf("%s/%s/volumes/%s/%s", dir, testPodUID, emptyDirVolumesDir, testVolumeName) - if err := os.MkdirAll(testVolumePath, 0755); err != nil { t.Fatalf("Create test volume: %v", err) } + testNonEmptyVolumePath := fmt.Sprintf("%s/%s/volumes/%s/%s", dir, testPodUID, emptyDirVolumesDir, testNonEmptyVolumeName) + if err := os.MkdirAll(testNonEmptyVolumePath, 0755); err != nil { + t.Fatalf("Create test volume: %v", err) + } + if err := os.WriteFile(testNonEmptyVolumePath+"/file", []byte("hello"), 0644); err != nil { + t.Fatalf("Create test volume: %v", err) + } + for _, test := range []struct { name string spec *specs.Spec @@ -144,6 +152,87 @@ func TestUpdateVolumeAnnotations(t *testing.T) { }, expectUpdate: true, }, + { + name: "non-empty volume for sandbox", + spec: &specs.Spec{ + Annotations: map[string]string{ + sandboxLogDirAnnotation: testLogDirPath, + ContainerTypeAnnotation: containerTypeSandbox, + volumeKeyPrefix + testNonEmptyVolumeName + ".share": "pod", + volumeKeyPrefix + testNonEmptyVolumeName + ".type": "tmpfs", + volumeKeyPrefix + testNonEmptyVolumeName + ".options": "ro", + }, + }, + expected: &specs.Spec{ + Annotations: map[string]string{ + sandboxLogDirAnnotation: testLogDirPath, + ContainerTypeAnnotation: containerTypeSandbox, + volumeKeyPrefix + testNonEmptyVolumeName + ".share": "shared", + volumeKeyPrefix + testNonEmptyVolumeName + ".type": "bind", + volumeKeyPrefix + testNonEmptyVolumeName + ".options": "ro", + volumeKeyPrefix + testNonEmptyVolumeName + ".source": testNonEmptyVolumePath, + }, + }, + expectUpdate: true, + }, + { + name: "non-empty volume for container", + spec: &specs.Spec{ + Mounts: []specs.Mount{ + { + Destination: "/test", + Type: "bind", + Source: testNonEmptyVolumePath, + Options: []string{"ro"}, + }, + }, + Annotations: map[string]string{ + ContainerTypeAnnotation: ContainerTypeContainer, + volumeKeyPrefix + testNonEmptyVolumeName + ".share": "pod", + volumeKeyPrefix + testNonEmptyVolumeName + ".type": "tmpfs", + volumeKeyPrefix + testNonEmptyVolumeName + ".options": "ro", + }, + }, + expected: &specs.Spec{ + Mounts: []specs.Mount{ + { + Destination: "/test", + Type: "bind", + Source: testNonEmptyVolumePath, + Options: []string{"ro"}, + }, + }, + Annotations: map[string]string{ + ContainerTypeAnnotation: ContainerTypeContainer, + volumeKeyPrefix + testNonEmptyVolumeName + ".share": "pod", + volumeKeyPrefix + testNonEmptyVolumeName + ".type": "tmpfs", + volumeKeyPrefix + testNonEmptyVolumeName + ".options": "ro", + }, + }, + }, + { + name: "bind: volume annotations for sandbox", + spec: &specs.Spec{ + Annotations: map[string]string{ + sandboxLogDirAnnotation: testLogDirPath, + ContainerTypeAnnotation: containerTypeSandbox, + volumeKeyPrefix + testVolumeName + ".share": "container", + volumeKeyPrefix + testVolumeName + ".type": "bind", + volumeKeyPrefix + testVolumeName + ".options": "ro", + }, + }, + expected: &specs.Spec{ + Annotations: map[string]string{ + sandboxLogDirAnnotation: testLogDirPath, + ContainerTypeAnnotation: containerTypeSandbox, + volumeKeyPrefix + testVolumeName + ".share": "container", + volumeKeyPrefix + testVolumeName + ".type": "tmpfs", + volumeKeyPrefix + testVolumeName + ".options": "ro", + volumeKeyPrefix + testVolumeName + ".source": testVolumePath, + }, + }, + expectUpdate: true, + }, { name: "bind: volume annotations for container", spec: &specs.Spec{ diff --git a/runsc/boot/mount_hints.go b/runsc/boot/mount_hints.go index c8e428cdb5..3b1090e871 100644 --- a/runsc/boot/mount_hints.go +++ b/runsc/boot/mount_hints.go @@ -185,6 +185,12 @@ func (m *MountHint) ShouldShareMount() bool { (m.Share == container || m.Share == pod) } +// IsSandboxLocal returns true if this mount is only used by the sandbox and +// has no external observers. +func (m *MountHint) IsSandboxLocal() bool { + return m.Share == container || m.Share == pod +} + // checkCompatible verifies that shared mount is compatible with master. // Master options must be the same or less restrictive than the container mount, // e.g. master can be 'rw' while container mounts as 'ro'. diff --git a/runsc/container/container.go b/runsc/container/container.go index d480e1e511..093e9cf942 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -965,7 +965,7 @@ func (c *Container) initGoferConfs(ovlConf config.Overlay2, mountHints *boot.Pod if specutils.IsReadonlyMount(c.Spec.Mounts[i].Options) { overlayMedium = config.NoOverlay } - if hint := mountHints.FindMount(c.Spec.Mounts[i].Source); hint != nil { + if hint := mountHints.FindMount(c.Spec.Mounts[i].Source); hint != nil && hint.IsSandboxLocal() { // Note that we want overlayMedium=self even if this is a read-only mount so that // the shared mount is created correctly. Future containers may mount this writably. overlayMedium = config.SelfOverlay