Skip to content

Commit 00f03db

Browse files
avagingvisor-bot
authored and committed
Add support of rootless containers
* support podman rootless containers * support docker rootless containers Fixes #311 PiperOrigin-RevId: 458573377
1 parent dc7675b commit 00f03db

File tree

7 files changed

+242
-21
lines changed

7 files changed

+242
-21
lines changed

.buildkite/pipeline.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,12 @@ steps:
267267
agents:
268268
cgroup: "v2"
269269
os: "ubuntu"
270+
- <<: *common
271+
label: ":podman: Podman"
272+
command: sudo ./test/podman/run.sh
273+
agents:
274+
cgroup: "v2"
275+
os: "ubuntu"
270276

271277
# Check the website builds.
272278
- <<: *common

runsc/cmd/do.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ type Do struct {
4848
ip string
4949
quiet bool
5050
overlay bool
51+
uidMap idMapSlice
52+
gidMap idMapSlice
5153
}
5254

5355
// Name implements subcommands.Command.Name.
@@ -72,13 +74,53 @@ used for testing only.
7274
`
7375
}
7476

77+
type idMapSlice []specs.LinuxIDMapping
78+
79+
// String implements flag.Value.String.
80+
func (is *idMapSlice) String() string {
81+
return fmt.Sprintf("%#v", is)
82+
}
83+
84+
// Get implements flag.Value.Get.
85+
func (is *idMapSlice) Get() interface{} {
86+
return is
87+
}
88+
89+
// Set implements flag.Value.Set.
90+
func (is *idMapSlice) Set(s string) error {
91+
fs := strings.Fields(s)
92+
if len(fs) != 3 {
93+
return fmt.Errorf("invalid mapping: %s", s)
94+
}
95+
var cid, hid, size int
96+
var err error
97+
if cid, err = strconv.Atoi(fs[0]); err != nil {
98+
return fmt.Errorf("invalid mapping: %s", s)
99+
}
100+
if hid, err = strconv.Atoi(fs[1]); err != nil {
101+
return fmt.Errorf("invalid mapping: %s", s)
102+
}
103+
if size, err = strconv.Atoi(fs[2]); err != nil {
104+
return fmt.Errorf("invalid mapping: %s", s)
105+
}
106+
m := specs.LinuxIDMapping{
107+
ContainerID: uint32(cid),
108+
HostID: uint32(hid),
109+
Size: uint32(size),
110+
}
111+
*is = append(*is, m)
112+
return nil
113+
}
114+
75115
// SetFlags implements subcommands.Command.SetFlags.
76116
func (c *Do) SetFlags(f *flag.FlagSet) {
77117
f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`)
78118
f.StringVar(&c.cwd, "cwd", ".", "path to the current directory, defaults to the current directory")
79119
f.StringVar(&c.ip, "ip", "192.168.10.2", "IPv4 address for the sandbox")
80120
f.BoolVar(&c.quiet, "quiet", false, "suppress runsc messages to stdout. Application output is still sent to stdout and stderr")
81121
f.BoolVar(&c.overlay, "force-overlay", true, "use an overlay. WARNING: disabling gives the command write access to the host")
122+
f.Var(&c.uidMap, "uid-map", "Add a user id mapping [ContainerID, HostID, Size]")
123+
f.Var(&c.gidMap, "gid-map", "Add a group id mapping [ContainerID, HostID, Size]")
82124
}
83125

84126
// Execute implements subcommands.Command.Execute.
@@ -129,6 +171,12 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
129171

130172
cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000))
131173

174+
if c.uidMap != nil {
175+
addNamespace(spec, specs.LinuxNamespace{Type: specs.UserNamespace})
176+
spec.Linux.UIDMappings = c.uidMap
177+
spec.Linux.GIDMappings = c.gidMap
178+
}
179+
132180
if conf.Network == config.NetworkNone {
133181
addNamespace(spec, specs.LinuxNamespace{Type: specs.NetworkNamespace})
134182
} else if conf.Rootless {

runsc/cmd/gofer.go

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@ import (
1818
"context"
1919
"encoding/json"
2020
"fmt"
21+
"io"
2122
"os"
2223
"path/filepath"
24+
"runtime"
2325
"runtime/debug"
2426
"strings"
2527

@@ -63,8 +65,9 @@ type Gofer struct {
6365
applyCaps bool
6466
setUpRoot bool
6567

66-
specFD int
67-
mountsFD int
68+
specFD int
69+
mountsFD int
70+
syncUsernsFD int
6871
}
6972

7073
// Name implements subcommands.Command.
@@ -92,6 +95,7 @@ func (g *Gofer) SetFlags(f *flag.FlagSet) {
9295
f.Var(&g.ioFDs, "io-fds", "list of FDs to connect gofer servers. They must follow this order: root first, then mounts as defined in the spec")
9396
f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec")
9497
f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).")
98+
f.IntVar(&g.syncUsernsFD, "sync-userns-fd", -1, "file descriptor used to synchronize rootless user namespace initialization.")
9599
}
96100

97101
// Execute implements subcommands.Command.
@@ -113,6 +117,26 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
113117
util.Fatalf("reading spec: %v", err)
114118
}
115119

120+
if g.syncUsernsFD >= 0 {
121+
f := os.NewFile(uintptr(g.syncUsernsFD), "sync FD")
122+
defer f.Close()
123+
var b [1]byte
124+
if n, err := f.Read(b[:]); n != 0 || err != io.EOF {
125+
util.Fatalf("failed to sync: %v: %v", n, err)
126+
}
127+
128+
f.Close()
129+
// SETUID changes UID on the current system thread, so we have
130+
// to re-execute current binary.
131+
runtime.LockOSThread()
132+
if _, _, errno := unix.RawSyscall(unix.SYS_SETUID, 0, 0, 0); errno != 0 {
133+
util.Fatalf("failed to set UID: %v", errno)
134+
}
135+
if _, _, errno := unix.RawSyscall(unix.SYS_SETGID, 0, 0, 0); errno != 0 {
136+
util.Fatalf("failed to set GID: %v", errno)
137+
}
138+
}
139+
116140
if g.setUpRoot {
117141
if err := setupRootFS(spec, conf); err != nil {
118142
util.Fatalf("Error setting up root FS: %v", err)
@@ -122,7 +146,7 @@ func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
122146
// Disable caps when calling myself again.
123147
// Note: minimal argument handling for the default case to keep it simple.
124148
args := os.Args
125-
args = append(args, "--apply-caps=false", "--setup-root=false")
149+
args = append(args, "--apply-caps=false", "--setup-root=false", "--sync-userns-fd=-1")
126150
util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, goferCaps, setCapsAndCallSelf(args, goferCaps))
127151
panic("unreachable")
128152
}

runsc/container/container.go

Lines changed: 77 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -971,15 +971,49 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bu
971971
{Type: specs.UTSNamespace},
972972
}
973973

974+
rootlessEUID := unix.Getuid() != 0
975+
var syncFile *os.File
974976
// Setup any uid/gid mappings, and create or join the configured user
975977
// namespace so the gofer's view of the filesystem aligns with the
976978
// users in the sandbox.
977-
userNS := specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec)
978-
nss = append(nss, userNS...)
979-
specutils.SetUIDGIDMappings(cmd, spec)
980-
if len(userNS) != 0 {
981-
// We need to set UID and GID to have capabilities in a new user namespace.
982-
cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
979+
if !rootlessEUID {
980+
userNS := specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec)
981+
nss = append(nss, userNS...)
982+
specutils.SetUIDGIDMappings(cmd, spec)
983+
if len(userNS) != 0 {
984+
// We need to set UID and GID to have capabilities in a new user namespace.
985+
cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
986+
}
987+
} else {
988+
userNS := specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec)
989+
if len(userNS) == 0 {
990+
return nil, nil, fmt.Errorf("unable to run a rootless container without userns")
991+
}
992+
fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
993+
if err != nil {
994+
return nil, nil, err
995+
}
996+
syncFile = os.NewFile(uintptr(fds[0]), "sync FD")
997+
defer syncFile.Close()
998+
999+
f := os.NewFile(uintptr(fds[1]), "sync other FD")
1000+
donations.DonateAndClose("sync-userns-fd", f)
1001+
if cmd.SysProcAttr == nil {
1002+
cmd.SysProcAttr = &unix.SysProcAttr{}
1003+
}
1004+
cmd.SysProcAttr.AmbientCaps = []uintptr{
1005+
unix.CAP_CHOWN,
1006+
unix.CAP_DAC_OVERRIDE,
1007+
unix.CAP_DAC_READ_SEARCH,
1008+
unix.CAP_FOWNER,
1009+
unix.CAP_FSETID,
1010+
unix.CAP_SYS_CHROOT,
1011+
unix.CAP_SETUID,
1012+
unix.CAP_SETGID,
1013+
unix.CAP_SYS_ADMIN,
1014+
unix.CAP_SETPCAP,
1015+
}
1016+
nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
9831017
}
9841018

9851019
donations.Transfer(cmd, nextFD)
@@ -990,6 +1024,43 @@ func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bu
9901024
if err := specutils.StartInNS(cmd, nss); err != nil {
9911025
return nil, nil, fmt.Errorf("gofer: %v", err)
9921026
}
1027+
1028+
if rootlessEUID {
1029+
log.Debugf("Setting user mappings")
1030+
args := []string{strconv.Itoa(cmd.Process.Pid)}
1031+
for _, idMap := range spec.Linux.UIDMappings {
1032+
log.Infof("Mapping host uid %d to container uid %d (size=%d)",
1033+
idMap.HostID, idMap.ContainerID, idMap.Size)
1034+
args = append(args,
1035+
strconv.Itoa(int(idMap.ContainerID)),
1036+
strconv.Itoa(int(idMap.HostID)),
1037+
strconv.Itoa(int(idMap.Size)),
1038+
)
1039+
}
1040+
1041+
out, err := exec.Command("newuidmap", args...).CombinedOutput()
1042+
log.Debugf("newuidmap: %#v\n%s", args, out)
1043+
if err != nil {
1044+
return nil, nil, fmt.Errorf("newuidmap failed: %w", err)
1045+
}
1046+
1047+
args = []string{strconv.Itoa(cmd.Process.Pid)}
1048+
for _, idMap := range spec.Linux.GIDMappings {
1049+
log.Infof("Mapping host uid %d to container uid %d (size=%d)",
1050+
idMap.HostID, idMap.ContainerID, idMap.Size)
1051+
args = append(args,
1052+
strconv.Itoa(int(idMap.ContainerID)),
1053+
strconv.Itoa(int(idMap.HostID)),
1054+
strconv.Itoa(int(idMap.Size)),
1055+
)
1056+
}
1057+
out, err = exec.Command("newgidmap", args...).CombinedOutput()
1058+
log.Debugf("newgidmap: %#v\n%s", args, out)
1059+
if err != nil {
1060+
return nil, nil, fmt.Errorf("newgidmap failed: %w", err)
1061+
}
1062+
}
1063+
9931064
log.Infof("Gofer started, PID: %d", cmd.Process.Pid)
9941065
c.GoferPid = cmd.Process.Pid
9951066
c.goferIsChild = true

runsc/sandbox/sandbox.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package sandbox
1818
import (
1919
"context"
2020
"encoding/json"
21+
"errors"
2122
"fmt"
2223
"io"
2324
"math"
@@ -536,6 +537,7 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn
536537
donations := donation.Agency{}
537538
defer donations.Close()
538539

540+
rootlessEUID := unix.Getuid() != 0
539541
//
540542
// These flags must come BEFORE the "boot" command in cmd.Args.
541543
//
@@ -722,13 +724,13 @@ func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyn
722724
if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
723725
log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid())
724726
log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
725-
} else if specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) {
727+
} else if rootlessEUID || specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) {
726728
log.Infof("Sandbox will be started in new user namespace")
727729
nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
728730
cmd.Args = append(cmd.Args, "--setup-root")
729731

730732
const nobody = 65534
731-
if conf.Rootless {
733+
if rootlessEUID || conf.Rootless {
732734
log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid())
733735
} else {
734736
// Map nobody in the new namespace to nobody in the parent namespace.
@@ -1419,6 +1421,10 @@ func (s *Sandbox) configureStdios(conf *config.Config, stdios []*os.File) error
14191421
for _, file := range stdios {
14201422
log.Debugf("Changing %q ownership to %d/%d", file.Name(), s.UID, s.GID)
14211423
if err := file.Chown(s.UID, s.GID); err != nil {
1424+
if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
1425+
log.Warningf("can't change an owner of %s: %s", file.Name(), err)
1426+
continue
1427+
}
14221428
return err
14231429
}
14241430
}

runsc/specutils/namespace.go

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ func setNS(fd, nsType uintptr) error {
119119
// that will restore the namespace to the original value.
120120
//
121121
// Preconditions: Must be called with os thread locked.
122-
func ApplyNS(ns specs.LinuxNamespace) (func(), error) {
122+
func ApplyNS(ns specs.LinuxNamespace) (func() error, error) {
123123
log.Infof("Applying namespace %v at path %q", ns.Type, ns.Path)
124124
newNS, err := os.Open(ns.Path)
125125
if err != nil {
@@ -140,27 +140,49 @@ func ApplyNS(ns specs.LinuxNamespace) (func(), error) {
140140
oldNS.Close()
141141
return nil, fmt.Errorf("error setting namespace of type %v and path %q: %v", ns.Type, ns.Path, err)
142142
}
143-
return func() {
143+
return func() error {
144144
log.Infof("Restoring namespace %v", ns.Type)
145145
defer oldNS.Close()
146146
if err := setNS(oldNS.Fd(), flag); err != nil {
147-
panic(fmt.Sprintf("error restoring namespace: of type %v: %v", ns.Type, err))
147+
return fmt.Errorf("error restoring namespace: of type %v: %v", ns.Type, err)
148148
}
149+
return nil
149150
}, nil
150151
}
151152

152153
// StartInNS joins or creates the given namespaces and calls cmd.Start before
153154
// restoring the namespaces to the original values.
154155
func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error {
155-
// We are about to setup namespaces, which requires the os thread being
156-
// locked so that Go doesn't change the thread out from under us.
157-
runtime.LockOSThread()
158-
defer runtime.UnlockOSThread()
156+
errChan := make(chan error)
157+
go func() {
158+
runtime.LockOSThread()
159+
defer runtime.UnlockOSThread()
160+
161+
rstFuncs, err := startInNS(cmd, nss)
162+
errChan <- err
163+
for _, rstFunc := range rstFuncs {
164+
err := rstFunc()
165+
if err == nil {
166+
continue
167+
}
168+
169+
// One or more namespaces have not been restored, but
170+
// we can't destroy the current system thread, because
171+
// a child process is executed with Pdeathsig.
172+
log.Debugf("Block the current system thread due to: %s", err)
173+
c := make(chan interface{})
174+
<-c
175+
}
176+
}()
177+
return <-errChan
178+
}
159179

180+
func startInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) ([]func() error, error) {
160181
if cmd.SysProcAttr == nil {
161182
cmd.SysProcAttr = &unix.SysProcAttr{}
162183
}
163184

185+
var deferFuncs []func() error
164186
for _, ns := range nss {
165187
if ns.Path == "" {
166188
// No path. Just set a flag to create a new namespace.
@@ -171,12 +193,12 @@ func StartInNS(cmd *exec.Cmd, nss []specs.LinuxNamespace) error {
171193
// before exiting.
172194
restoreNS, err := ApplyNS(ns)
173195
if err != nil {
174-
return err
196+
return deferFuncs, err
175197
}
176-
defer restoreNS()
198+
deferFuncs = append(deferFuncs, restoreNS)
177199
}
178200

179-
return cmd.Start()
201+
return deferFuncs, cmd.Start()
180202
}
181203

182204
// SetUIDGIDMappings sets the given uid/gid mappings from the spec on the cmd.

0 commit comments

Comments
 (0)