https://github.com/opencontainers/runc
tag v1.2.5
A container launch has three main phases, as the title suggests:
create
: parses and assembles the container's startup configuration and sets up the message channel to the child process;
init
: builds the container's runtime environment from that configuration, including the familiar namespaces, cgroups, seccomp, AppArmor, capabilities, etc.;
start
: notifies the init process to actually start the container.
runc create:
Running runc create spawns a background process for the command; we call this process the parent.
The parent process then runs runc init; we call the runc init process the child.
runc init:
The child prepares the runtime environment for the user process; during this phase parent and child communicate over a pipe.
Once the environment is ready, the child tells the parent to exit, and then blocks itself on exec.fifo.
Since the parent exits (i.e. runc create returns), the child becomes an orphan and is adopted by PID 1.
The child stays blocked on exec.fifo.
runc start:
Running runc start opens exec.fifo, which removes the child's block, and runc start exits.
With the block gone, the child continues execution.
The child replaces runc init with the user-defined command via exec, making itself the main process inside the container.
The container launch is complete.
Generating the container configuration
Launching the runc init process
// utils_linux.go
func startContainer(context *cli.Context, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
if err := revisePidFile(context); err != nil {
return -1, err
}
// Generate the container configuration:
// read the container's config.json and convert it into a spec struct
spec, err := setupSpec(context)
if err != nil {
return -1, err
}
// ...
// Build the rest of the container configuration from the spec
container, err := createContainer(context, id, spec)
if err != nil {
return -1, err
}
// ...
// Create the container.
// The runner is the core that loads the init process; everything up to this
// point assembled and validated configuration, which is now loaded so the init process can run.
r := &runner{
// whether to mark this process as a child subreaper, so orphaned descendants are reparented to it
enableSubreaper: !context.Bool("no-subreaper"),
shouldDestroy: !context.Bool("keep"),
container: container,
listenFDs: listenFDs,
notifySocket: notifySocket,
consoleSocket: context.String("console-socket"),
pidfdSocket: context.String("pidfd-socket"),
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
preserveFDs: context.Int("preserve-fds"),
action: action,
// CRIU (live-migration) options; empty for the create command
criuOpts: criuOpts,
// whether this is the container's init process
init: true,
}
return r.run(spec.Process)
}
// utils_linux.go
func createContainer(context *cli.Context, id string, spec *specs.Spec) (*libcontainer.Container, error) {
//whether to use a rootless (non-root) cgroup manager
rootlessCg, err := shouldUseRootlessCgroupManager(context)
if err != nil {
return nil, err
}
// Build the libcontainer config from the OCI spec
// [see CreateLibcontainerConfig below]
config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
CgroupName: id,
// whether to use the systemd cgroup driver; otherwise the plain cgroupfs driver is used
UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
//whether to skip pivot_root; usually only needed when the rootfs sits on a ramdisk
NoPivotRoot: context.Bool("no-pivot"),
NoNewKeyring: context.Bool("no-new-keyring"),
Spec: spec,
RootlessEUID: os.Geteuid() != 0,
//The EUID determines the process's access rights to system resources and normally equals the RUID; non-zero means a rootless launch.
RootlessCgroups: rootlessCg,
})
if err != nil {
return nil, err
}
root := context.GlobalString("root")
// Create the container
return libcontainer.Create(root, id, config)
}
Creating the container config mainly involves:
setting the working directory
setting the root directory
adding mounts and devices
binding cgroups
configuring OOM, SELinux, etc.
registering lifecycle hooks
// libcontainer/specconv/spec_linux.go
func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
//getwd returns the current working directory, i.e. the bundle; a relative spec.Root.Path below is resolved against it
cwd, err := getwd()
if err != nil {
return nil, err
}
spec := opts.Spec
if spec.Root == nil {
return nil, errors.New("root must be specified")
}
// the rootfs: config.json points at the rootfs folder in the current directory
rootfsPath := spec.Root.Path
if !filepath.IsAbs(rootfsPath) {
rootfsPath = filepath.Join(cwd, rootfsPath)
}
labels := []string{}
for k, v := range spec.Annotations {
labels = append(labels, k+"="+v)
}
// Assemble the incoming createOpts into the final config
config := &configs.Config{
Rootfs: rootfsPath,
NoPivotRoot: opts.NoPivotRoot,
Readonlyfs: spec.Root.Readonly,
Hostname: spec.Hostname,
Domainname: spec.Domainname,
Labels: append(labels, "bundle="+cwd),
NoNewKeyring: opts.NoNewKeyring,
RootlessEUID: opts.RootlessEUID,
RootlessCgroups: opts.RootlessCgroups,
}
// Mount directories per the spec, i.e. the mounts field of config.json,
// e.g. /proc, /dev, /dev/pts, /dev/shm, /dev/mqueue, /sys, /sys/fs/cgroup, etc.
for _, m := range spec.Mounts {
cm, err := createLibcontainerMount(cwd, m)
if err != nil {
return nil, fmt.Errorf("invalid mount %+v: %w", m, err)
}
config.Mounts = append(config.Mounts, cm)
}
defaultDevs, err := createDevices(spec, config)
if err != nil {
return nil, err
}
// Build the cgroup resource-control config: pass in the default devices and get the cgroup config back
/* The controllable resource subsystems:
var legacySubsystems = []subsystem{
&fs.CpusetGroup{},
&fs.DevicesGroup{},
&fs.MemoryGroup{},
&fs.CpuGroup{},
&fs.CpuacctGroup{},
&fs.PidsGroup{},
&fs.BlkioGroup{},
&fs.HugetlbGroup{},
&fs.PerfEventGroup{},
&fs.FreezerGroup{},
&fs.NetPrioGroup{},
&fs.NetClsGroup{},
&fs.NameGroup{GroupName: "name=systemd"},
&fs.RdmaGroup{},
&fs.NameGroup{GroupName: "misc"},
}
*/
c, err := CreateCgroupConfig(opts, defaultDevs)
if err != nil {
return nil, err
}
config.Cgroups = c
// ...
// ...
if spec.Process != nil {
// set the OOM score adjustment
config.OomScoreAdj = spec.Process.OOMScoreAdj
// privileges
config.NoNewPrivileges = spec.Process.NoNewPrivileges
// umask
config.Umask = spec.Process.User.Umask
// selinux
config.ProcessLabel = spec.Process.SelinuxLabel
// grant the container a subset of root's capabilities
if spec.Process.Capabilities != nil {
config.Capabilities = &configs.Capabilities{
Bounding: spec.Process.Capabilities.Bounding,
Effective: spec.Process.Capabilities.Effective,
Permitted: spec.Process.Capabilities.Permitted,
Inheritable: spec.Process.Capabilities.Inheritable,
Ambient: spec.Process.Capabilities.Ambient,
}
}
if spec.Process.Scheduler != nil {
s := *spec.Process.Scheduler
config.Scheduler = &s
}
if spec.Process.IOPriority != nil {
ioPriority := *spec.Process.IOPriority
config.IOPriority = &ioPriority
}
}
// container lifecycle hooks
createHooks(spec, config)
config.Version = specs.Version
return config, nil
}
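To make the conversion above concrete, here is a minimal, hedged sketch of driving the same entry point outside the runc CLI. It assumes runc v1.2.x's specconv package as shown above; specconv.Example() is the helper runc itself uses for `runc spec` to produce a populated example OCI spec.
// sketch: spec_to_config.go — not runc code, just exercising the API above
package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/specconv"
)

func main() {
	// Example() returns a populated example OCI spec (what `runc spec` writes out).
	spec := specconv.Example()
	config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
		CgroupName:       "demo",
		UseSystemdCgroup: false,
		Spec:             spec,
		RootlessEUID:     false,
		RootlessCgroups:  false,
	})
	if err != nil {
		panic(err)
	}
	// The relative "rootfs" from the spec has been resolved against the bundle dir.
	fmt.Println(config.Rootfs, config.Hostname)
}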
Setting up file descriptors
Marking the current process as a subreaper
Setting up IO (foreground vs. detached launches route IO differently)
Starting the container
// utils_linux.go
func (r *runner) run(config *specs.Process) (int, error) {
// ... (elided) the libcontainer process, rootuid/rootgid, and the detach flag
// are prepared here, roughly: process, err := newProcess(*config)
// Below is the ExtraFiles mechanism mentioned earlier: the fds are numbered
// upward from 3 in the child. ExtraFiles is commonly used to give the
// container extra input/output descriptors, e.g. exporting container logs to a file.
if len(r.listenFDs) > 0 {
process.Env = append(process.Env, "LISTEN_FDS="+strconv.Itoa(len(r.listenFDs)), "LISTEN_PID=1")
process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
}
baseFd := 3 + len(process.ExtraFiles)
procSelfFd, closer := utils.ProcThreadSelf("fd/")
defer closer()
for i := baseFd; i < baseFd+r.preserveFDs; i++ {
_, err = os.Stat(filepath.Join(procSelfFd, strconv.Itoa(i)))
if err != nil {
return -1, fmt.Errorf("unable to stat preserved-fd %d (of %d): %w", i-baseFd, r.preserveFDs, err)
}
process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i)))
}
// With the subreaper option enabled, the current runc process marks itself
// as a child subreaper (PR_SET_CHILD_SUBREAPER), so orphaned descendants of
// the container are reparented to it instead of to PID 1.
handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
// Set up the process IO;
// console (foreground) and detached launches route input/output differently
tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket)
if err != nil {
return -1, err
}
defer tty.Close()
switch r.action {
case CT_ACT_CREATE:
// These actions end up doing broadly similar work; later articles will cover
// the others in detail. This one focuses on the create action.
err = r.container.Start(process)
case CT_ACT_RESTORE:
err = r.container.Restore(process, r.criuOpts)
case CT_ACT_RUN:
err = r.container.Run(process)
default:
panic("Unknown action")
}
// ...
}
process.Init
is a boolean flag marking whether the current process is the container's init process. Its effect:
1. When process.Init == true, the process is the first process of the container (the init process). It:
creates the new namespaces (PID, NET, MNT, etc.);
initializes the container (rootfs mounts, cgroup configuration);
acts as the container's main process, normally PID 1 inside the container;
stops the whole container if it exits.
2. When process.Init == false, the process is a new process attached to an existing container, typically for runc exec:
it does not create new namespaces, but enters the container's existing ones;
it does not affect the container's lifecycle: the container keeps running even if it exits;
it is handled by newSetnsProcess, which runs attached processes for runc exec.
// libcontainer/container_linux.go
func (c *Container) start(process *Process) (retErr error) {
if c.config.Cgroups.Resources.SkipDevices {
return errors.New("can't start container with SkipDevices set")
}
// creating the container's main (init) process
if process.Init {
if c.initProcessStartTime != 0 {
return errors.New("container already has init process")
}
// Create exec.fifo, used later to block the container process until an external start.
if err := c.createExecFifo(); err != nil {
return err
}
defer func() {
if retErr != nil {
// If start fails, remove the exec.fifo file; from this you can already
// guess that exec.fifo is tied to how the start step works
c.deleteExecFifo()
}
}()
}
// Create the container process (the docker run case) or attach a process to an existing container (the docker exec case)
parent, err := c.newParentProcess(process)
//...
// launch the process
if err := parent.start(); err != nil {
return fmt.Errorf("unable to start container process: %w", err)
}
}
c.createExecFifo()
What it does
When runc create sets up a container's init process:
createExecFifo() creates a FIFO file under the container state directory, typically:
/run/containerd/io.containerd.runtime.v2.task/
What exec.fifo is used for
Scenario 1: the normal create/start handshake
After runc init has prepared the environment, it opens exec.fifo for writing and blocks (opening a FIFO blocks until the other end is opened too).
runc start opens exec.fifo for reading and consumes the byte written by init.
The init process is thereby unblocked and execs the user-defined command.
Scenario 2: preventing start races
If the container process were exec'd before the namespace switch was fully complete, it could:
start while still in host namespaces rather than the container's environment;
end up in the wrong cgroups or see the wrong filesystem.
Interaction flow
Assume the container's entrypoint is /bin/bash (see the sketch below):
runc create -> creates exec.fifo
runc init -> prepares the environment, then blocks on exec.fifo
runc start -> opens and reads exec.fifo
runc init -> unblocked, execs /bin/bash
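The blocking behavior that makes this handshake work is plain FIFO semantics: open(2) on a FIFO blocks until both ends are open. Below is a minimal standalone sketch (not runc code) of the same pattern — the "init" side blocks opening the FIFO for writing until the "start" side opens it for reading.
// sketch: fifo_handshake.go — illustrates exec.fifo-style blocking, not runc code
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"syscall"
	"time"
)

func main() {
	dir, _ := os.MkdirTemp("", "fifo-demo")
	defer os.RemoveAll(dir)
	path := filepath.Join(dir, "exec.fifo")
	if err := syscall.Mkfifo(path, 0o622); err != nil {
		panic(err)
	}
	go func() {
		// "runc init" side: opening write-only blocks until a reader shows up.
		f, err := os.OpenFile(path, os.O_WRONLY, 0)
		if err != nil {
			panic(err)
		}
		f.Write([]byte{0}) // runc init writes a single byte before exec'ing
		f.Close()
		fmt.Println("init side unblocked; would exec the user process now")
	}()
	time.Sleep(500 * time.Millisecond) // stand-in for the gap between create and start
	// "runc start" side: opening for reading releases the blocked writer.
	f, err := os.OpenFile(path, os.O_RDONLY, 0)
	if err != nil {
		panic(err)
	}
	buf := make([]byte, 1)
	f.Read(buf)
	f.Close()
	time.Sleep(100 * time.Millisecond) // let the goroutine print
}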
newParentProcess has two branches, selected by whether p.Init is set.
newInitProcess:
newInitProcess creates a brand-new container process, i.e. the container's first process (the init process). It is responsible for:
creating new Linux namespaces (PID, NET, IPC, MNT, USER, etc.);
applying cgroup limits;
executing the container's init process (usually a shell or the entrypoint);
mounting filesystems and configuring the rootfs.
When it applies
Launching a new container process.
Creating brand-new namespaces and running as the container's first process.
newSetnsProcess:
newSetnsProcess enters the namespaces of an existing container and runs a new process there, mainly for exec. It:
joins the existing container's namespaces (setns());
runs an extra process inside the container (e.g. docker exec);
creates no new namespaces, reusing the existing ones.
When it applies
Running runc exec to start a new process inside a running container.
Entering the existing container's namespaces, reusing its PID, NET, MNT, USER, etc.
Summary:
newInitProcess
→ runc run
(initializes the process that creates a new container)
newSetnsProcess
→ runc exec
(initializes a process that runs inside an existing container)
// libcontainer/container_linux.go
func newProcessComm() (*processComm, error) {
var (
comm processComm
err error
)
// Create the communication pipes between the parent and the init child,
// since the container is about to be created from this process:
// initSockParent, initSockChild
comm.initSockParent, comm.initSockChild, err = utils.NewSockPair("init")
if err != nil {
return nil, fmt.Errorf("unable to create init pipe: %w", err)
}
comm.syncSockParent, comm.syncSockChild, err = newSyncSockpair("sync")
if err != nil {
return nil, fmt.Errorf("unable to create sync pipe: %w", err)
}
comm.logPipeParent, comm.logPipeChild, err = os.Pipe()
if err != nil {
return nil, fmt.Errorf("unable to create log pipe: %w", err)
}
return &comm, nil
}
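For reference, a socket pair like the init/sync pipes above boils down to a single socketpair(2) call; runc wraps this in utils.NewSockPair. A minimal sketch (assuming golang.org/x/sys/unix; not the exact runc implementation):
// sketch: sockpair.go — what an init/sync pipe amounts to, not runc code
package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

func newSockPair(name string) (parent, child *os.File, err error) {
	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
	if err != nil {
		return nil, nil, err
	}
	return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
}

func main() {
	p, c, err := newSockPair("init")
	if err != nil {
		panic(err)
	}
	defer p.Close()
	defer c.Close()
	go c.Write([]byte("ping")) // unlike os.Pipe, both ends are read/write
	buf := make([]byte, 4)
	p.Read(buf)
	fmt.Println(string(buf))
}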
func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
comm, err := newProcessComm()
if err != nil {
return nil, err
}
// ...
//the assembled cmd is exactly `runc init`
cmd := exec.Command(exePath, "init")
cmd.Args[0] = os.Args[0]
cmd.Stdin = p.Stdin
cmd.Stdout = p.Stdout
cmd.Stderr = p.Stderr
cmd.Dir = c.config.Rootfs
if cmd.SysProcAttr == nil {
cmd.SysProcAttr = &unix.SysProcAttr{}
}
cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS"))
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
if p.ConsoleSocket != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
)
}
// The child created to run `runc init` will use initSockChild and
// syncSockChild as its communication channels back to the parent
cmd.ExtraFiles = append(cmd.ExtraFiles, comm.initSockChild)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
)
cmd.ExtraFiles = append(cmd.ExtraFiles, comm.syncSockChild.File())
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_SYNCPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
)
// ...
// Init specifies whether the process is the first process in the container.
if p.Init {
// called for docker run
//...
return c.newInitProcess(p, cmd, comm)
}
// called for docker exec:
// attach the process to the target namespaces; this is the exec path, i.e. a new process joining the container
return c.newSetnsProcess(p, cmd, comm)
}
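The fd arithmetic above (stdioFdCount + len(cmd.ExtraFiles) - 1) follows from how the Go runtime maps ExtraFiles into a child: entry i becomes fd 3+i. A small self-contained sketch of the same handoff convention (_DEMO_PIPE is a made-up variable standing in for _LIBCONTAINER_INITPIPE):
// sketch: extrafiles_fd.go — fd numbering for ExtraFiles, not runc code
package main

import (
	"os"
	"os/exec"
	"strconv"
)

const stdioFdCount = 3 // stdin, stdout, stderr

func main() {
	if fdStr := os.Getenv("_DEMO_PIPE"); fdStr != "" {
		// child: rebuild the pipe from the advertised fd number
		fd, _ := strconv.Atoi(fdStr)
		pipe := os.NewFile(uintptr(fd), "pipe")
		pipe.Write([]byte("hello from child\n"))
		return
	}
	r, w, _ := os.Pipe()
	cmd := exec.Command(os.Args[0]) // re-exec ourselves, as runc re-execs "runc init"
	cmd.ExtraFiles = append(cmd.ExtraFiles, w)
	cmd.Env = append(os.Environ(),
		"_DEMO_PIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) // == 3
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	w.Close() // keep only the child's copy of the write end
	buf := make([]byte, 64)
	n, _ := r.Read(buf)
	os.Stdout.Write(buf[:n])
	cmd.Wait()
}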
Compare newInitProcess with newSetnsProcess to see how they differ:
func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
// the namespaces the container can isolate, from the config (ipc, pid, user, ...)
for _, ns := range c.config.Namespaces {
if ns.Path != "" {
nsMaps[ns.Type] = ns.Path
}
}
// pack the namespaces into bootstrapData
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
if err != nil {
return nil, err
}
init := &initProcess{
cmd: cmd,
comm: comm,
// the cgroup resource-limit information to pass down
manager: c.cgroupManager,
intelRdtManager: c.intelRdtManager,
config: c.newInitConfig(p),
container: c,
process: p,
// the bootstrapData passed down is the encoded namespace information
bootstrapData: data,
}
//store the init process struct in the container's initProcess field.
c.initProcess = init
return init, nil
}
This function creates an attached process for a container, e.g. the extra process that docker exec runs inside it.
As shown below, creating an attached process requires the namespace handles and related state of the existing init process.
func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*setnsProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
// fetch the namespaces and related state of the container's existing init process
state := c.currentState()
data, err := c.bootstrapData(0, state.NamespacePaths)
if err != nil {
return nil, err
}
proc := &setnsProcess{
cmd: cmd,
cgroupPaths: state.CgroupPaths,
rootlessCgroups: c.config.RootlessCgroups,
intelRdtPath: state.IntelRdtPath,
comm: comm,
manager: c.cgroupManager,
config: c.newInitConfig(p),
process: p,
bootstrapData: data,
initProcessPid: state.InitProcessPid,
}
//...
}
func (c *Container) currentState() *State { // ...
if c.initProcess != nil {
pid = c.initProcess.pid()
startTime, _ = c.initProcess.startTime()
externalDescriptors = c.initProcess.externalDescriptors()
}
// ...
// look up the namespace file-handle paths from the existing init process's pid
if pid > 0 {
for _, ns := range c.config.Namespaces {
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
}
for _, nsType := range configs.NamespaceTypes() {
if !configs.IsNamespaceSupported(nsType) {
continue
}
if _, ok := state.NamespacePaths[nsType]; !ok {
ns := configs.Namespace{Type: nsType}
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
}
}
}
return state
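The namespace paths collected here are just the /proc links: configs.Namespace.GetPath(pid) resolves to /proc/<pid>/ns/<type>. A quick sketch that inspects them for the current process:
// sketch: nspaths.go — what the recorded namespace paths look like, not runc code
package main

import (
	"fmt"
	"os"
)

func main() {
	pid := os.Getpid()
	for _, ns := range []string{"pid", "net", "mnt", "ipc", "uts", "user"} {
		p := fmt.Sprintf("/proc/%d/ns/%s", pid, ns)
		// each entry is a symlink like "net:[4026531840]" identifying the namespace
		target, err := os.Readlink(p)
		if err != nil {
			continue
		}
		fmt.Println(p, "->", target)
	}
}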
In a container runtime such as runc, the following sync stages mark different steps of the container lifecycle:
The init process inside the container
procMount: this stage handles mounts for the container. In runc 1.2 the sync message is procMountPlease: the child asks the parent, which still lives in the host mount namespace (and may hold privileges the child has dropped), to open a mount source fd on its behalf and pass it back so the child can complete the mount.
procSeccomp: this stage involves seccomp (Secure Computing Mode), the Linux mechanism that restricts a container's processes to a small set of system calls for extra security. After installing a seccomp filter with a notification listener, the child hands its seccomp notify fd out through the parent so it can be forwarded to a seccomp agent.
procReady: sent once the init process has finished preparing the runtime environment and is ready to exec the entrypoint; this is the last stage before the container counts as created. The parent then finalizes rlimits and state and replies with procRun.
procHooks: an extension point that lets users run custom scripts at specific container lifecycle events (create, start, stop, and so on). The child pauses while the parent executes the configured hooks, then the parent replies with procHooksDone.
// process_linux.go
func (p *initProcess) start() (retErr error) {
defer p.comm.closeParent()
// start the child process that runs `runc init`
err := p.cmd.Start()
p.process.ops = p
// close the child-side of the pipes (controlled by child)
p.comm.closeChild()
if err != nil {
p.process.ops = nil
return fmt.Errorf("unable to start init: %w", err)
}
waitInit := initWaiter(p.comm.initSockParent)
// ...
// Confine the child to its cgroups so no process can escape the limits
// Do this before syncing with child so that no children can escape the
// cgroup. We don't need to worry about not doing this and not being root
// because we'd be using the rootless cgroup manager in that case.
if err := p.manager.Apply(p.pid()); err != nil {
return fmt.Errorf("unable to apply cgroup configuration: %w", err)
}
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Apply(p.pid()); err != nil {
return fmt.Errorf("unable to apply Intel RDT configuration: %w", err)
}
}
//write bootstrapData to the init pipe; on receipt, runc init sets up its own namespaces etc.
if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil {
return fmt.Errorf("can't copy bootstrap data to pipe: %w", err)
}
err = <-waitInit
if err != nil {
return err
}
//get the child's pid via the init pipe
childPid, err := p.getChildPid()
if err != nil {
return fmt.Errorf("can't get final child's PID from pipe: %w", err)
}
// record the child's file-descriptor paths
// Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up.
fds, err := getPipeFds(childPid)
if err != nil {
return fmt.Errorf("error getting pipe fds for pid %d: %w", childPid, err)
}
p.setExternalDescriptors(fds)
// Wait for the nsexec stages to run. Because the Go runtime cannot safely
// fork/setns (it is multithreaded from the start), this part is written in C:
// it reports the pids over the init pipe, consumes the bootstrapData sent
// above to set up the namespaces, and only then does the Go part of
// runc init run.
// Wait for our first child to exit
if err := p.waitForChildExit(childPid); err != nil {
return fmt.Errorf("error waiting for our first child to exit: %w", err)
}
var mountRequest mountSourceRequestFn
if !p.container.config.RootlessEUID {
request, cancel, err := p.goCreateMountSources(context.Background())
if err != nil {
return fmt.Errorf("error spawning mount remapping thread: %w", err)
}
defer cancel()
mountRequest = request
}
if err := p.createNetworkInterfaces(); err != nil {
return fmt.Errorf("error creating network interfaces: %w", err)
}
if err := p.updateSpecState(); err != nil {
return fmt.Errorf("error updating spec state: %w", err)
}
// send the init configuration to the init process
if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil {
return fmt.Errorf("error sending config to init process: %w", err)
}
// Synchronize state with the init process;
// parseSync loops until the socket is closed
var seenProcReady bool
ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error {
switch sync.Type {
// procMount (procMountPlease): the child asks the parent, still in the
// host mount namespace, to open a mount source fd on its behalf and pass
// it back so the child can complete the mount.
case procMountPlease:
//...
// seccomp restricts the container's processes to a small set of syscalls.
// Here the child's seccomp notify fd is received so it can be forwarded on.
case procSeccomp:
// procReady is sent once the init process has finished preparing the runtime
// environment; it is the last stage before the container counts as created.
case procReady:
seenProcReady = true
// rlimits cap the amount of resources a process may consume
// Set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return fmt.Errorf("error setting rlimits for ready process: %w", err)
}
// record the container start time
// generate a timestamp indicating when the container was started
p.container.created = time.Now().UTC()
p.container.state = &createdState{
c: p.container,
}
// update the container's stored state
state, uerr := p.container.updateState(p)
if uerr != nil {
return fmt.Errorf("unable to store init state: %w", uerr)
}
p.container.initProcessStartTime = state.InitProcessStartTime
// Sync with child.
if err := writeSync(p.comm.syncSockParent, procRun); err != nil {
return err
}
//procHooks is an extension point: user-provided hooks run at specific
// lifecycle events (create, start, stop, ...) for extra processing.
case procHooks:
// Sync with child.
if err := writeSync(p.comm.syncSockParent, procHooksDone); err != nil {
return err
}
default:
return errors.New("invalid JSON payload from child")
}
return nil
})
// close the write side of the sync pipe to init
if err := p.comm.syncSockParent.Shutdown(unix.SHUT_WR); err != nil && ierr == nil {
return err
}
if !seenProcReady && ierr == nil {
ierr = errors.New("procReady not received")
}
if ierr != nil {
return fmt.Errorf("error during container init: %w", ierr)
}
return nil
}
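The parseSync/writeSync exchange above is just JSON messages over the sync socket pair. A toy sketch of the procReady → procRun leg (the message shape is assumed for illustration; runc's real syncT carries more fields):
// sketch: syncproto.go — toy version of the sync handshake, not runc code
package main

import (
	"encoding/json"
	"fmt"
	"net"
)

type syncMsg struct {
	Type string `json:"type"`
}

func main() {
	parent, child := net.Pipe() // stand-in for the sync socketpair
	go func() {
		enc, dec := json.NewEncoder(child), json.NewDecoder(child)
		enc.Encode(syncMsg{Type: "procReady"}) // child: environment is ready
		var reply syncMsg
		dec.Decode(&reply) // child waits for procRun before exec'ing
		fmt.Println("child got:", reply.Type)
		child.Close()
	}()
	enc, dec := json.NewEncoder(parent), json.NewDecoder(parent)
	var msg syncMsg
	for dec.Decode(&msg) == nil { // parent loops until the child closes its end
		if msg.Type == "procReady" {
			enc.Encode(syncMsg{Type: "procRun"})
		}
	}
	parent.Close()
}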
runc init is the command executed in the child process spawned during runc create; that process becomes the container's init process or an attached process.
# runc/init.go
package main
import (
"os"
"github.com/opencontainers/runc/libcontainer"
// this import is crucial: it is the foundation of runc init's startup
_ "github.com/opencontainers/runc/libcontainer/nsenter"
)
func init() {
if len(os.Args) > 1 && os.Args[1] == "init" {
// This is the golang entry point for runc init, executed
// before main() but after libcontainer/nsenter's nsexec().
libcontainer.Init()
}
}
nsenter is essentially a wrapper around the C function nsexec.
This code exploits GCC's constructor attribute: init() runs before runtime.main() (not merely before main.main()), which guarantees the process is still single-threaded at that point. This matters because Linux does not allow a multithreaded process to join a user namespace via setns.
// libcontainer/nsenter/nsenter.go
//go:build linux && !gccgo
// +build linux,!gccgo
package nsenter
/*
#cgo CFLAGS: -Wall
extern void nsexec();
void __attribute__((constructor)) init(void) {
nsexec();
}
*/
import "C"
The grandchild-process design used by nsexec is also motivated by several concerns:
Isolation of command execution: running the user command in a grandchild ensures it executes in the new namespaces without inheriting the parent's privileges and resources, avoiding privilege leaks and resource conflicts.
A clear process hierarchy: the parent acts as the manager, the child as the intermediate process inside the new namespaces, and the grandchild as the process that actually runs the user command, which keeps process relationships explicit and easy to track.
Control over execution: the earlier stages wait on the grandchild over the sync pipes, so exit status can be collected and follow-up work (output collection, error handling) can happen after each stage completes.
Error handling and cleanup: if execution fails, the grandchild can exit promptly with an error status while the earlier stages clean up and reclaim resources, keeping the system stable.
Process synchronization flags. The constants below drive the handshake between the stages:
SYNC_USERMAP_PLS: a request related to user namespaces, which isolate user and group IDs for extra security. The stage-1 child asks stage-0 to write its uid_map/gid_map, since after unsharing its user namespace it no longer has permission to do so itself.
SYNC_RECVPID_PLS: a request to receive a process ID. Stage-1 forwards the PID of the stage-2 grandchild to stage-0, which passes it back to the runc process outside so it can manage the real init process.
SYNC_TIMEOFFSETS_PLS: a request related to time-namespace offsets. With time namespaces, the container's monotonic/boottime clocks can be offset from the host's; this message asks stage-0 to write the configured offsets for the child.
SYNC_CHILD_FINISH: signals that a child stage has finished its work and is about to exit, so the managing stage can proceed accordingly.
//libcontainer/nsenter/nsexec.c
void nsexec(void)
{
int pipenum;
jmp_buf env;
int sync_child_pipe[2], sync_grandchild_pipe[2];
struct nlconfig_t config = { 0 };
/*
* Setup a pipe to send logs to the parent. This should happen
* first, because bail will use that pipe.
*/
setup_logpipe();
/*
* Get the init pipe fd from the environment. The init pipe is used to
* read the bootstrap data and tell the parent what the new pids are
* after the setup is done.
*/
// read the child-pipe fd number from the _LIBCONTAINER_INITPIPE environment variable
pipenum = getenv_int("_LIBCONTAINER_INITPIPE");
if (pipenum < 0) {
// A normally launched runc has no such environment variable,
// so we return here and the ordinary Go program takes over
/* We are not a runc init. Just return to go runtime. */
return;
}
/*
* Inform the parent we're past initial setup.
* For the other side of this, see initWaiter.
*/
if (write(pipenum, "", 1) != 1)
bail("could not inform the parent we are past initial setup");
write_log(DEBUG, "=> nsexec container setup");
// read the namespace config from the child pipe
/* Parse all of the netlink configuration. */
nl_parse(pipenum, &config);
/* Set oom_score_adj. This has to be done before !dumpable because
* /proc/self/oom_score_adj is not writeable unless you're an privileged
* user (if !dumpable is set). All children inherit their parent's
* oom_score_adj value on fork(2) so this will always be propagated
* properly.
*/
// Set the oom score; this can only be done while still privileged, so it has to happen here
update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);
/*
* Make the process non-dumpable, to avoid various race conditions that
* could cause processes in namespaces we're joining to access host
* resources (or potentially execute code).
*
* However, if the number of namespaces we are joining is 0, we are not
* going to be switching to a different security context. Thus setting
* ourselves to be non-dumpable only breaks things (like rootless
* containers), which is the recommendation from the kernel folks.
*/
if (config.namespaces) {
write_log(DEBUG, "set process as non-dumpable");
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
bail("failed to set process as non-dumpable");
}
// create the pipe used to talk to the child
/* Pipe so we can tell the child when we've finished setting up. */
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
bail("failed to setup sync pipe between parent and child");
/*
* We need a new socketpair to sync with grandchild so we don't have
* race condition with child.
*/
// create the pipe used to talk to the grandchild
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)
bail("failed to setup sync pipe between parent and grandchild");
/* TODO: Currently we aren't dealing with child deaths properly. */
/*
* Okay, so this is quite annoying.
*
* In order for this unsharing code to be more extensible we need to split
* up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
* would be if we did clone(CLONE_NEWUSER) and the other namespaces
* separately, but because of SELinux issues we cannot really do that. But
* we cannot just dump the namespace flags into clone(...) because several
* usecases (such as rootless containers) require more granularity around
* the namespace setup. In addition, some older kernels had issues where
* CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
* handle this while also dealing with SELinux so we choose SELinux support
* over broken kernel support).
*
* However, if we unshare(2) the user namespace *before* we clone(2), then
* all hell breaks loose.
*
* The parent no longer has permissions to do many things (unshare(2) drops
* all capabilities in your old namespace), and the container cannot be set
* up to have more than one {uid,gid} mapping. This is obviously less than
* ideal. In order to fix this, we have to first clone(2) and then unshare.
*
* Unfortunately, it's not as simple as that. We have to fork to enter the
* PID namespace (the PID namespace only applies to children). Since we'll
* have to double-fork, this clone_parent() call won't be able to get the
* PID of the _actual_ init process (without doing more synchronisation than
* I can deal with at the moment). So we'll just get the parent to send it
* for us, the only job of this process is to update
* /proc/pid/{setgroups,uid_map,gid_map}.
*
* And as a result of the above, we also need to setns(2) in the first child
* because if we join a PID namespace in the topmost parent then our child
* will be in that namespace (and it will not be able to give us a PID value
* that makes sense without resorting to sending things with cmsg).
*
* This also deals with an older issue caused by dumping cloneflags into
* clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
* we have to unshare(2) before clone(2) in order to do this. This was fixed
* in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
* introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
* aware, the last mainline kernel which had this bug was Linux 3.12.
* However, we cannot comment on which kernels the broken patch was
* backported to.
*
* -- Aleksa "what has my life come to?" Sarai
*/
// setjmp saves the current execution context, used to jump between the cloned processes
// on the first pass setjmp returns 0, which corresponds to STAGE_PARENT
switch (setjmp(env)) {
/*
* Stage 0: We're in the parent. Our job is just to create a new child
* (stage 1: STAGE_CHILD) process and write its uid_map and
* gid_map. That process will go on to create a new process, then
* it will send us its PID which we will send to the bootstrap
* process.
*/
case STAGE_PARENT:{
int len;
pid_t stage1_pid = -1, stage2_pid = -1;
bool stage1_complete, stage2_complete;
/* For debugging. */
current_stage = STAGE_PARENT;
// set the current process name, for debugging
prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
write_log(DEBUG, "~> nsexec stage-0");
/* Start the process of getting a container. */
write_log(DEBUG, "spawn stage-1");
// clone_parent creates a child process identical to the current one.
// Inside clone_parent, longjmp() jumps to the context saved in env,
// making setjmp return STAGE_CHILD,
// so the child executes the STAGE_CHILD branch of this switch.
// The current runc init and the child runc init synchronize over the
// sync_child_pipe created above.
stage1_pid = clone_parent(&env, STAGE_CHILD);
if (stage1_pid < 0)
bail("unable to spawn stage-1");
syncfd = sync_child_pipe[1];
if (close(sync_child_pipe[0]) < 0)
bail("failed to close sync_child_pipe[0] fd");
/*
* State machine for synchronisation with the children. We only
* return once both the child and grandchild are ready.
*/
write_log(DEBUG, "-> stage-1 synchronisation loop");
stage1_complete = false;
while (!stage1_complete) {
enum sync_t s;
// loop, reading messages from the child over sync_child_pipe
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with stage-1: next state");
switch (s) {
case SYNC_USERMAP_PLS:
write_log(DEBUG, "stage-1 requested userns mappings");
/*
* Enable setgroups(2) if we've been asked to. But we also
* have to explicitly disable setgroups(2) if we're
* creating a rootless container for single-entry mapping.
* i.e. config.is_setgroup == false.
* (this is required since Linux 3.19).
*
* For rootless multi-entry mapping, config.is_setgroup shall be true and
* newuidmap/newgidmap shall be used.
*/
if (config.is_rootless_euid && !config.is_setgroup)
update_setgroups(stage1_pid, SETGROUPS_DENY);
// The user map is written here by the parent because once the child has
// modified its own user namespace it no longer has permission to write it.
// This is mainly for privilege control; see rootless containers and user mappings.
/* Set up mappings. */
update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len);
update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len);
s = SYNC_USERMAP_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
sane_kill(stage1_pid, SIGKILL);
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)");
}
break;
case SYNC_RECVPID_PLS:
write_log(DEBUG, "stage-1 requested pid to be forwarded");
// receive the pid of the grandchild (also a runc init)
/* Get the stage-2 pid. */
if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
sane_kill(stage1_pid, SIGKILL);
bail("failed to sync with stage-1: read(stage2_pid)");
}
// reply to the child with SYNC_RECVPID_ACK to signal completion
/* Send ACK. */
s = SYNC_RECVPID_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
sane_kill(stage1_pid, SIGKILL);
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)");
}
/*
* Send both the stage-1 and stage-2 pids back to runc.
* runc needs the stage-2 to continue process management,
* but because stage-1 was spawned with CLONE_PARENT we
* cannot reap it within stage-0 and thus we need to ask
* runc to reap the zombie for us.
*/
write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
stage1_pid, stage2_pid);
// Write the child and grandchild PIDs back through the child pipe passed in
// from outside, letting the outer runc take over those PIDs.
// clone_parent was called with CLONE_PARENT, so both child and grandchild
// are parented to the outer runc,
// which means the current process cannot reap them itself.
len =
dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid,
stage2_pid);
if (len < 0) {
sane_kill(stage1_pid, SIGKILL);
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with runc: write(pid-JSON)");
}
break;
case SYNC_TIMEOFFSETS_PLS:
//A time-offsets synchronisation request: with time namespaces, the
//container's monotonic/boottime clocks can be offset from the host's.
//SYNC_TIMEOFFSETS_PLS asks stage-0 to write the configured offsets
// for the child.
write_log(DEBUG, "stage-1 requested timens offsets to be configured");
update_timens_offsets(stage1_pid, config.timensoffset, config.timensoffset_len);
s = SYNC_TIMEOFFSETS_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
sane_kill(stage1_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_TIMEOFFSETS_ACK)");
}
break;
case SYNC_CHILD_FINISH:
// The child signals that it has finished its synchronisation work and
// is about to exit, so the managing process (runc)
// can proceed with the related bookkeeping.
write_log(DEBUG, "stage-1 complete");
stage1_complete = true;
break;
default:
bail("unexpected sync value: %u", s);
}
}
write_log(DEBUG, "<- stage-1 synchronisation loop");
/* Now sync with grandchild. */
syncfd = sync_grandchild_pipe[1];
if (close(sync_grandchild_pipe[0]) < 0)
bail("failed to close sync_grandchild_pipe[0] fd");
write_log(DEBUG, "-> stage-2 synchronisation loop");
stage2_complete = false;
while (!stage2_complete) {
// wait for the grandchild to finish its work
enum sync_t s;
write_log(DEBUG, "signalling stage-2 to run");
s = SYNC_GRANDCHILD;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_GRANDCHILD)");
}
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with child: next state");
switch (s) {
case SYNC_CHILD_FINISH:
write_log(DEBUG, "stage-2 complete");
stage2_complete = true;
break;
default:
bail("unexpected sync value: %u", s);
}
}
write_log(DEBUG, "<- stage-2 synchronisation loop");
write_log(DEBUG, "<~ nsexec stage-0");
exit(0);
}
break;
/*
* Stage 1: We're in the first child process. Our job is to join any
* provided namespaces in the netlink payload and unshare all of
* the requested namespaces. If we've been asked to CLONE_NEWUSER,
* we will ask our parent (stage 0) to set up our user mappings
* for us. Then, we create a new child (stage 2: STAGE_INIT) for
* PID namespace. We then send the child's PID to our parent
* (stage 0).
*/
case STAGE_CHILD:{
pid_t stage2_pid = -1;
enum sync_t s;
/* For debugging. */
current_stage = STAGE_CHILD;
/* We're in a child and thus need to tell the parent if we die. */
syncfd = sync_child_pipe[0];
if (close(sync_child_pipe[1]) < 0)
bail("failed to close sync_child_pipe[1] fd");
/* For debugging. */
prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
write_log(DEBUG, "~> nsexec stage-1");
/*
* We need to setns first. We cannot do this earlier (in stage 0)
* because of the fact that we forked to get here (the PID of
* [stage 2: STAGE_INIT]) would be meaningless). We could send it
* using cmsg(3) but that's just annoying.
*/
// join the existing namespaces via setns
if (config.namespaces)
join_namespaces(config.namespaces);
/*
* Deal with user namespaces first. They are quite special, as they
* affect our ability to unshare other namespaces and are used as
* context for privilege checks.
*
* We don't unshare all namespaces in one go. The reason for this
* is that, while the kernel documentation may claim otherwise,
* there are certain cases where unsharing all namespaces at once
* will result in namespace objects being owned incorrectly.
* Ideally we should just fix these kernel bugs, but it's better to
* be safe than sorry, and fix them separately.
*
* A specific case of this is that the SELinux label of the
* internal kern-mount that mqueue uses will be incorrect if the
* UTS namespace is cloned before the USER namespace is mapped.
* I've also heard of similar problems with the network namespace
* in some scenarios. This also mirrors how LXC deals with this
* problem.
*/
// If CLONE_NEWUSER is among the clone flags, a new user namespace is needed;
// it is unshared here
if (config.cloneflags & CLONE_NEWUSER) {
try_unshare(CLONE_NEWUSER, "user namespace");
config.cloneflags &= ~CLONE_NEWUSER;
/*
* We need to set ourselves as dumpable temporarily so that the
* parent process can write to our procfs files.
*/
if (config.namespaces) {
write_log(DEBUG, "temporarily set process as dumpable");
if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
bail("failed to temporarily set process as dumpable");
}
/*
* We don't have the privileges to do any mapping here (see the
* clone_parent rant). So signal stage-0 to do the mapping for
* us.
*/
write_log(DEBUG, "request stage-0 to map user namespace");
// ask the parent runc init (stage-0) to configure the user map
s = SYNC_USERMAP_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
/* ... wait for mapping ... */
write_log(DEBUG, "waiting stage-0 to complete the mapping of user namespace");
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
if (s != SYNC_USERMAP_ACK)
bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
/* Revert temporary re-dumpable setting. */
if (config.namespaces) {
write_log(DEBUG, "re-set process as non-dumpable");
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
bail("failed to re-set process as non-dumpable");
}
/* Become root in the namespace proper. */
// set our uid to 0, i.e. root inside the container
if (setresuid(0, 0, 0) < 0)
bail("failed to become root in user namespace");
}
/*
* Unshare all of the namespaces. Now, it should be noted that this
* ordering might break in the future (especially with rootless
* containers). But for now, it's not possible to split this into
* CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
*
* Note that we don't merge this with clone() because there were
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
* was broken, so we'll just do it the long way anyway.
*/
try_unshare(config.cloneflags, "remaining namespaces");
if (config.timensoffset) {
write_log(DEBUG, "request stage-0 to write timens offsets");
s = SYNC_TIMEOFFSETS_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: write(SYNC_TIMEOFFSETS_PLS)");
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_TIMEOFFSETS_ACK)");
if (s != SYNC_TIMEOFFSETS_ACK)
bail("failed to sync with parent: SYNC_TIMEOFFSETS_ACK: got %u", s);
}
/*
* TODO: What about non-namespace clone flags that we're dropping here?
*
* We fork again because of PID namespace, setns(2) or unshare(2) don't
* change the PID namespace of the calling process, because doing so
* would change the caller's idea of its own PID (as reported by getpid()),
* which would break many applications and libraries, so we must fork
* to actually enter the new PID namespace.
*/
write_log(DEBUG, "spawn stage-2");
// Create the grandchild; this process has finished namespace setup, which the grandchild inherits
stage2_pid = clone_parent(&env, STAGE_INIT);
if (stage2_pid < 0)
bail("unable to spawn stage-2");
/* Send the child to our parent, which knows what it's doing. */
write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid);
s = SYNC_RECVPID_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
}
// send the grandchild's PID to the parent runc init
if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: write(stage2_pid)");
}
/* ... wait for parent to get the pid ... */
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
}
if (s != SYNC_RECVPID_ACK) {
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
}
write_log(DEBUG, "signal completion to stage-0");
s = SYNC_CHILD_FINISH;
// send SYNC_CHILD_FINISH to the parent runc init
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
}
/* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */
write_log(DEBUG, "<~ nsexec stage-1");
exit(0);
}
break;
/*
* Stage 2: We're the final child process, and the only process that will
* actually return to the Go runtime. Our job is to just do the
* final cleanup steps and then return to the Go runtime to allow
* init_linux.go to run.
*/
case STAGE_INIT:{
// The grandchild runc init is the process that actually starts the container
// entrypoint, after doing the final environment preparation
/*
* We're inside the child now, having jumped from the
* start_child() code after forking in the parent.
*/
enum sync_t s;
/* For debugging. */
current_stage = STAGE_INIT;
/* We're in a child and thus need to tell the parent if we die. */
syncfd = sync_grandchild_pipe[0];
if (close(sync_grandchild_pipe[1]) < 0)
bail("failed to close sync_grandchild_pipe[1] fd");
if (close(sync_child_pipe[0]) < 0)
bail("failed to close sync_child_pipe[0] fd");
/* For debugging. */
prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
write_log(DEBUG, "~> nsexec stage-2");
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
if (s != SYNC_GRANDCHILD)
bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);
if (setsid() < 0)
bail("setsid failed");
if (setuid(0) < 0)
bail("setuid failed");
if (setgid(0) < 0)
bail("setgid failed");
if (!config.is_rootless_euid && config.is_setgroup) {
if (setgroups(0, NULL) < 0)
bail("setgroups failed");
}
write_log(DEBUG, "signal completion to stage-0");
s = SYNC_CHILD_FINISH;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
/* Close sync pipes. */
if (close(sync_grandchild_pipe[0]) < 0)
bail("failed to close sync_grandchild_pipe[0] fd");
/* Free netlink data. */
nl_free(&config);
/* Finish executing, let the Go runtime take over. */
write_log(DEBUG, "<= nsexec container setup");
write_log(DEBUG, "booting up go runtime ...");
// By now the stage-0/stage-1 runc init processes have exited (possibly with
// some delay), but this process must not exit directly: it simply returns,
// and then the Go code starts running
return;
}
break;
default:
bail("unexpected jump value");
}
/* Should never be reached. */
bail("should never be reached");
}
// Normally, this function does not return. If it returns, with or without an
// error, it means the initialization has failed. If the error is returned,
// it means the error can not be communicated back to the parent.
func startInitialization() (retErr error) {
// Get the synchronisation pipe.
// the fd number is advertised by the parent via _LIBCONTAINER_SYNCPIPE
envSyncPipe := os.Getenv("_LIBCONTAINER_SYNCPIPE")
syncPipeFd, err := strconv.Atoi(envSyncPipe)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_SYNCPIPE: %w", err)
}
// rebuild the pipe from the fd to communicate with the parent
syncPipe := newSyncSocket(os.NewFile(uintptr(syncPipeFd), "sync"))
defer syncPipe.Close()
// Return a standard linuxStandardInit; when we are doing an exec, a linuxSetnsInit is returned instead
// If init succeeds, it will not return, hence none of the defers will be called.
return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifoFile, logPipe, dmzExe)
}
containerInit returns a standard linuxStandardInit, or a linuxSetnsInit if we are performing an exec:
func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket, fifoFile, logPipe, dmzExe *os.File) error {
if err := populateProcessEnvironment(config.Env); err != nil {
return err
}
// Clean the RLIMIT_NOFILE cache in go runtime.
// Issue: https://github.com/opencontainers/runc/issues/4195
maybeClearRlimitNofileCache(config.Rlimits)
switch t {
case initSetns:
i := &linuxSetnsInit{
pipe: pipe,
consoleSocket: consoleSocket,
pidfdSocket: pidfdSocket,
config: config,
logPipe: logPipe,
dmzExe: dmzExe,
}
return i.Init()
case initStandard:
i := &linuxStandardInit{
pipe: pipe,
consoleSocket: consoleSocket,
pidfdSocket: pidfdSocket,
parentPid: unix.Getppid(),
config: config,
fifoFile: fifoFile,
logPipe: logPipe,
dmzExe: dmzExe,
}
return i.Init()
}
return fmt.Errorf("unknown init type %q", t)
}
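The dispatch above is driven by the _LIBCONTAINER_INITTYPE environment variable the parent set earlier (initStandard is "standard", initSetns is "setns"). A trivial sketch of reading it:
// sketch: inittype.go — the env-var dispatch in miniature, not runc code
package main

import (
	"fmt"
	"os"
)

func main() {
	switch it := os.Getenv("_LIBCONTAINER_INITTYPE"); it {
	case "standard":
		fmt.Println("first process of a new container (runc create/run)")
	case "setns":
		fmt.Println("extra process joining an existing container (runc exec)")
	default:
		fmt.Printf("not a runc init process (got %q)\n", it)
	}
}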
A created container is started by opening its exec.fifo, which unblocks the init process so it can run the rest of its startup flow.
// libcontainer/container_linux.go
func (c *Container) exec() error {
path := filepath.Join(c.stateDir, execFifoFilename)
pid := c.initProcess.pid()
// Open the /run/runc//exec.fifo pipe; thanks to FIFO semantics, once it is
// read from, the init process blocked writing to it is released, continues
// execution, and closes its end
blockingFifoOpenCh := awaitFifoOpen(path)
for {
select {
case result := <-blockingFifoOpenCh:
// handleFifoResult deletes exec.fifo once its contents have been read
return handleFifoResult(result)
case <-time.After(time.Millisecond * 100):
stat, err := system.Stat(pid)
if err != nil || stat.State == system.Zombie {
// could be because process started, ran, and completed between our 100ms timeout and our system.Stat() check.
// see if the fifo exists and has data (with a non-blocking open, which will succeed if the writing process is complete).
if err := handleFifoResult(fifoOpen(path, false)); err != nil {
return errors.New("container process is already dead")
}
return nil
}
}
}
}
runc exec adds an extra process to the namespaces of the container's init process,
covering scenarios like docker exec starting a shell inside a container.
The cmd finally executes here: running runc init first invokes the nsexec.c code
(inside the runc init process, before the Go logic runs; the intermediate stages exit once the namespaces are set).
The _LIBCONTAINER_INITTYPE=setns environment variable selects the setns mode: with standard,
new namespaces are created for the container via clone, whereas here setns
makes the process enter the namespaces of the target init process. The information for this
step is exchanged with nsexec.c over a socket (plain inter-process communication); the
namespace data needed for startup is carried in setnsProcess.bootstrapData (see the sketch below).
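For intuition, joining an existing namespace ultimately comes down to setns(2) on an fd opened from /proc/<pid>/ns/<type>. A hedged sketch for a network namespace only (user and mount namespaces have extra restrictions, which is exactly why runc does this in C before the Go runtime starts; requires root):
// sketch: joinns.go — minimal setns(2) usage, not runc code; run as root
package main

import (
	"fmt"
	"os"
	"runtime"

	"golang.org/x/sys/unix"
)

func main() {
	if len(os.Args) < 2 {
		fmt.Println("usage: joinns <pid>")
		return
	}
	f, err := os.Open("/proc/" + os.Args[1] + "/ns/net")
	if err != nil {
		panic(err)
	}
	defer f.Close()
	runtime.LockOSThread() // setns only affects the calling thread
	defer runtime.UnlockOSThread()
	if err := unix.Setns(int(f.Fd()), unix.CLONE_NEWNET); err != nil {
		panic(err)
	}
	fmt.Println("this thread now runs in the target network namespace")
}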
func (p *setnsProcess) start() (retErr error) {
defer p.comm.closeParent()
if p.process.IOPriority != nil {
if err := setIOPriority(p.process.IOPriority); err != nil {
return err
}
}
// get the "before" value of oom kill count
oom, _ := p.manager.OOMKillCount()
// run the runc init command
err := p.cmd.Start()
// close the child-side of the pipes (controlled by child)
p.comm.closeChild()
if err != nil {
return fmt.Errorf("error starting setns process: %w", err)
}
// ...
// send the namespace data to nsenter
if p.bootstrapData != nil {
if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil {
return fmt.Errorf("error copying bootstrap data to pipe: %w", err)
}
}
err = <-waitInit
if err != nil {
return err
}
// Wait for the exec side to finish namespace setup; the pid comes back over
// the message pipe, and the process attributes are set to runc init's pid
// for later operations on the process
if err := p.execSetns(); err != nil {
return fmt.Errorf("error executing setns process: %w", err)
}
// add the runc init process to the existing container's cgroups
for _, path := range p.cgroupPaths {
if err := cgroups.WriteCgroupProc(path, p.pid()); err != nil && !p.rootlessCgroups {
// On cgroup v2 + nesting + domain controllers, WriteCgroupProc may fail with EBUSY.
// https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
// Try to join the cgroup of InitProcessPid.
if cgroups.IsCgroup2UnifiedMode() && p.initProcessPid != 0 {
initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
if initCgErr == nil {
if initCgPath, ok := initCg[""]; ok {
initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath)
logrus.Debugf("adding pid %d to cgroups %v failed (%v), attempting to join %q (obtained from %s)",
p.pid(), p.cgroupPaths, err, initCg, initCgDirpath)
// NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container.
err = cgroups.WriteCgroupProc(initCgDirpath, p.pid())
}
}
}
if err != nil {
return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
}
}
}
// ...
if err := p.comm.syncSockParent.Shutdown(unix.SHUT_WR); err != nil && ierr == nil {
return err
}
if !seenProcReady && ierr == nil {
ierr = errors.New("procReady not received")
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
_, _ = p.wait()
return ierr
}
return nil
}
Pausing a container, as shown below, relies mainly on the cgroup freezer mechanism.
// Pause pauses the container, if its state is RUNNING or CREATED, changing
// its state to PAUSED. If the state is already PAUSED, does nothing.
func (c *Container) Pause() error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
switch status {
case Running, Created:
//freeze via the cgroup freezer
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
return err
}
return c.state.transition(&pausedState{
c: c,
})
}
return ErrNotRunning
}
Container.Resume
// Resume resumes the execution of any user processes in the
// container before setting the container state to RUNNING.
// This is only performed if the current state is PAUSED.
// If the Container state is RUNNING, does nothing.
func (c *Container) Resume() error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
if status != Paused {
return ErrNotPaused
}
// resuming execution after a pause likewise uses the cgroup freezer (thaw)
if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err
}
return c.state.transition(&runningState{
c: c,
})
}
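Under the hood, Freeze boils down to writing to the cgroup freezer interface. A minimal sketch, assuming cgroup v2 and a hypothetical pre-created cgroup directory (on v1 the equivalent is writing FROZEN/THAWED to freezer.state):
// sketch: freeze.go — cgroup freezer in miniature, not runc code; needs root
package main

import (
	"os"
	"path/filepath"
)

func setFrozen(cgroupDir string, frozen bool) error {
	v := []byte("0\n")
	if frozen {
		v = []byte("1\n")
	}
	return os.WriteFile(filepath.Join(cgroupDir, "cgroup.freeze"), v, 0o644)
}

func main() {
	dir := "/sys/fs/cgroup/demo" // hypothetical cgroup created beforehand
	if err := setFrozen(dir, true); err != nil { // pause: member tasks stop running
		panic(err)
	}
	if err := setFrozen(dir, false); err != nil { // resume: tasks continue
		panic(err)
	}
}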