容器运行时 源码分析

源码地址

https://github.com/opencontainers/runc

tag v1.2.5

整体流程

一个容器启动主要分为三大部分,如文章题目所示

  • create: 主要是为了解析、组装容器启动的配置和与子进程的消息通道等;

  • init : 主要根据容器配置启动容器整个运行环境,包括熟知的 ns、cgroups、seccomp、apparmor、caps 等;

  • start : 主要是为了通知init 进程启动容器;

容器运行时 源码分析_第1张图片

runc create:

  1. 运行runc create时,后台生成该命令的进程,我们称该进程为parent;

  2. parent进程中运行runc init,我们称runc init进程为child进程;

runc init:

  1. child进程开始准备用户进程的运行环境,此时parent和child进程通过pipe进行通信;

  2. child进程准备好用户进程的运行环境后,通知parent退出,自己则被exec.fifo阻塞;

  3. 由于parent退出(即runc create退出),child成为孤儿进程,进而被 1 号进程收养;

  4. child进程一直被exec.fifo阻塞;

runc start:

  1. 运行runc start时,会打开exec.fifo,使child的阻塞消除,runc start退出;

  2. 由于阻塞消除,child进程继续往下执行;

  3. child进程使用用户定义的命令替换runc init,从而child进程成为容器内的主进程;

  4. 容器启动完成。

runc create

startContainer

  1. 生成容器配置

  2. 启动runc init 进程

// utils_linux.go

// startContainer is the shared entry point for the create/run/restore
// commands. It loads the OCI spec, builds the libcontainer container
// object, then assembles a runner that launches the "runc init" process.
// Returns the process exit status, or -1 on setup failure.
func startContainer(context *cli.Context, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
    if err := revisePidFile(context); err != nil {
        return -1, err
    }
    
    // Build the container configuration:
    // read the bundle's config.json and convert it into a specs.Spec.
    spec, err := setupSpec(context)
    if err != nil {
        return -1, err
    }
    // ...
    
    // Use the spec to build the rest of the container configuration.
    container, err := createContainer(context, id, spec)
    if err != nil {
        return -1, err
    }

    // ...
    
    
    // Create the container runner.
    // The runner is the vehicle that carries the init process: everything
    // up to this point assembled and validated configuration; now that
    // configuration is loaded and the init process is actually launched.
    r := &runner{
 
        // Whether this process should act as a subreaper, i.e. adopt
        // orphaned descendants instead of letting them reparent to PID 1.
        enableSubreaper: !context.Bool("no-subreaper"),
        shouldDestroy:   !context.Bool("keep"),
        container:       container,
        listenFDs:       listenFDs,
        notifySocket:    notifySocket,
        consoleSocket:   context.String("console-socket"),
        pidfdSocket:     context.String("pidfd-socket"),
        detach:          context.Bool("detach"),
        pidFile:         context.String("pid-file"),
        preserveFDs:     context.Int("preserve-fds"),
        action:          action,
        // CRIU (live-migration) options; empty for the create command.
        criuOpts:        criuOpts,
        // Whether this is the container's first (init) process.
        init:            true,
    }
    return r.run(spec.Process)
}

createContainer

// utils_linux.go

// createContainer converts the OCI spec into a libcontainer config and
// creates the container state under the runtime root directory.
func createContainer(context *cli.Context, id string, spec *specs.Spec) (*libcontainer.Container, error) {
    // Decide whether a rootless (non-root) cgroup manager must be used.
    rootlessCg, err := shouldUseRootlessCgroupManager(context)
    if err != nil {
        return nil, err
    }
    // Build the libcontainer config from the OCI spec.
    // [see CreateLibcontainerConfig below]
    config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
        CgroupName:       id,
        // Use the systemd cgroup driver; otherwise the default cgroupfs
        // layout (e.g. user.slice) is used.
        UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
        // Skip pivot_root; typically only needed when the rootfs lives on
        // initramfs/tmpfs where pivot_root cannot be used.
        NoPivotRoot:      context.Bool("no-pivot"),
        NoNewKeyring:     context.Bool("no-new-keyring"),
        Spec:             spec,
        // The effective UID determines resource-access privileges;
        // non-zero means runc was started rootless.
        RootlessEUID:     os.Geteuid() != 0,
        RootlessCgroups:  rootlessCg,
    })
    if err != nil {
        return nil, err
    }

    root := context.GlobalString("root")
    // Create the container object rooted at the state directory.
    return libcontainer.Create(root, id, config)
}

CreateLibcontainerConfig

创建container 配置,主要包含

  • 指定工作路径

  • 指定根目录

  • 增加挂载设备

  • 绑定cgroup

  • 设置oom selinux等

  • 注册启动钩子

// libcontainer/specconv/spec_linux.go

// CreateLibcontainerConfig converts an OCI runtime spec into a
// libcontainer configs.Config: it resolves the working directory and
// rootfs path, translates mounts and devices, builds the cgroup
// configuration, copies process-level settings (oom score, privileges,
// umask, SELinux label, capabilities, scheduler, I/O priority) and
// registers the container lifecycle hooks.
func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
    // Anchor runc's working directory: relative paths in the spec (such
    // as a relative rootfs) are resolved against it.
    cwd, err := getwd()
    if err != nil {
        return nil, err
    }
    spec := opts.Spec
    if spec.Root == nil {
        return nil, errors.New("root must be specified")
    }
    // Resolve the rootfs path; config.json usually points at the
    // "rootfs" directory inside the bundle.
    rootfsPath := spec.Root.Path
    if !filepath.IsAbs(rootfsPath) {
        rootfsPath = filepath.Join(cwd, rootfsPath)
    }
    labels := []string{}
    for k, v := range spec.Annotations {
        labels = append(labels, k+"="+v)
    }
    // Fold the already-collected CreateOpts into the final config.
    config := &configs.Config{
        Rootfs:          rootfsPath,
        NoPivotRoot:     opts.NoPivotRoot,
        Readonlyfs:      spec.Root.Readonly,
        Hostname:        spec.Hostname,
        Domainname:      spec.Domainname,
        Labels:          append(labels, "bundle="+cwd),
        NoNewKeyring:    opts.NoNewKeyring,
        RootlessEUID:    opts.RootlessEUID,
        RootlessCgroups: opts.RootlessCgroups,
    }

    // Translate the "mounts" entries of config.json,
    // e.g. /proc, /dev, /dev/pts, /dev/shm, /dev/mqueue, /sys, /sys/fs/cgroup.
    for _, m := range spec.Mounts {
        cm, err := createLibcontainerMount(cwd, m)
        if err != nil {
            return nil, fmt.Errorf("invalid mount %+v: %w", m, err)
        }
        config.Mounts = append(config.Mounts, cm)
    }

    defaultDevs, err := createDevices(spec, config)
    if err != nil {
        return nil, err
    }

    // Build the cgroup resource-control configuration from the default
    // devices. For reference, the controllable cgroup v1 subsystems are:
    /*
    var legacySubsystems = []subsystem{
        &fs.CpusetGroup{},
        &fs.DevicesGroup{},
        &fs.MemoryGroup{},
        &fs.CpuGroup{},
        &fs.CpuacctGroup{},
        &fs.PidsGroup{},
        &fs.BlkioGroup{},
        &fs.HugetlbGroup{},
        &fs.PerfEventGroup{},
        &fs.FreezerGroup{},
        &fs.NetPrioGroup{},
        &fs.NetClsGroup{},
        &fs.NameGroup{GroupName: "name=systemd"},
        &fs.RdmaGroup{},
        &fs.NameGroup{GroupName: "misc"},
    }
    */

    c, err := CreateCgroupConfig(opts, defaultDevs)
    if err != nil {
        return nil, err
    }

    config.Cgroups = c
    // ...

    // ...
   
    if spec.Process != nil {
        // OOM score adjustment for the container's init process.
        config.OomScoreAdj = spec.Process.OOMScoreAdj
        // no_new_privs flag.
        config.NoNewPrivileges = spec.Process.NoNewPrivileges
        // File-mode creation mask.
        config.Umask = spec.Process.User.Umask
        
        // SELinux process label.
        config.ProcessLabel = spec.Process.SelinuxLabel
        // Grant the container a subset of root capabilities.
        if spec.Process.Capabilities != nil {
            config.Capabilities = &configs.Capabilities{
                Bounding:    spec.Process.Capabilities.Bounding,
                Effective:   spec.Process.Capabilities.Effective,
                Permitted:   spec.Process.Capabilities.Permitted,
                Inheritable: spec.Process.Capabilities.Inheritable,
                Ambient:     spec.Process.Capabilities.Ambient,
            }
        }
        if spec.Process.Scheduler != nil {
            s := *spec.Process.Scheduler
            config.Scheduler = &s
        }

        if spec.Process.IOPriority != nil {
            ioPriority := *spec.Process.IOPriority
            config.IOPriority = &ioPriority
        }
    }
    // Container lifecycle hooks.
    createHooks(spec, config)
    config.Version = specs.Version
    return config, nil
}

runner.run

  1. 设置文件描述符

  2. 设置容器的主进程为收容者

  3. 设置IO, 前台启动还是后台启动IO输出不同

  4. 启动容器

// utils_linux.go

// run prepares the container process's extra file descriptors and I/O,
// installs the signal handler, then dispatches on the runner action
// (create / restore / run).
// NOTE: abridged excerpt — several variable declarations (process, err,
// rootuid, rootgid, detach) are elided from the original source.
func (r *runner) run(config *specs.Process) (int, error) {
    
    
    // LISTEN_FDS support: extra fds are numbered from 3 upward and handed
    // to the child via ExtraFiles; commonly used to pass additional
    // input/output descriptors, e.g. exporting container logs to a file.
    if len(r.listenFDs) > 0 {
        process.Env = append(process.Env, "LISTEN_FDS="+strconv.Itoa(len(r.listenFDs)), "LISTEN_PID=1")
        process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
    }
    baseFd := 3 + len(process.ExtraFiles)
    procSelfFd, closer := utils.ProcThreadSelf("fd/")
    defer closer()
    // Verify every --preserve-fds descriptor actually exists before
    // forwarding it to the child.
    for i := baseFd; i < baseFd+r.preserveFDs; i++ {
        _, err = os.Stat(filepath.Join(procSelfFd, strconv.Itoa(i)))
        if err != nil {
            return -1, fmt.Errorf("unable to stat preserved-fd %d (of %d): %w", i-baseFd, r.preserveFDs, err)
        }
        process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i)))
    }
    

    
    // With enableSubreaper set, this process is marked as a child
    // subreaper: orphaned descendants of the container are re-parented
    // to it rather than to the nearest ancestor/PID 1.
    handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
    
    // Set up the process's I/O: console (foreground) versus detached
    // startup use different input/output channels.
    tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket)
    if err != nil {
        return -1, err
    }
    defer tty.Close()


    switch r.action {
    case CT_ACT_CREATE:
        // The three actions end up doing very similar work; this
        // walkthrough focuses on the create path.
        err = r.container.Start(process)
    case CT_ACT_RESTORE:
        err = r.container.Restore(process, r.criuOpts)
    case CT_ACT_RUN:
        err = r.container.Run(process)
    default:
        panic("Unknown action")
    }
    // ...
}

Container.start

process.Init 作为一个布尔值(true/false)用于标识 当前进程是否是容器的 init 进程。它的作用如下:

1. 当 process.Init == true 时,表示 该进程是容器的第一个进程(即 init 进程)。这个进程:

  • 负责 创建新的 Namespace(如 PID, NET, MNT 等)。

  • 负责 初始化容器(如 Rootfs 挂载、Cgroups 配置)。

  • 作为 容器的主进程,它的 PID 通常是 1(容器内部)。

  • 如果该进程退出,整个容器会停止。

2. 当 process.Init == false 时,表示 该进程是附加到现有容器中的新进程,通常用于 runc exec

  • 该进程 不会创建新的 Namespace,而是进入已有容器的 Namespace。

  • 该进程 不会影响容器的生命周期,即使它退出,容器仍然运行。

  • newSetnsProcess 处理,用于 runc exec 运行附加进程。

// libcontainer/container_linux.go
// start launches either the container's first (init) process
// (process.Init == true, e.g. docker run) or an auxiliary process that
// joins an existing container (e.g. docker exec).
// NOTE: abridged excerpt — the trailing return is elided.
func (c *Container) start(process *Process) (retErr error) {
    if c.config.Cgroups.Resources.SkipDevices {
        return errors.New("can't start container with SkipDevices set")
    }
    // Creating the container's main (init) process.
    if process.Init {
        if c.initProcessStartTime != 0 {
            return errors.New("container already has init process")
        }
        // Create exec.fifo: it later blocks the init process until an
        // external "runc start" releases it.
        if err := c.createExecFifo(); err != nil {
            return err
        }
        defer func() {
            if retErr != nil {
                // On failure, remove exec.fifo again — a strong hint that
                // the fifo's lifetime is tied to the start flow.
                c.deleteExecFifo()
            }
        }()
    }
    // Build the parent-side process object: a brand-new container process
    // (docker run) or one attached to an existing container (docker exec).
    parent, err := c.newParentProcess(process)
    //...
    
    // Launch the process.
    if err := parent.start(); err != nil { 
          return fmt.Errorf("unable to start container process: %w", err)
    } 

}

c.createExecFifo() 作用

注意:从上面 Container.start 的代码可见,createExecFifo() 实际是在 process.Init == true 时(即 runc create 流程)调用的:

  • createExecFifo() 创建一个 FIFO 文件,路径通常位于 runc 的状态目录下:

/run/runc/<container-id>/exec.fifo

exec.fifo 的用途

场景 1:runc create 与 runc start 的配合

runc init 进程准备好运行环境后,向 exec.fifo 写入数据,因暂时无人读取而阻塞

runc start 打开并读取 exec.fifo 后,写入端的阻塞解除

init 进程解除阻塞后,exec 用户指定的命令,容器正式开始运行。

场景 2:防止 runc exec 竞态

如果 exec 进程 在 Namespace 还未完全切换时就执行,可能会导致:

进程启动时仍在宿主机 Namespace,而不是容器的环境。

可能会导致 exec 进程访问错误的 Cgroups 或文件系统。

交互流程

假设通过 runc create + runc start 启动一个以 /bin/bash 为主进程的容器:

1. runc create 创建 exec.fifo;
2. runc init 准备好运行环境后,向 exec.fifo 写入数据并阻塞等待;
3. runc start 打开并读取 exec.fifo;
4. runc init 的写入完成,阻塞解除;
5. runc init exec 用户指定的命令 /bin/bash,成为容器内的主进程。

Container.newParentProcess

中主要有以下两个逻辑,根据 p.Init 是否设置来分别调用。

  1. newInitProcess

      newInitProcess 用于 创建新的容器进程,即 容器的第一个进程(init process)。它负责:

    1. 创建新的 Linux Namespace(如 PID, NET, IPC, MNT, USER 等)。

    2. 设置 cgroups 限制。

    3. 执行容器的 init 进程(通常是 sh 或镜像的 entrypoint)。

    4. 挂载文件系统 并配置 Rootfs

适用场景

启动新的容器进程

创建全新的 Namespace,并作为容器的第一个进程运行。

  1. newSetnsProcess

      newSetnsProcess 用于 进入已存在的容器 Namespace 并执行新的进程,主要用于 exec 操作。它的作用包括:

    1. 加入现有容器的 Namespace(通过 setns())。

    2. 在容器中执行额外的进程(如 docker exec)。

    3. 不会创建新的 Namespace,而是复用现有的。

适用场景

  • 执行 runc exec 命令,在运行的容器中启动新的进程。

  • 进入现有的容器 Namespace,复用其 PID, NET, MNT, USER 等。

总结:

newInitProcess → runc run / runc create(初始化 创建新容器 的 process)

newSetnsProcess → runc exec(初始化 在已有容器中执行进程 的 process)

// libcontainer/container_linux.go

// newProcessComm allocates the three parent<->child channels used while
// spawning "runc init": the init socket pair (bootstrap data and init
// config), the sync socket pair (state-machine handshakes), and a plain
// pipe for forwarding the child's log output.
func newProcessComm() (*processComm, error) {
    comm := &processComm{}
    var err error

    // Socket pair over which the parent sends namespace bootstrap data
    // and, later, the init configuration.
    comm.initSockParent, comm.initSockChild, err = utils.NewSockPair("init")
    if err != nil {
        return nil, fmt.Errorf("unable to create init pipe: %w", err)
    }

    // Socket pair driving the procReady/procRun/procHooks handshakes.
    comm.syncSockParent, comm.syncSockChild, err = newSyncSockpair("sync")
    if err != nil {
        return nil, fmt.Errorf("unable to create sync pipe: %w", err)
    }

    // Ordinary pipe carrying the child's log records back to the parent.
    comm.logPipeParent, comm.logPipeChild, err = os.Pipe()
    if err != nil {
        return nil, fmt.Errorf("unable to create log pipe: %w", err)
    }
    return comm, nil
}

// newParentProcess assembles the exec.Cmd for "runc init", wires the
// init/sync channels into its ExtraFiles (their fd numbers are advertised
// to the child via _LIBCONTAINER_* environment variables), and returns
// either an initProcess (p.Init set) or a setnsProcess.
// NOTE: abridged excerpt.
func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
    comm, err := newProcessComm()
    if err != nil {
        return nil, err
    }
    // ...
 
    
    // The assembled command is literally "runc init".
    cmd := exec.Command(exePath, "init")
    cmd.Args[0] = os.Args[0]
    cmd.Stdin = p.Stdin
    cmd.Stdout = p.Stdout
    cmd.Stderr = p.Stderr
    cmd.Dir = c.config.Rootfs
    if cmd.SysProcAttr == nil {
        cmd.SysProcAttr = &unix.SysProcAttr{}
    }
    cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS"))
    cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
    if p.ConsoleSocket != nil {
        cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
        cmd.Env = append(cmd.Env,
            "_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
        )
    }
    
    // The child ends of the init and sync sockets are handed to the
    // "runc init" child and serve as its communication channels back to
    // this parent process.
    cmd.ExtraFiles = append(cmd.ExtraFiles, comm.initSockChild)
    cmd.Env = append(cmd.Env,
        "_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
    )
    cmd.ExtraFiles = append(cmd.ExtraFiles, comm.syncSockChild.File())
    cmd.Env = append(cmd.Env,
        "_LIBCONTAINER_SYNCPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
    )

    // ...
    
    //  Init specifies whether the process is the first process in the container.
    if p.Init {
        // Taken on docker run.
        //...
        return c.newInitProcess(p, cmd, comm)
    }
    // Taken on docker exec: the new process joins the namespaces of an
    // already-running container (setns).
    return c.newSetnsProcess(p, cmd, comm)
}

container.newInitProcess

可对比看 newInitProcess 与 newSetnsProcess 的区别

// newInitProcess wires up the parent-side representation of the
// container's first (init) process: it records which namespaces must be
// joined via an existing path, serializes that as bootstrap data for the
// nsexec stage, and registers the resulting initProcess on the container.
func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*initProcess, error) {
    cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))

    // Collect every configured namespace (ipc, pid, user, ...) that
    // should be entered through an existing path rather than created.
    pathByType := make(map[configs.NamespaceType]string)
    for _, ns := range c.config.Namespaces {
        if ns.Path == "" {
            continue
        }
        pathByType[ns.Type] = ns.Path
    }

    // Pack clone flags plus the namespace paths into the bootstrap
    // payload consumed by the child.
    data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), pathByType)
    if err != nil {
        return nil, err
    }

    proc := &initProcess{
        cmd:  cmd,
        comm: comm,
        // cgroup resource-limit manager handed down to the child.
        manager:         c.cgroupManager,
        intelRdtManager: c.intelRdtManager,
        config:          c.newInitConfig(p),
        container:       c,
        process:         p,
        // compact namespace description sent over the init pipe.
        bootstrapData: data,
    }
    // Remember the init process on the container structure itself.
    c.initProcess = proc
    return proc, nil
}

container.newSetnsProcess

该函数主要创建容器的附加进程, 如 docker exec 进入容器所执行的附加进程。

如下可见创建附加进程时需要获取init进程的空间句柄等信息。

// newSetnsProcess builds the parent-side object for an auxiliary process
// joining an existing container (e.g. what docker exec runs). It
// snapshots the running init process's namespace and cgroup paths so the
// new process can enter them. NOTE: abridged excerpt — the trailing
// return is elided.
func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*setnsProcess, error) {
        cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
        // Snapshot the namespaces etc. of the container's existing init process.
        state := c.currentState()
        data, err := c.bootstrapData(0, state.NamespacePaths)
        if err != nil {      
                return nil, err
        }                    
        proc := &setnsProcess{
                cmd:             cmd,
                cgroupPaths:     state.CgroupPaths,
                rootlessCgroups: c.config.RootlessCgroups,
                intelRdtPath:    state.IntelRdtPath,
                comm:            comm,
                manager:         c.cgroupManager,
                config:          c.newInitConfig(p),
                process:         p,
                bootstrapData:   data,
                initProcessPid:  state.InitProcessPid,
        }  
        //...
}        

func (c *Container) currentState() *State {                                                   // ...
        if c.initProcess != nil {
                pid = c.initProcess.pid()
                startTime, _ = c.initProcess.startTime()
                externalDescriptors = c.initProcess.externalDescriptors()
        }     
        // ...
        // 从已有initprocess进程的pid 中找寻命名空间的文件句柄路径
        if pid > 0 {   
                for _, ns := range c.config.Namespaces {
                        state.NamespacePaths[ns.Type] = ns.GetPath(pid)
                }    
                for _, nsType := range configs.NamespaceTypes() {
                        if !configs.IsNamespaceSupported(nsType) {
                                continue
                        }
                        if _, ok := state.NamespacePaths[nsType]; !ok {
                                ns := configs.Namespace{Type: nsType}
                                state.NamespacePaths[ns.Type] = ns.GetPath(pid)
                        }
                }    
        }            
        return state 

        
     

initProcess.start

在 Kubernetes 的容器运行时(如 runc)中,以下阶段表示了容器的不同生命周期和操作:

容器中的init进程

  • procMount:该阶段负责挂载容器的 /proc 文件系统。在此阶段,容器的 /proc 目录被挂载到宿主机的 /proc/<容器ID>/ 目录下,这样容器就可以访问宿主机的进程信息。

  • procSeccomp:这个阶段涉及到 Linux 的安全机制之一,即 Seccomp(Secure Computing Mode)。Seccomp 允许管理员限制容器中的进程只能使用一小组特定的系统调用,从而增加容器的安全性。在 procSeccomp 阶段,容器的 Seccomp 配置被加载和应用。

  • procReady:在容器的生命周期中,当容器的主进程(entrypoint)准备好接受请求并正常运行时,会触发 procReady 阶段。这可以是容器启动的最后一个阶段,表示容器已准备就绪。

  • procHooks:procHooks 阶段是一个扩展点,允许用户在容器的不同生命周期阶段执行自定义脚本或操作。这些脚本可以在容器的不同事件(如创建、启动、停止等)发生时触发,以便进行一些额外的处理或操作。

// process_linux.go
// start launches the "runc init" child, applies cgroup / Intel RDT
// limits, sends the namespace bootstrap data and the init config down
// the init pipe, then drives the sync-protocol state machine with the
// child until the container reaches the "created" state.
// NOTE: abridged excerpt.
func (p *initProcess) start() (retErr error) {
    defer p.comm.closeParent()
    
    // Start the child process, i.e. invoke "runc init".
    err := p.cmd.Start()
    p.process.ops = p
    // close the child-side of the pipes (controlled by child)
    p.comm.closeChild()
    if err != nil {
        p.process.ops = nil
        return fmt.Errorf("unable to start init: %w", err)
    }

    waitInit := initWaiter(p.comm.initSockParent)
    // ...
    // Confine the child in its cgroups before syncing, so no descendant
    // can escape the limits.
    // Do this before syncing with child so that no children can escape the
    // cgroup. We don't need to worry about not doing this and not being root
    // because we'd be using the rootless cgroup manager in that case.
    if err := p.manager.Apply(p.pid()); err != nil {
        return fmt.Errorf("unable to apply cgroup configuration: %w", err)
    }
    if p.intelRdtManager != nil {
        if err := p.intelRdtManager.Apply(p.pid()); err != nil {
            return fmt.Errorf("unable to apply Intel RDT configuration: %w", err)
        }
    }
    // Write bootstrapData into the init pipe; on receipt, the runc init
    // process sets up its namespaces and related state.
    if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil {
        return fmt.Errorf("can't copy bootstrap data to pipe: %w", err)
    }
    err = <-waitInit
    if err != nil {
        return err
    }
    
    // Read the final child's pid back over the init pipe.
    childPid, err := p.getChildPid()
    if err != nil {
        return fmt.Errorf("can't get final child's PID from pipe: %w", err)
    }
    
    // Record the child's standard file-descriptor paths.
    // Save the standard descriptor names before the container process
    // can potentially move them (e.g., via dup2()).  If we don't do this now,
    // we won't know at checkpoint time which file descriptor to look up.
    fds, err := getPipeFds(childPid)
    if err != nil {
        return fmt.Errorf("error getting pipe fds for pid %d: %w", childPid, err)
    }
    p.setExternalDescriptors(fds)

    // Wait for the nsexec stage to finish. Because Go cannot safely join
    // a user namespace from a multi-threaded runtime, that stage is
    // implemented in C: it reports pids over the init pipe, consumes the
    // bootstrapData above to set up the namespaces, and only then hands
    // control to the Go part of runc init.

    // Wait for our first child to exit
    if err := p.waitForChildExit(childPid); err != nil {
        return fmt.Errorf("error waiting for our first child to exit: %w", err)
    }

    
    var mountRequest mountSourceRequestFn
    if !p.container.config.RootlessEUID {
        request, cancel, err := p.goCreateMountSources(context.Background())
        if err != nil {
            return fmt.Errorf("error spawning mount remapping thread: %w", err)
        }
        defer cancel()
        mountRequest = request
    }

    if err := p.createNetworkInterfaces(); err != nil {
        return fmt.Errorf("error creating network interfaces: %w", err)
    }
    if err := p.updateSpecState(); err != nil {
        return fmt.Errorf("error updating spec state: %w", err)
    }
    // Send the init configuration to the init process.
    if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil {
        return fmt.Errorf("error sending config to init process: %w", err)
    }

    // Drive the synchronization protocol with the init process.
    // parseSync loops until the sync socket is closed.
    var seenProcReady bool
    ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error {
        switch sync.Type {
        // procMountPlease: the child asks this parent to open a mount
        // source on its behalf (see mountRequest above) — presumably
        // because the child cannot open it from inside its namespaces.
        case procMountPlease:
           //...
        // procSeccomp: the child's seccomp profile is being loaded,
        // restricting the container to a limited set of system calls.
        case procSeccomp:
          
        // procReady: the child has finished preparing the runtime
        // environment; the container is ready to be started.
        case procReady:
            seenProcReady = true
            
            // rlimits cap the resources the process may consume.
            // Set rlimits, this has to be done here because we lose permissions
            // to raise the limits once we enter a user-namespace
            if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
                return fmt.Errorf("error setting rlimits for ready process: %w", err)
            }
            
            // Record the container start time.
            // generate a timestamp indicating when the container was started
            p.container.created = time.Now().UTC()
            p.container.state = &createdState{
                c: p.container,
            }

           
            // Persist the updated container state.
            state, uerr := p.container.updateState(p)
            if uerr != nil {
                return fmt.Errorf("unable to store init state: %w", uerr)
            }
            p.container.initProcessStartTime = state.InitProcessStartTime

            // Sync with child.
            if err := writeSync(p.comm.syncSockParent, procRun); err != nil {
                return err
            }
        // procHooks: extension point — user-defined hook scripts run at
        // various lifecycle events (create, start, stop, ...).
        case procHooks:
            // Sync with child.
            if err := writeSync(p.comm.syncSockParent, procHooksDone); err != nil {
                return err
            }
        default:
            return errors.New("invalid JSON payload from child")
        }
        return nil
    })
    // Close the write side of the sync channel to the init process.
    if err := p.comm.syncSockParent.Shutdown(unix.SHUT_WR); err != nil && ierr == nil {
        return err
    }
    
    if !seenProcReady && ierr == nil {
        ierr = errors.New("procReady not received")
    }
    if ierr != nil {
        return fmt.Errorf("error during container init: %w", ierr)
    }
    return nil
}

runc init

runc init 是由 runc create 过程启动的子进程中执行的命令,该进程为容器中的init进程或附加进程

runc.init

# runc/init.go

package main

import (
    "os"

    "github.com/opencontainers/runc/libcontainer"
    // 这个包非常重要,是 runc init 启动的基石
    _ "github.com/opencontainers/runc/libcontainer/nsenter"
)

// init runs before main(); when the binary was invoked as "runc init"
// it hands control to libcontainer.Init instead of the normal CLI path.
// By this point the cgo constructor in libcontainer/nsenter (nsexec)
// has already executed.
func init() {
    if len(os.Args) > 1 && os.Args[1] == "init" {
        // This is the golang entry point for runc init, executed
        // before main() but after libcontainer/nsenter's nsexec().
        libcontainer.Init()
    }
}

nsenter

nsenter 实际是对c 语言接口 nsexec 的封装

这个代码利用了 GCC 的 constructor 特性,init 会在 runtime.main()(不是 main.main())函数之前执行,这样保证了启动时是单线程的,这一点很重要。因为 Linux 不允许在多线程的进程中通过 setns 设置 user namespace。

// libcontainer/nsenter/nsenter.go
//go:build linux && !gccgo
// +build linux,!gccgo

package nsenter

/*
#cgo CFLAGS: -Wall
extern void nsexec();
void __attribute__((constructor)) init(void) {
    nsexec();
}
*/
import "C"

nsexec.nsexec

当使用 nsexec 工具时,创建孙进程的设计也与下面几个方面相关: 命令执行的隔离性:通过创建孙进程,nsexec 工具确保了用户指定的命令在新的命名空间中执行时, 不会继承父进程的权限和资源。这样可以保持命令在命名空间中的隔离性,避免可能的权限泄漏或资源冲突。 进程层次结构清晰:创建孙进程使得进程层次结构更加清晰和直观。 父进程作为 nsexec 工具的管理进程,子进程作为新命名空间中的中间进程, 而孙进程则是实际执行用户指定命令的进程。这种层次结构的设计使得进程间的关系更加明确,方便管理和跟踪。 命令执行的控制:通过子进程等待孙进程的完成,nsexec 工具可以控制命令的执行过程。 子进程等待孙进程的退出状态,以便获取命令的执行结果。这样可以在命令完成后进行后续处理, 如输出结果的收集、错误处理等。 错误处理和资源回收:创建孙进程也有助于错误处理和资源回收。 如果命令执行出现错误或异常情况,孙进程可以及时退出并返回相应的错误状态。 父进程和子进程可以在命令执行结束后进行必要的资源清理和回收,以确保系统的稳定性和资源管理。 需要注意的是,nsexec 工具的实现可能会根据具体的使用场景和需求而有所差异。 上述描述提供了一般情况下的常见设计和原理,但具体实现可能会有细微的差别。 如果你对 nsexec 工具的详细实现感兴趣,建议查阅相关的文档或源代码,以获取更准确和具体的信息

进程同步标志扩展,以下是与进程同步相关的常量和它们的含义:

SYNC_USERMAP_PLS:这是一个与用户映射(user namespace)相关的同步操作。 在容器中,用户命名空间被用于隔离用户和组标识符,以提供额外的安全性。

SYNC_USERMAP_PLS 表示进行用户映射同步的请求。

SYNC_RECVPID_PLS:这个常量表示接收进程 ID(PID)的同步请求。 在容器中,接收进程 ID 是指在容器内部创建的进程接收到来自宿主机的进程 ID。 通过进行同步,容器内部的进程可以感知到宿主机的进程 ID。

SYNC_TIMEOFFSETS_PLS:这是一个与时间偏移同步相关的常量。 在容器中,由于不同的时间管理机制,容器内部的时间可能与宿主机的时间存在差异。

SYNC_TIMEOFFSETS_PLS 表示进行时间偏移同步的请求,以确保容器内部的时间与宿主机保持一致。

SYNC_CHILD_FINISH:这个常量表示子进程完成同步的请求。 在容器中,当一个子进程完成其任务并退出时,它需要通知容器管理器(如 runc)以便进行相应的处理。 SYNC_CHILD_FINISH 表示子进程完成同步的请求,以便容器管理器能够及时处理相关事务。

//libcontainer/nsenter/nsexec.c

/*
 * nsexec is the C-side bootstrap for "runc init". It runs from a cgo
 * constructor before the Go runtime starts, reads its configuration from
 * the init pipe, and performs a three-stage (parent / child / grandchild)
 * clone dance to set up the requested namespaces. Only stage-2 (the final
 * init process) returns from this function, handing control to Go.
 */
void nsexec(void)
{
    int pipenum;
    jmp_buf env;
    int sync_child_pipe[2], sync_grandchild_pipe[2];
    struct nlconfig_t config = { 0 };

    /*
     * Setup a pipe to send logs to the parent. This should happen
     * first, because bail will use that pipe.
     */
    setup_logpipe();

    /*
     * Get the init pipe fd from the environment. The init pipe is used to
     * read the bootstrap data and tell the parent what the new pids are
     * after the setup is done.
     */

    // Read the init (child) pipe fd number from the _LIBCONTAINER_INITPIPE
    // environment variable.
    pipenum = getenv_int("_LIBCONTAINER_INITPIPE");
    if (pipenum < 0) {
        // A normally-started runc does not have this env var set, so we
        // return immediately and the regular Go program starts running.
        /* We are not a runc init. Just return to go runtime. */
        return;
    }

    /*
     * Inform the parent we're past initial setup.
     * For the other side of this, see initWaiter.
     */
    if (write(pipenum, "", 1) != 1)
        bail("could not inform the parent we are past initial setup");

    write_log(DEBUG, "=> nsexec container setup");


    // Read the namespace configuration (netlink-formatted) from the init pipe.
    /* Parse all of the netlink configuration. */
    nl_parse(pipenum, &config);

    /* Set oom_score_adj. This has to be done before !dumpable because
     * /proc/self/oom_score_adj is not writeable unless you're an privileged
     * user (if !dumpable is set). All children inherit their parent's
     * oom_score_adj value on fork(2) so this will always be propagated
     * properly.
     */
    // Set the oom score here: it can only be written while still privileged
    // and dumpable, so it must happen before anything else.
    update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);

    /*
     * Make the process non-dumpable, to avoid various race conditions that
     * could cause processes in namespaces we're joining to access host
     * resources (or potentially execute code).
     *
     * However, if the number of namespaces we are joining is 0, we are not
     * going to be switching to a different security context. Thus setting
     * ourselves to be non-dumpable only breaks things (like rootless
     * containers), which is the recommendation from the kernel folks.
     */
    if (config.namespaces) {
        write_log(DEBUG, "set process as non-dumpable");
        if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
            bail("failed to set process as non-dumpable");
    }
    // Create the socketpair used to talk to the stage-1 child.
    /* Pipe so we can tell the child when we've finished setting up. */
    if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
        bail("failed to setup sync pipe between parent and child");

    /*
     * We need a new socketpair to sync with grandchild so we don't have
     * race condition with child.
     */
    // Create the socketpair used to talk to the stage-2 grandchild.
    if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)
        bail("failed to setup sync pipe between parent and grandchild");

    /* TODO: Currently we aren't dealing with child deaths properly. */

    /*
     * Okay, so this is quite annoying.
     *
     * In order for this unsharing code to be more extensible we need to split
     * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
     * would be if we did clone(CLONE_NEWUSER) and the other namespaces
     * separately, but because of SELinux issues we cannot really do that. But
     * we cannot just dump the namespace flags into clone(...) because several
     * usecases (such as rootless containers) require more granularity around
     * the namespace setup. In addition, some older kernels had issues where
     * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
     * handle this while also dealing with SELinux so we choose SELinux support
     * over broken kernel support).
     *
     * However, if we unshare(2) the user namespace *before* we clone(2), then
     * all hell breaks loose.
     *
     * The parent no longer has permissions to do many things (unshare(2) drops
     * all capabilities in your old namespace), and the container cannot be set
     * up to have more than one {uid,gid} mapping. This is obviously less than
     * ideal. In order to fix this, we have to first clone(2) and then unshare.
     *
     * Unfortunately, it's not as simple as that. We have to fork to enter the
     * PID namespace (the PID namespace only applies to children). Since we'll
     * have to double-fork, this clone_parent() call won't be able to get the
     * PID of the _actual_ init process (without doing more synchronisation than
     * I can deal with at the moment). So we'll just get the parent to send it
     * for us, the only job of this process is to update
     * /proc/pid/{setgroups,uid_map,gid_map}.
     *
     * And as a result of the above, we also need to setns(2) in the first child
     * because if we join a PID namespace in the topmost parent then our child
     * will be in that namespace (and it will not be able to give us a PID value
     * that makes sense without resorting to sending things with cmsg).
     *
     * This also deals with an older issue caused by dumping cloneflags into
     * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
     * we have to unshare(2) before clone(2) in order to do this. This was fixed
     * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
     * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
     * aware, the last mainline kernel which had this bug was Linux 3.12.
     * However, we cannot comment on which kernels the broken patch was
     * backported to.
     *
     * -- Aleksa "what has my life come to?" Sarai
     */
    // setjmp saves the current execution context so that cloned children can
    // longjmp back here and resume at this switch with a different stage value.
    // On the first (direct) call, setjmp returns 0, i.e. STAGE_PARENT.
    switch (setjmp(env)) {
        /*
         * Stage 0: We're in the parent. Our job is just to create a new child
         *          (stage 1: STAGE_CHILD) process and write its uid_map and
         *          gid_map. That process will go on to create a new process, then
         *          it will send us its PID which we will send to the bootstrap
         *          process.
         */
    case STAGE_PARENT:{
            int len;
            pid_t stage1_pid = -1, stage2_pid = -1;
            bool stage1_complete, stage2_complete;

            /* For debugging. */
            current_stage = STAGE_PARENT;

            // Set the process name, for debugging.
            prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
            write_log(DEBUG, "~> nsexec stage-0");

            /* Start the process of getting a container. */
            write_log(DEBUG, "spawn stage-1");
             // clone_parent creates a child process identical to the current
             // one. Inside clone_parent, the child longjmp()s to the location
             // saved in env, and setjmp returns STAGE_CHILD, so the child
             // resumes at the STAGE_CHILD branch of this switch. The current
             // runc init and the child runc init then synchronise over the
             // sync_child_pipe created above.
            stage1_pid = clone_parent(&env, STAGE_CHILD);
            if (stage1_pid < 0)
                bail("unable to spawn stage-1");

            syncfd = sync_child_pipe[1];
            if (close(sync_child_pipe[0]) < 0)
                bail("failed to close sync_child_pipe[0] fd");

            /*
             * State machine for synchronisation with the children. We only
             * return once both the child and grandchild are ready.
             */
            write_log(DEBUG, "-> stage-1 synchronisation loop");
            stage1_complete = false;
            while (!stage1_complete) {
                enum sync_t s;
                // Loop reading sync messages from stage-1 over sync_child_pipe.
                if (read(syncfd, &s, sizeof(s)) != sizeof(s))
                    bail("failed to sync with stage-1: next state");

                switch (s) {
                case SYNC_USERMAP_PLS:
                    write_log(DEBUG, "stage-1 requested userns mappings");

                    /*
                     * Enable setgroups(2) if we've been asked to. But we also
                     * have to explicitly disable setgroups(2) if we're
                     * creating a rootless container for single-entry mapping.
                     * i.e. config.is_setgroup == false.
                     * (this is required since Linux 3.19).
                     *
                     * For rootless multi-entry mapping, config.is_setgroup shall be true and
                     * newuidmap/newgidmap shall be used.
                     */
                    if (config.is_rootless_euid && !config.is_setgroup)
                        update_setgroups(stage1_pid, SETGROUPS_DENY);
                    // Write the user mappings here (from stage-0): once the
                    // child has unshared its user namespace it no longer has
                    // the privilege to write its own maps. This is mainly for
                    // privilege control; see rootless-container / user-mapping
                    // documentation for background.
                    /* Set up mappings. */
                    update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len);
                    update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len);

                    s = SYNC_USERMAP_ACK;
                    if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
                        sane_kill(stage1_pid, SIGKILL);
                        sane_kill(stage2_pid, SIGKILL);
                        bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)");
                    }
                    break;
                case SYNC_RECVPID_PLS:
                    write_log(DEBUG, "stage-1 requested pid to be forwarded");

                    // Receive the pid of the grandchild (still runc init).

                    /* Get the stage-2 pid. */
                    if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
                        sane_kill(stage1_pid, SIGKILL);
                        bail("failed to sync with stage-1: read(stage2_pid)");
                    }

                    // Send SYNC_RECVPID_ACK back to stage-1 to signal we are done.
                    /* Send ACK. */
                    s = SYNC_RECVPID_ACK;
                    if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
                        sane_kill(stage1_pid, SIGKILL);
                        sane_kill(stage2_pid, SIGKILL);
                        bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)");
                    }

                    /*
                     * Send both the stage-1 and stage-2 pids back to runc.
                     * runc needs the stage-2 to continue process management,
                     * but because stage-1 was spawned with CLONE_PARENT we
                     * cannot reap it within stage-0 and thus we need to ask
                     * runc to reap the zombie for us.
                     */
                    write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
                          stage1_pid, stage2_pid);
                    // Write the child and grandchild PIDs back over the init
                    // pipe so the runc process outside the container can take
                    // over managing them. Because clone_parent passed
                    // CLONE_PARENT, both stage-1 and stage-2 are children of
                    // that outer runc, so this process cannot reap them itself.
                    len =
                        dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid,
                            stage2_pid);
                    if (len < 0) {
                        sane_kill(stage1_pid, SIGKILL);
                        sane_kill(stage2_pid, SIGKILL);
                        bail("failed to sync with runc: write(pid-JSON)");
                    }
                    break;
                case SYNC_TIMEOFFSETS_PLS:

                // Stage-1 requests its time-namespace offsets to be written.
                // Writing /proc/<stage1_pid>/timens_offsets is done here by
                // stage-0 on the child's behalf (see update_timens_offsets).
                    write_log(DEBUG, "stage-1 requested timens offsets to be configured");
                    update_timens_offsets(stage1_pid, config.timensoffset, config.timensoffset_len);
                    s = SYNC_TIMEOFFSETS_ACK;
                    if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
                        sane_kill(stage1_pid, SIGKILL);
                        bail("failed to sync with child: write(SYNC_TIMEOFFSETS_ACK)");
                    }
                    break;
                case SYNC_CHILD_FINISH:
                // Stage-1 signals that it has finished all of its work, so we
                // can leave this synchronisation loop and move on to syncing
                // with the grandchild.
                    write_log(DEBUG, "stage-1 complete");
                    stage1_complete = true;
                    break;
                default:
                    bail("unexpected sync value: %u", s);
                }
            }
            write_log(DEBUG, "<- stage-1 synchronisation loop");

            /* Now sync with grandchild. */
            syncfd = sync_grandchild_pipe[1];
            if (close(sync_grandchild_pipe[0]) < 0)
                bail("failed to close sync_grandchild_pipe[0] fd");

            write_log(DEBUG, "-> stage-2 synchronisation loop");
            stage2_complete = false;
            while (!stage2_complete) {
            // Wait for the grandchild (stage-2) to finish its setup.
                enum sync_t s;

                write_log(DEBUG, "signalling stage-2 to run");
                s = SYNC_GRANDCHILD;
                if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
                    sane_kill(stage2_pid, SIGKILL);
                    bail("failed to sync with child: write(SYNC_GRANDCHILD)");
                }

                if (read(syncfd, &s, sizeof(s)) != sizeof(s))
                    bail("failed to sync with child: next state");

                switch (s) {
                case SYNC_CHILD_FINISH:
                    write_log(DEBUG, "stage-2 complete");
                    stage2_complete = true;
                    break;
                default:
                    bail("unexpected sync value: %u", s);
                }
            }
            write_log(DEBUG, "<- stage-2 synchronisation loop");
            write_log(DEBUG, "<~ nsexec stage-0");
            exit(0);
        }
        break;

        /*
         * Stage 1: We're in the first child process. Our job is to join any
         *          provided namespaces in the netlink payload and unshare all of
         *          the requested namespaces. If we've been asked to CLONE_NEWUSER,
         *          we will ask our parent (stage 0) to set up our user mappings
         *          for us. Then, we create a new child (stage 2: STAGE_INIT) for
         *          PID namespace. We then send the child's PID to our parent
         *          (stage 0).
         */
    case STAGE_CHILD:{
            pid_t stage2_pid = -1;
            enum sync_t s;

            /* For debugging. */
            current_stage = STAGE_CHILD;

            /* We're in a child and thus need to tell the parent if we die. */
            syncfd = sync_child_pipe[0];
            if (close(sync_child_pipe[1]) < 0)
                bail("failed to close sync_child_pipe[1] fd");

            /* For debugging. */
            prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
            write_log(DEBUG, "~> nsexec stage-1");

            /*
             * We need to setns first. We cannot do this earlier (in stage 0)
             * because of the fact that we forked to get here (the PID of
             * [stage 2: STAGE_INIT]) would be meaningless). We could send it
             * using cmsg(3) but that's just annoying.
             */
            // Join any pre-existing namespaces via setns(2).
            if (config.namespaces)
                join_namespaces(config.namespaces);

            /*
             * Deal with user namespaces first. They are quite special, as they
             * affect our ability to unshare other namespaces and are used as
             * context for privilege checks.
             *
             * We don't unshare all namespaces in one go. The reason for this
             * is that, while the kernel documentation may claim otherwise,
             * there are certain cases where unsharing all namespaces at once
             * will result in namespace objects being owned incorrectly.
             * Ideally we should just fix these kernel bugs, but it's better to
             * be safe than sorry, and fix them separately.
             *
             * A specific case of this is that the SELinux label of the
             * internal kern-mount that mqueue uses will be incorrect if the
             * UTS namespace is cloned before the USER namespace is mapped.
             * I've also heard of similar problems with the network namespace
             * in some scenarios. This also mirrors how LXC deals with this
             * problem.
             */
             // If CLONE_NEWUSER is in the clone flags, a new user namespace
             // was requested; it is handled here with unshare(2) first.
            if (config.cloneflags & CLONE_NEWUSER) {
                try_unshare(CLONE_NEWUSER, "user namespace");
                config.cloneflags &= ~CLONE_NEWUSER;

                /*
                 * We need to set ourselves as dumpable temporarily so that the
                 * parent process can write to our procfs files.
                 */
                if (config.namespaces) {
                    write_log(DEBUG, "temporarily set process as dumpable");
                    if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
                        bail("failed to temporarily set process as dumpable");
                }

                /*
                 * We don't have the privileges to do any mapping here (see the
                 * clone_parent rant). So signal stage-0 to do the mapping for
                 * us.
                 */
                write_log(DEBUG, "request stage-0 to map user namespace");
                // Ask the parent runc init (stage-0) to configure our user maps.
                s = SYNC_USERMAP_PLS;
                if (write(syncfd, &s, sizeof(s)) != sizeof(s))
                    bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");

                /* ... wait for mapping ... */
                write_log(DEBUG, "waiting stage-0 to complete the mapping of user namespace");
                if (read(syncfd, &s, sizeof(s)) != sizeof(s))
                    bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
                if (s != SYNC_USERMAP_ACK)
                    bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);

                /* Revert temporary re-dumpable setting. */
                if (config.namespaces) {
                    write_log(DEBUG, "re-set process as non-dumpable");
                    if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
                        bail("failed to re-set process as non-dumpable");
                }

                /* Become root in the namespace proper. */
                // Set this process's uids to 0, i.e. root inside the new user namespace.
                if (setresuid(0, 0, 0) < 0)
                    bail("failed to become root in user namespace");
            }

            /*
             * Unshare all of the namespaces. Now, it should be noted that this
             * ordering might break in the future (especially with rootless
             * containers). But for now, it's not possible to split this into
             * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
             *
             * Note that we don't merge this with clone() because there were
             * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
             * was broken, so we'll just do it the long way anyway.
             */
            try_unshare(config.cloneflags, "remaining namespaces");

            if (config.timensoffset) {
                write_log(DEBUG, "request stage-0 to write timens offsets");

                s = SYNC_TIMEOFFSETS_PLS;
                if (write(syncfd, &s, sizeof(s)) != sizeof(s))
                    bail("failed to sync with parent: write(SYNC_TIMEOFFSETS_PLS)");

                if (read(syncfd, &s, sizeof(s)) != sizeof(s))
                    bail("failed to sync with parent: read(SYNC_TIMEOFFSETS_ACK)");
                if (s != SYNC_TIMEOFFSETS_ACK)
                    bail("failed to sync with parent: SYNC_TIMEOFFSETS_ACK: got %u", s);
            }

            /*
             * TODO: What about non-namespace clone flags that we're dropping here?
             *
             * We fork again because of PID namespace, setns(2) or unshare(2) don't
             * change the PID namespace of the calling process, because doing so
             * would change the caller's idea of its own PID (as reported by getpid()),
             * which would break many applications and libraries, so we must fork
             * to actually enter the new PID namespace.
             */
            write_log(DEBUG, "spawn stage-2");

            // Spawn the grandchild. This process has finished the namespace
            // setup, and the grandchild inherits all of it.
            stage2_pid = clone_parent(&env, STAGE_INIT);
            if (stage2_pid < 0)
                bail("unable to spawn stage-2");

            /* Send the child to our parent, which knows what it's doing. */
            write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid);
            s = SYNC_RECVPID_PLS;
            if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
                sane_kill(stage2_pid, SIGKILL);
                bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
            }
            // Pass the grandchild's PID to the parent runc init (stage-0).
            if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
                sane_kill(stage2_pid, SIGKILL);
                bail("failed to sync with parent: write(stage2_pid)");
            }

            /* ... wait for parent to get the pid ... */
            if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
                sane_kill(stage2_pid, SIGKILL);
                bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
            }
            if (s != SYNC_RECVPID_ACK) {
                sane_kill(stage2_pid, SIGKILL);
                bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
            }

            write_log(DEBUG, "signal completion to stage-0");
            s = SYNC_CHILD_FINISH;
            // Send SYNC_CHILD_FINISH to the parent runc init (stage-0).
            if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
                sane_kill(stage2_pid, SIGKILL);
                bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
            }

            /* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */
            write_log(DEBUG, "<~ nsexec stage-1");
            exit(0);
        }
        break;

        /*
         * Stage 2: We're the final child process, and the only process that will
         *          actually return to the Go runtime. Our job is to just do the
         *          final cleanup steps and then return to the Go runtime to allow
         *          init_linux.go to run.
         */
    case STAGE_INIT:{
         // The grandchild runc init is the process that will actually start
         // the container entrypoint; before that, it performs the final
         // environment preparation steps here.
            /*
             * We're inside the child now, having jumped from the
             * start_child() code after forking in the parent.
             */
            enum sync_t s;

            /* For debugging. */
            current_stage = STAGE_INIT;

            /* We're in a child and thus need to tell the parent if we die. */
            syncfd = sync_grandchild_pipe[0];
            if (close(sync_grandchild_pipe[1]) < 0)
                bail("failed to close sync_grandchild_pipe[1] fd");

            if (close(sync_child_pipe[0]) < 0)
                bail("failed to close sync_child_pipe[0] fd");

            /* For debugging. */
            prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
            write_log(DEBUG, "~> nsexec stage-2");

            if (read(syncfd, &s, sizeof(s)) != sizeof(s))
                bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
            if (s != SYNC_GRANDCHILD)
                bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);

            if (setsid() < 0)
                bail("setsid failed");

            if (setuid(0) < 0)
                bail("setuid failed");

            if (setgid(0) < 0)
                bail("setgid failed");

            if (!config.is_rootless_euid && config.is_setgroup) {
                if (setgroups(0, NULL) < 0)
                    bail("setgroups failed");
            }

            write_log(DEBUG, "signal completion to stage-0");
            s = SYNC_CHILD_FINISH;
            if (write(syncfd, &s, sizeof(s)) != sizeof(s))
                bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");

            /* Close sync pipes. */
            if (close(sync_grandchild_pipe[0]) < 0)
                bail("failed to close sync_grandchild_pipe[0] fd");

            /* Free netlink data. */
            nl_free(&config);

            /* Finish executing, let the Go runtime take over. */
            write_log(DEBUG, "<= nsexec container setup");
            write_log(DEBUG, "booting up go runtime ...");

            // By this point stage-0 and stage-1 have exited (possibly with
            // some delay). This process must not exit, so we simply return
            // and let the Go code start running.
            return;
        }
        break;
    default:
        bail("unexpected jump value");
    }

    /* Should never be reached. */
    bail("should never be reached");
}

init_linux.startInitialization

// Normally, this function does not return. If it returns, with or without an
// error, it means the initialization has failed. If the error is returned,
// it means the error can not be communicated back to the parent.
func startInitialization() (retErr error) {
    // Get the synchronisation pipe.
    // The fd number comes from the _LIBCONTAINER_SYNCPIPE env var set by the
    // parent; this is the child side of the sync pipe mentioned earlier.
    envSyncPipe := os.Getenv("_LIBCONTAINER_SYNCPIPE")
    syncPipeFd, err := strconv.Atoi(envSyncPipe)
    if err != nil {
        return fmt.Errorf("unable to convert _LIBCONTAINER_SYNCPIPE: %w", err)
    }
    
    // Wrap the inherited fd in a sync socket for talking to the parent process.
    syncPipe := newSyncSocket(os.NewFile(uintptr(syncPipeFd), "sync"))
    defer syncPipe.Close()

    
    // containerInit returns a standard linuxStandardInit for create/run, or a
    // linuxSetnsInit when we are performing an exec. (NOTE: `it`, `config` and
    // the remaining file arguments come from parts of this function that are
    // elided in this excerpt.)
    // If init succeeds, it will not return, hence none of the defers will be called.
    return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifoFile, logPipe, dmzExe)
}

init_linux.containerInit

返回一个标准的 linuxStandardInit;如果我们是执行 exec 的话,则返回 linuxSetnsInit。

// containerInit builds and runs the appropriate init implementation for this
// process: linuxStandardInit for a freshly created container (initStandard),
// or linuxSetnsInit when joining an existing container, as in runc exec
// (initSetns). On success Init() does not return.
func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket, fifoFile, logPipe, dmzExe *os.File) error {
    // Make the container's configured environment visible to this process
    // before any init logic runs.
    if err := populateProcessEnvironment(config.Env); err != nil {
        return err
    }

    // Clean the RLIMIT_NOFILE cache in go runtime.
    // Issue: https://github.com/opencontainers/runc/issues/4195
    maybeClearRlimitNofileCache(config.Rlimits)

    if t == initSetns {
        setnsInit := &linuxSetnsInit{
            pipe:          pipe,
            consoleSocket: consoleSocket,
            pidfdSocket:   pidfdSocket,
            config:        config,
            logPipe:       logPipe,
            dmzExe:        dmzExe,
        }
        return setnsInit.Init()
    }
    if t == initStandard {
        stdInit := &linuxStandardInit{
            pipe:          pipe,
            consoleSocket: consoleSocket,
            pidfdSocket:   pidfdSocket,
            parentPid:     unix.Getppid(),
            config:        config,
            fifoFile:      fifoFile,
            logPipe:       logPipe,
            dmzExe:        dmzExe,
        }
        return stdInit.Init()
    }
    return fmt.Errorf("unknown init type %q", t)
}

runc start

runc start 对已创建的容器的启动,是靠打开(读取)新创建容器的 exec.fifo、解除 init 进程的阻塞使其执行后续流程来完成的。

Container.exec

// libcontainer/container_linux.go

// exec implements the "runc start" side of the handshake: opening the
// container's exec.fifo unblocks the init process (which is blocked on that
// FIFO), letting it exec the user-defined entrypoint.
func (c *Container) exec() error {
    path := filepath.Join(c.stateDir, execFifoFilename)
    pid := c.initProcess.pid()
    // Open /run/runc/<id>/exec.fifo in the background. Once it is opened for
    // reading, the init process's blocking operation on the FIFO completes,
    // so init stops blocking and continues execution.
    // (NOTE(review): the original comment attributed this to "socketpair"
    // semantics, but exec.fifo appears to be a named pipe — confirm.)
    blockingFifoOpenCh := awaitFifoOpen(path)
    for {
        select {
        case result := <-blockingFifoOpenCh:
        // handleFifoResult reads the FIFO contents and finally removes exec.fifo.
            return handleFifoResult(result)

        case <-time.After(time.Millisecond * 100):
            // Poll the init process every 100ms so we do not wait forever on
            // a process that died (or became a zombie) before the FIFO opened.
            stat, err := system.Stat(pid)
            if err != nil || stat.State == system.Zombie {
                // could be because process started, ran, and completed between our 100ms timeout and our system.Stat() check.
                // see if the fifo exists and has data (with a non-blocking open, which will succeed if the writing process is complete).
                if err := handleFifoResult(fifoOpen(path, false)); err != nil {
                    return errors.New("container process is already dead")
                }
                return nil
            }
        }
    }
}

runc exec

runc exec 是在容器 init 进程所在的 namespace 中启动额外的进程。

应对如 docker exec 在容器中启动 命令行终端进程的场景

setnsProcess.start

cmd 终于开始执行了:执行 runc init 时,init 程序会调用 nsexec.c 的代码(这也是一个子进程,而且是在 init 进程前启动,设置 ns 后就会退出);通过 _LIBCONTAINER_INITTYPE=setns 这个环境变量判别用什么模式进行 setns:如果是 standard,则使用 clone namespace 为容器建立新的 namespace;这里我们是 setns,所以是为容器指定了需要进入的进程 namespace。这一步的信息传递是通过 socket 与 nsexec.c 这个程序交互,以进程间通信技术完成;容器启动所需要的 namespace 数据放在 setnsProcess.bootstrapData 内。

// start launches the "runc init" helper for an exec: it spawns the child,
// feeds it the bootstrap (namespace) data, waits for it to setns into the
// target container, then joins it to the container's cgroups before
// completing the sync handshake. (Several parts of this function are elided
// in this excerpt — e.g. the declarations of waitInit, ierr, seenProcReady.)
func (p *setnsProcess) start() (retErr error) {
    defer p.comm.closeParent()

    if p.process.IOPriority != nil {
        if err := setIOPriority(p.process.IOPriority); err != nil {
            return err
        }
    }

    // get the "before" value of oom kill count
    oom, _ := p.manager.OOMKillCount()
    
    // Start the "runc init" command (the child process).
    err := p.cmd.Start()
    // close the child-side of the pipes (controlled by child)
    p.comm.closeChild()
    if err != nil {
        return fmt.Errorf("error starting setns process: %w", err)
    }
    // ...

    // Send the namespace (bootstrap) data to nsenter over the init pipe.
    if p.bootstrapData != nil {
        if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil {
            return fmt.Errorf("error copying bootstrap data to pipe: %w", err)
        }
    }
    err = <-waitInit
    if err != nil {
        return err
    }
    // Wait for the exec side to set up the namespaces; the child reports the
    // final pid back over the message pipe, and the process attribute is set
    // to the runc init pid so later operations can target it.
    if err := p.execSetns(); err != nil {
        return fmt.Errorf("error executing setns process: %w", err)
    }
    // Add the runc init process into the existing container's cgroups.
    for _, path := range p.cgroupPaths {
        if err := cgroups.WriteCgroupProc(path, p.pid()); err != nil && !p.rootlessCgroups {
            // On cgroup v2 + nesting + domain controllers, WriteCgroupProc may fail with EBUSY.
            // https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
            // Try to join the cgroup of InitProcessPid.
            if cgroups.IsCgroup2UnifiedMode() && p.initProcessPid != 0 {
                initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
                initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
                if initCgErr == nil {
                    if initCgPath, ok := initCg[""]; ok {
                        initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath)
                        logrus.Debugf("adding pid %d to cgroups %v failed (%v), attempting to join %q (obtained from %s)",
                            p.pid(), p.cgroupPaths, err, initCg, initCgDirpath)
                        // NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container.
                        err = cgroups.WriteCgroupProc(initCgDirpath, p.pid())
                    }
                }
            }
            if err != nil {
                return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
            }
        }
    }
    // ...
    if err := p.comm.syncSockParent.Shutdown(unix.SHUT_WR); err != nil && ierr == nil {
        return err
    }
    if !seenProcReady && ierr == nil {
        ierr = errors.New("procReady not received")
    }
    // Must be done after Shutdown so the child will exit and we can wait for it.
    if ierr != nil {
        _, _ = p.wait()
        return ierr
    }
    return nil
}

pause

Container.Pause

可以看到容器停止,主要用到了cgroup的机制做冻结。

// Pause freezes every process in the container via the cgroup freezer
// and transitions the container state to PAUSED. Pausing is only legal
// from the RUNNING or CREATED states; any other state yields
// ErrNotRunning. If the state is already PAUSED this also returns
// ErrNotRunning (the switch below does not match).
func (c *Container) Pause() error {
    c.m.Lock()
    defer c.m.Unlock()

    status, err := c.currentStatus()
    if err != nil {
        return err
    }
    // Only a running or freshly created container may be frozen.
    if status != Running && status != Created {
        return ErrNotRunning
    }
    // Flip the cgroup freezer to FROZEN: the kernel stops scheduling
    // every task in the container's cgroup.
    if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
        return err
    }
    // Record the new lifecycle state.
    return c.state.transition(&pausedState{c: c})
}

resume

Container.Resume

// Resume thaws the container's cgroup freezer so all of its processes
// continue executing, then transitions the container state to RUNNING.
// It returns ErrNotPaused unless the current state is PAUSED.
func (c *Container) Resume() error {
    c.m.Lock()
    defer c.m.Unlock()

    status, err := c.currentStatus()
    if err != nil {
        return err
    }
    // Resuming is only meaningful for a paused container.
    if status != Paused {
        return ErrNotPaused
    }
    // The inverse of Pause: set the cgroup freezer back to THAWED so
    // the kernel resumes scheduling the container's tasks.
    if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
        return err
    }
    // Record the new lifecycle state.
    return c.state.transition(&runningState{c: c})
}

参考文档

  • https://juejin.cn/post/6903527508784873485

  • https://www.51cto.com/article/744467.html

  • https://blog.csdn.net/m0_45406092/article/details/130660743

  • https://imkira.com/runc/

  • https://fankangbest.github.io/2017/11/22/runc%E6%BA%90%E7%A0%81%E5%88%86%E6%9E%90(%E4%B8%80)-create%E5%92%8Cstart%E6%B5%81%E7%A8%8B-v1-0-0-rc2/

  • https://fankangbest.github.io/2017/11/22/runc%E6%BA%90%E7%A0%81%E5%88%86%E6%9E%90(%E4%B8%80)-create%E5%92%8Cstart%E6%B5%81%E7%A8%8B-v1-0-0-rc2/

  • https://www.cyisme.top/cloud_native/containerd/run/

  • https://jimmysong.io/book/kubernetes-handbook/objects/pause-container/

  • https://github.com/rfyiamcool/notes/blob/main/kubernetes_pause_code.md

  • https://blog.csdn.net/zhonglinzhang/article/details/99458561

  • https://arthurchiao.art/blog/what-happens-when-k8s-creates-pods-5-zh/

  • https://zhuanlan.zhihu.com/p/693687096

  • https://juejin.cn/post/6903527508784873485

你可能感兴趣的:(kubernetes源码分析,kubernetes,容器)