Linux: 多核 CPU 启动流程简析

文章目录

  • 1. 前言
  • 2. 分析背景
  • 3. CPU 多核启动流程
    • 3.1 支持 PSCI 平台的 CPU 启动流程
      • 3.1.1 BOOT CPU 启动流程
      • 3.1.2 非 BOOT CPU 启动流程
    • 3.2 不支持 PSCI 平台的 CPU 启动流程
      • 3.2.1 BOOT CPU 启动流程
      • 3.2.2 非 BOOT CPU 启动流程
    • 3.3 注册 CPU 设备到系统
  • 4. CPU 热插拔管理
    • 4.1 创建 CPU 热插拔管理线程
      • 4.1.1 创建 BOOT CPU 热插拔管理线程
      • 4.1.2 创建非 BOOT CPU 热插拔管理线程
    • 4.2 CPU 热插拔过程
      • 4.2.1 CPU offline
      • 4.2.2 CPU online
      • 4.2.3 CPU 热插拔管理线程的唤醒
  • 5. 参考资料

1. 前言

限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。

2. 分析背景

本文基于 ARMv7 架构 + linux-4.14.132内核代码 进行分析。对涉及的 ATF(Arm Trusted Firmware) 以及 ARMv7 CPU HYP 模式 知识不做展开,读者可自行阅读相关资料进行了解。

3. CPU 多核启动流程

ARMv7 架构下,SoC 的一般启动流程大概如下:

上电 --> SoC Boot ROM --> SPL --> U-BOOT --> Linux 内核

在进入 Linux 内核 之前,通常只启动了一个 BOOT CPU (通常是 CPU 0),而其它的 CPU 核处于待机状态。我们的分析,直接从 Linux 内核 入口开始,我们也不会讨论 Linux 内核 的解压过程。

3.1 支持 PSCI 平台的 CPU 启动流程

3.1.1 BOOT CPU 启动流程

从内核链接脚本 arch/arm/kernel/vmlinux.lds.S 的如下片段:

/* include/asm-generic/vmlinux.lds.h */

/* Section used for early init (in .S files) */
#define HEAD_TEXT  *(.head.text)
/* arch/arm/kernel/vmlinux.lds.S */

...
OUTPUT_ARCH(arm)
ENTRY(stext) /* 内核入口 */

...

SECTIONS
{
	...
	. = PAGE_OFFSET + TEXT_OFFSET;
	.head.text : {
		_text = .;
		HEAD_TEXT
	}
	...
}

了解到内核的入口在 arch/arm/kernel/head.S 中:

/* include/linux/init.h */

/* For assembly routines */
#define __HEAD		.section	".head.text","ax"
/* arch/arm/kernel/head.S */

/*
 * Kernel startup entry point.
 * ---------------------------
 *
 * This is normally called from the decompressor code.  The requirements
 * are: MMU = off, D-cache = off, I-cache = dont care, r0 = 0,
 * r1 = machine nr, r2 = atags or dtb pointer.
 *
 * This code is mostly position independent, so if you link the kernel at
 * 0xc0008000, you call this at __pa(0xc0008000).
 *
 * See linux/arch/arm/tools/mach-types for the complete list of machine
 * numbers for r1.
 *
 * We're trying to keep crap to a minimum; DO NOT add any machine specific
 * crap here - that's what the boot loader (or in extreme, well justified
 * circumstances, zImage) is for.
 */
	.arm

	__HEAD
ENTRY(stext)
	ARM_BE8(setend	be )			@ ensure we are in BE8 mode

	...
	
#ifdef CONFIG_ARM_VIRT_EXT
	bl	__hyp_stub_install
#endif
	@ ensure svc mode and all interrupts masked
	safe_svcmode_maskall r9

	mrc	p15, 0, r9, c0, c0		@ get processor id
	bl	__lookup_processor_type		@ r5=procinfo r9=cpuid
	movs	r10, r5				@ invalid processor (r5=0)?
	...

#ifdef CONFIG_ARM_LPAE
	mrc	p15, 0, r3, c0, c1, 4		@ read ID_MMFR0
	and	r3, r3, #0xf			@ extract VMSA support
	cmp	r3, #5				@ long-descriptor translation table format?
	...
#endif

	ldr	r8, =PLAT_PHYS_OFFSET		@ always constant in this case

	/*
	 * r1 = machine no, r2 = atags or dtb,
	 * r8 = phys_offset, r9 = cpuid, r10 = procinfo
	 */
	bl	__vet_atags
#ifdef CONFIG_SMP_ON_UP
	bl	__fixup_smp
#endif
#ifdef CONFIG_ARM_PATCH_PHYS_VIRT
	bl	__fixup_pv_table
#endif
	bl	__create_page_tables /* 建立内核页表 */

	ldr	r13, =__mmap_switched		@ address to jump to after
									@ mmu has been enabled
	badr	lr, 1f				@ return (PIC) address
#ifdef CONFIG_ARM_LPAE
	mov	r5, #0				@ high TTBR0
	mov	r8, r4, lsr #12			@ TTBR1 is swapper_pg_dir pfn
#else
	mov	r8, r4				@ set TTBR1 to swapper_pg_dir
#endif
	ldr	r12, [r10, #PROCINFO_INITFUNC]
	add	r12, r12, r10
	ret	r12
/*
 * The following fragment of code is executed with the MMU on in MMU mode,
 * and uses absolute addresses; this is not position independent.
 *
 *  r0  = cp#15 control register
 *  r1  = machine ID
 *  r2  = atags/dtb pointer
 *  r9  = processor ID
 */
	__INIT
__mmap_switched:
	adr	r3, __mmap_switched_data

	ldmia	r3!, {r4, r5, r6, r7}
	cmp	r4, r5				@ Copy data segment if needed
1:	cmpne	r5, r6
	ldrne	fp, [r4], #4
	strne	fp, [r5], #4
	bne	1b

	mov	fp, #0				@ Clear BSS (and zero fp)
1:	cmp	r6, r7
	strcc	fp, [r6],#4
	bcc	1b

 ARM(	ldmia	r3, {r4, r5, r6, r7, sp})
 THUMB(	ldmia	r3, {r4, r5, r6, r7}	)
 THUMB(	ldr	sp, [r3, #16]		)
	str	r9, [r4]			@ Save processor ID
	str	r1, [r5]			@ Save machine type
	str	r2, [r6]			@ Save atags pointer
	cmp	r7, #0
	strne	r0, [r7]			@ Save control register values
	b	start_kernel /* start_kernel() */
ENDPROC(__mmap_switched)

内核流程从汇编代码进入了 C 入口 start_kernel()

/* init/main.c */

asmlinkage __visible void __init start_kernel(void)
{
	...
	pr_notice("%s", linux_banner);
	setup_arch(&command_line);

	...

	sched_init();

	...
	
	/* Do the rest non-__init'ed, we're now alive */
	rest_init();
}

setup_arch() 中 解析 CPU DTS 配置, 以及 PSCI(Power State Coordination Interface) 初始化。看 CPU 相关的 DTS 配置:

/ {
	cpus {
		#address-cells = <1>;
		#size-cells = <0>;

		cpu0: cpu@0 {
			compatible = "arm,cortex-a7";
			device_type = "cpu";
			reg = <0>;
			clocks = <&ccu CLK_CPUX>;
			clock-latency = <244144>; /* 8 32k periods */
			clock-frequency = <1200000000>;
		};

		cpu@1 {
			compatible = "arm,cortex-a7";
			device_type = "cpu";
			reg = <1>;
			clock-frequency = <1200000000>;
		};

		cpu@2 {
			compatible = "arm,cortex-a7";
			device_type = "cpu";
			reg = <2>;
			clock-frequency = <1200000000>;
		};

		cpu@3 {
			compatible = "arm,cortex-a7";
			device_type = "cpu";
			reg = <3>;
			clock-frequency = <1200000000>;
		};
	};

	...
};
/* arch/arm/kernel/psci_smp.c */

const struct smp_operations psci_smp_ops __initconst = {
	.smp_boot_secondary	= psci_boot_secondary,
#ifdef CONFIG_HOTPLUG_CPU
	.cpu_disable		= psci_cpu_disable,
	.cpu_die		= psci_cpu_die,
	.cpu_kill		= psci_cpu_kill,
#endif
};
/* arch/arm/kernel/setup.c */

/*
 * Architecture-specific early setup (abridged excerpt). Of interest here:
 * the CPU device-tree maps are parsed, PSCI is probed, and the SMP
 * operations used later to boot the secondary CPUs are selected.
 */
void __init setup_arch(char **cmdline_p)
{
	...
	
	/* Parse the "cpus" device-tree configuration */
	arm_dt_init_cpu_maps();
	/* Initialise ARM PSCI (Power State Coordination Interface) */
	psci_dt_init();
#ifdef CONFIG_SMP
	if (is_smp()) {
		/* Only fall back to PSCI / mdesc->smp when the machine has no smp_init hook, or the hook returns false */
		if (!mdesc->smp_init || !mdesc->smp_init()) {
			if (psci_smp_available()) /* PSCI (Power State Coordination Interface) is usable */
				smp_set_ops(&psci_smp_ops); /* use the PSCI smp_operations */
			else if (mdesc->smp)
				...
		}
		smp_init_cpus();
		smp_build_mpidr_hash();
	}
#endif

	...
}

ARM PSCI 用来管理 CPU 的 启动、关闭、休眠、重启等工作。我们先看 PSCI 配置相关的 DTS 配置:

/{
	cpus {
		...
	};
	...
	/* PSCI 配置 */
	psci {
		compatible = "arm,psci-1.0";
		method = "smc";
	};
};

有时候,PSCI DTS 配置可能是由 U-BOOT 动态插入的,所以你无法在内核的 DTS 中找到它。继续看 PSCI 的初始化:

/* drivers/firmware/psci.c */

static const struct of_device_id psci_of_match[] __initconst = {
	{ .compatible = "arm,psci",	.data = psci_0_1_init},
	{ .compatible = "arm,psci-0.2",	.data = psci_0_2_init},
	{ .compatible = "arm,psci-1.0",	.data = psci_0_2_init},
	{},
};

/*
 * Locate the PSCI node in the device tree and run the init function that
 * psci_of_match associates with its "compatible" string.
 *
 * Returns -ENODEV when no matching, available node exists; otherwise the
 * result of the selected init function.
 *
 * Note: the declarations of matched_np and init_fn are not shown in this
 * excerpt.
 */
int __init psci_dt_init(void)
{
	struct device_node *np;

	np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np);

	/* No (available) "psci" node is configured */
	if (!np || !of_device_is_available(np))
		return -ENODEV;

	/*
	 * The init function comes from the .data field of the matched
	 * psci_of_match entry: psci_0_1_init() for "arm,psci", and
	 * psci_0_2_init() for "arm,psci-0.2" and "arm,psci-1.0" (the latter is
	 * the compatible string used by the DTS example in this article).
	 */
	init_fn = (psci_initcall_t)matched_np->data;
	return init_fn(np);
}

static int __init psci_0_1_init(struct device_node *np)
{
	int err;
	
	err = get_set_conduit_method(np); /* 设置 发起 PSCI 功能接口 请求方式 (SMC) */
	
	...

	pr_info("Using PSCI v0.1 Function IDs from DT\n");

	if (!of_property_read_u32(np, "cpu_suspend", &id)) {
		psci_function_id[PSCI_FN_CPU_SUSPEND] = id;
		psci_ops.cpu_suspend = psci_cpu_suspend;
	}

	if (!of_property_read_u32(np, "cpu_off", &id)) {
		psci_function_id[PSCI_FN_CPU_OFF] = id;
		psci_ops.cpu_off = psci_cpu_off;
	}

	if (!of_property_read_u32(np, "cpu_on", &id)) {
		psci_function_id[PSCI_FN_CPU_ON] = id;
		psci_ops.cpu_on = psci_cpu_on;
	}

	if (!of_property_read_u32(np, "migrate", &id)) {
		psci_function_id[PSCI_FN_MIGRATE] = id;
		psci_ops.migrate = psci_migrate;
	}

	...
	return err;
}

static int get_set_conduit_method(struct device_node *np)
{
	const char *method;
	
	pr_info("probing for conduit method from DT.\n");

	if (of_property_read_string(np, "method", &method)) {
		pr_warn("missing \"method\" property\n");
		return -ENXIO;
	}

	if (!strcmp("hvc", method)) {
		set_conduit(PSCI_CONDUIT_HVC);
	} else if (!strcmp("smc", method)) { /* 我们的 DTS 配置通过 SMC 指令发起 PSCI 功能请求 */
		set_conduit(PSCI_CONDUIT_SMC);
	} else {
		pr_warn("invalid \"method\" property: %s\n", method);
		return -EINVAL;
	}
	return 0;
}

static void set_conduit(enum psci_conduit conduit)
{
	switch (conduit) {
	...
	case PSCI_CONDUIT_SMC:
		invoke_psci_fn = __invoke_psci_fn_smc;
		break;
	...
	}

	psci_ops.conduit = conduit;
}

接下来看每个 CPU 的 idle 进程的初始化工作:

/* kernel/sched/core.c */

void __init sched_init(void)
{
	...
	/*
	 * Make us the idle thread. Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */
	/* 初始化当前 CPU 的 idle 进程 */
	init_idle(current, smp_processor_id());
	...
}

/* 初始化 @cpu 的 idle 进程 */
void init_idle(struct task_struct *idle, int cpu)
{
	struct rq *rq = cpu_rq(cpu); /* @cpu 的运行队列 */

	...
	
	__sched_fork(0, idle);
	idle->state = TASK_RUNNING;
	idle->se.exec_start = sched_clock();
	idle->flags |= PF_IDLE;

#ifdef CONFIG_SMP
	set_cpus_allowed_common(idle, cpumask_of(cpu)); /* 限定 idle 进程到 @cpu 上运行 */
#endif

	...
	__set_task_cpu(idle, cpu);
	...

	rq->curr = rq->idle = idle; /* 设置 @cpu 运行队列当前进程为 idle */
	idle->on_rq = TASK_ON_RQ_QUEUED;
#ifdef CONFIG_SMP
	idle->on_cpu = 1;
#endif
	...

	init_idle_preempt_count(idle, cpu); /* 开启 @cpu 的抢占 */

	idle->sched_class = &idle_sched_class;
	...
#ifdef CONFIG_SMP
	/* 设置 idle 进程名为 "swapper/%d" */
	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
}

BOOT CPU 的启动过程接近尾声了,接下来就是其它非 BOOT CPU 启动的前期准备工作:

/* init/main.c */

/*
 * Last step of start_kernel() on the boot CPU (abridged excerpt): spawn the
 * kernel_init and kthreadd kernel threads, then enter the boot CPU's idle loop.
 */
static noinline void __ref rest_init(void)
{
	/*
	 * Kernel thread that performs the remaining kernel initialisation;
	 * the non-boot CPUs are brought up later from kernel_init().
	 */
	pid = kernel_thread(kernel_init, NULL, CLONE_FS);
	...
	/*
	 * Pin the thread just created above to the CPU we are running on.
	 * Note that tsk is looked up from the pid returned for kernel_init,
	 * i.e. it is the init thread, not the boot CPU's idle task.
	 */
	rcu_read_lock();
	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
	set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
	rcu_read_unlock();

	/* Create kthreadd, the kernel thread that creates other kernel threads */
	pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
	...

	system_state = SYSTEM_SCHEDULING;

	complete(&kthreadd_done);

	schedule_preempt_disabled();
	cpu_startup_entry(CPUHP_ONLINE); /* the boot CPU enters its idle loop */
}
/* kernel/sched/idle.c */

/* 在 BOOT CPU 上启动其 idle 进程 */
void cpu_startup_entry(enum cpuhp_state state)
{
	...
	while (1)
		do_idle();
}

到此,BOOT CPU 已经启动完毕,进入了其 idle 进程。

3.1.2 非 BOOT CPU 启动流程

前面我们看到,从 BOOT CPU 启动了一个入口为 kernel_init() 的内核线程,它负责完成内核中剩余的初始化工作,其中就包括 非 BOOT CPU 的启动工作。我们来看 非 BOOT CPU 启动的细节。

kernel_init()
	kernel_init_freeable()
		smp_init() /*  启动其它 非 BOOT CPU */
/* kernel/smp.c */

/* Called by boot processor to activate the rest. */
void __init smp_init(void)
{
	idle_threads_init(); /* 为系统中的所有 CPU 创建每 CPU 的 idle 线程数据(task_struct等) 并初始化 */ 
	cpuhp_threads_init(); /* 为系统中的所有 CPU 创建每 CPU 的 热插拔管理 内核线程 */

	pr_info("Bringing up secondary CPUs ...\n");

	/* 启动所有非 BOOT CPU ,逐个按顺序启动 */
	for_each_present_cpu(cpu) {
		if (num_online_cpus() >= setup_max_cpus)
			break;
		if (!cpu_online(cpu)) /* CPU 尚未启动 */
			cpu_up(cpu); /* 启动 CPU @cpu */
	}

	...
}

cpu_up() 启动一个 CPU:

/* kernel/cpu.c */

int cpu_up(unsigned int cpu)
{
	return do_cpu_up(cpu, CPUHP_ONLINE);
}

static int do_cpu_up(unsigned int cpu, enum cpuhp_state target)
{
	int err = 0;

	...

	err = _cpu_up(cpu, 0, target);
	...
	return err;
}

static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
{
	int ret = 0;
	struct task_struct *idle;
	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
	
	...

	if (st->state == CPUHP_OFFLINE) { /* 如果 @cpu 处于离线关闭状态 */
		idle = idle_thread_get(cpu);
		...
	}
	
	cpuhp_set_state(st, target); /* 标记 @cpu 为目标状态 @target: st->target = CPUHP_ONLINE */

	target = min((int)target, CPUHP_BRINGUP_CPU);
	ret = cpuhp_up_callbacks(cpu, st, target);
	...
	return ret;
}

/* 设置 CPU 目标状态,返回 CPU 的当前状态 */
static inline enum cpuhp_state
cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
	enum cpuhp_state prev_state = st->state;

	st->rollback = false;
	st->last = NULL;

	st->target = target; /* 设置 CPU 目标状态为 @target */
	st->single = false;
	/*
	 * 如果 CPU 的 【目标状态 @target > 当前状态 @st->state】,设为 true ,表示是 CPU 启动正向过程; 
	 * 如果 CPU 的 【目标状态 @target <= 当前状态 @st->state】,设为 false ,表示是 CPU 关闭反向过程。
	 */
	st->bringup = st->state < target;

	return prev_state; /* 返回 CPU 的当前状态 */
}

/*
 * Walk @cpu's hotplug state upwards, invoking one startup callback per state
 * until st->state reaches @target (abridged excerpt; error handling elided).
 */
static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
			      enum cpuhp_state target)
{
	enum cpuhp_state prev_state = st->state;
	int ret = 0;

	/*
	 * The state is incremented before its callback is invoked, so the
	 * callbacks that run are those of the states (prev_state, target].
	 * When reached from _cpu_up() for an offline CPU, that is the range
	 * [CPUHP_OFFLINE + 1, CPUHP_BRINGUP_CPU].
	 */
	while (st->state < target) {
		st->state++;
		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
		...
	}
	return ret;
}

static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
				 bool bringup, struct hlist_node *node,
				 struct hlist_node **lastp)
{
	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
	struct cpuhp_step *step = cpuhp_get_step(state); /* 获取 @state 的回调接口 */

	if (!step->multi_instance) {
		...
		/* 我们只关心和分析相关的状态回调 bringup_cpu() */
		ret = cb(cpu); /* ..., bringup_cpu() */
		...
		return ret;
	}
}

/* Boot processor state steps */
/* 
 * 我们只关注状态 CPUHP_BRINGUP_CPU 的回调,其它的状态回调对我们
 * 的分析没有本质影响。
 */
static struct cpuhp_step cpuhp_bp_states[] = {
	...
#ifdef CONFIG_SMP
	...
	/* Kicks the plugged cpu into life */
	[CPUHP_BRINGUP_CPU] = {
		.name			= "cpu:bringup",
		.startup.single		= bringup_cpu,
		.teardown.single	= NULL,
		.cant_stop		= true,
	},
	...
#else
	...
#endif
};

static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
{
	struct cpuhp_step *sp;

	sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states;
	return sp + state;
}

static bool cpuhp_is_ap_state(enum cpuhp_state state)
{
	/*
	 * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
	 * purposes as that state is handled explicitly in cpu_down.
	 */
	return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
}

看 CPU 热插拔状态 CPUHP_BRINGUP_CPU 回调 bringup_cpu()

/* kernel/cpu.c */
/*
 * Startup callback of the CPUHP_BRINGUP_CPU state (abridged excerpt): start
 * @cpu through the architecture code, then wait until it has come up.
 */
static int bringup_cpu(unsigned int cpu)
{
	struct task_struct *idle = idle_thread_get(cpu);
	int ret;

	...
	/* Arch-specific enabling code. */
	ret = __cpu_up(cpu, idle); /* enter the architecture-specific bring-up path */
	...

	/*
	 * The wait below is completed at the very end of the non-boot CPU's
	 * start-up, when that CPU enters the CPUHP_AP_ONLINE_IDLE state:
	 * secondary_start_kernel()
	 *		cpu_startup_entry(CPUHP_AP_ONLINE_IDLE)
	 *			cpuhp_online_idle(state)
	 *				st->state = CPUHP_AP_ONLINE_IDLE;
	 *				complete_ap_thread(st, true);
	 *			while (1)
	 *				do_idle();
	 */
	return bringup_wait_for_ap(cpu); /* wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
}

CPU 启动 ARM 架构相关的流程:

/* arch/arm/kernel/smp.c */
int __cpu_up(unsigned int cpu, struct task_struct *idle)
{
	/* 配置 @cpu 的内核栈空间 */
	secondary_data.stack = task_stack_page(idle) + THREAD_START_SP;
	...

#ifdef CONFIG_MMU
	/* 配置 @cpu 的页表 */
	secondary_data.pgdir = virt_to_phys(idmap_pgd);
	secondary_data.swapper_pg_dir = get_arch_pgd(swapper_pg_dir);
#endif
	sync_cache_w(&secondary_data); /* cache 同步 */

	ret = smp_ops.smp_boot_secondary(cpu, idle); /* psci_boot_secondary() */
	if (ret == 0) {
		/*
		 * CPU was successfully started, wait for it
		 * to come online or time out.
		 */
		/* 等待 CPU 成功启动:
		 * secondary_start_kernel() -> complete(&cpu_running) 
		 */ 
		wait_for_completion_timeout(&cpu_running,
						 msecs_to_jiffies(1000));
		
		if (!cpu_online(cpu)) { /* CPU 应该已经处于在线状态 */
			pr_crit("CPU%u: failed to come online\n", cpu);
			ret = -EIO;
		}
	} else {
		...
	}

	/* secondary_data 数据是所有非 BOOT CPU 共享的,每个 CPU 启动时都要重新设置 */
	memset(&secondary_data, 0, sizeof(secondary_data));
	return ret;
}

从这里开始,通过 PSCI 接口 psci_boot_secondary() 来启动 CPU :

/* arch/arm/kernel/psci_smp.c */

static int psci_boot_secondary(unsigned int cpu, struct task_struct *idle)
{
	if (psci_ops.cpu_on) /* psci_cpu_on() */
		return psci_ops.cpu_on(cpu_logical_map(cpu),
					virt_to_idmap(&secondary_startup));
	return -ENODEV;
}
/* drivers/firmware/psci.c */

/*
 * Issue the PSCI CPU_ON request for the CPU identified by @cpuid, passing
 * @entry_point as the address at which that CPU is to start executing.
 * Returns the PSCI status converted to a Linux errno.
 */
static int psci_cpu_on(unsigned long cpuid, unsigned long entry_point)
{
	int err;
	u32 fn; /* PSCI function ID of the request */

	fn = psci_function_id[PSCI_FN_CPU_ON]; /* function ID registered for CPU_ON */
	// Forward the request to the firmware that implements PSCI (ATF in this
	// article) through the configured conduit (the SMC instruction for the
	// DTS shown earlier).
	// NOTE(review): under the PSCI CPU_ON contract it is the *target* CPU
	// that begins executing at @entry_point (secondary_startup, as passed
	// in by psci_boot_secondary()); the calling CPU only receives the
	// status code here and carries on. Confirm against the PSCI spec.
	err = invoke_psci_fn(fn, cpuid, entry_point, 0); /* __invoke_psci_fn_smc() */
	return psci_to_linux_errno(err);
}

进入 ATF 固件中的 PSCI 启动 CPU 功能接口(CPU_ON)后,固件会让被启动的目标 CPU 从 secondary_startup 开始执行(发起请求的 CPU 则从 SMC 调用返回,继续执行 __cpu_up() 的后续流程):

/* arch/arm/kernel/head.S */

/*
 * Entry point of a secondary (non-boot) CPU. Looks up the processor type,
 * fetches the page-table addresses published by __cpu_up() in secondary_data,
 * and hands over to the processor-specific init function.
 */
ENTRY(secondary_startup)
#ifdef CONFIG_ARM_VIRT_EXT
	bl	__hyp_stub_install_secondary
#endif
	safe_svcmode_maskall r9

	mrc	p15, 0, r9, c0, c0		@ get processor id
	bl	__lookup_processor_type
	movs	r10, r5				@ invalid processor?
	moveq	r0, #'p'			@ yes, error 'p'
	beq	__error_p

	/*
	 * Use the page tables supplied from  __cpu_up.
	 */
	adr	r4, __secondary_data
	ldmia	r4, {r5, r7, r12}		@ address to jump to after
	sub	lr, r4, r5			@ mmu has been enabled
	add	r3, r7, lr
	ldrd	r4, [r3, #0]			@ get secondary_data.pgdir
ARM_BE8(eor	r4, r4, r5)			@ Swap r5 and r4 in BE:
ARM_BE8(eor	r5, r4, r5)			@ it can be done in 3 steps
ARM_BE8(eor	r4, r4, r5)			@ without using a temp reg.
	ldr	r8, [r3, #8]			@ get secondary_data.swapper_pg_dir
	badr	lr, __enable_mmu		@ return address
	mov	r13, r12			@ __secondary_switched address
	ldr	r12, [r10, #PROCINFO_INITFUNC]
	add	r12, r12, r10			@ initialise processor
						@ (return control reg)
	/*
	 * This does not jump straight to secondary_start_kernel(): r12 holds
	 * the processor init function taken from procinfo. The return address
	 * in lr is __enable_mmu, and r13 carries the __secondary_switched
	 * address, which is the code that finally branches to
	 * secondary_start_kernel().
	 */
	ret	r12
ENDPROC(secondary_startup)

/*
 * Reached once the secondary CPU runs with the MMU enabled: load the stack
 * pointer published by __cpu_up() and enter the C code.
 *
 * r7  = &secondary_data (loaded by secondary_startup from __secondary_data)
 */
ENTRY(__secondary_switched)
	ldr	sp, [r7, #12]			@ get secondary_data.stack
	mov	fp, #0
	b	secondary_start_kernel
ENDPROC(__secondary_switched)

	.align

	.type	__secondary_data, %object
__secondary_data:
	.long	.
	.long	secondary_data
	.long	__secondary_switched
/* arch/arm/kernel/smp.c */

asmlinkage void secondary_start_kernel(void)
{
	struct mm_struct *mm = &init_mm;
	unsigned int cpu;

	secondary_biglittle_init();
	
	cpu_switch_mm(mm->pgd, mm);
	local_flush_bp_all();
	enter_lazy_tlb(mm, current);
	local_flush_tlb_all();

	cpu = smp_processor_id();
	mmgrab(mm);
	current->active_mm = mm;
	cpumask_set_cpu(cpu, mm_cpumask(mm));

	cpu_init();

	preempt_disable(); /* 禁用当前CPU抢占 */

	/* 
	 * CPU 热插拔上线前的所有准备工作: 
	 * 触发所有状态 CPUHP_AP_ONLINE 之前的 cpu 热插拔回调
	 * (CPUHP_BRINGUP_CPU + 1 -> CPUHP_AP_ONLINE)
	 */
	notify_cpu_starting(cpu);

	...

	set_cpu_online(cpu, true);

	// __cpu_up() -> wait_for_completion_timeout(&cpu_running, ...)
	complete(&cpu_running);

	/*
	 * OK, it's off to the idle thread for us
	 */
	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); /* 进入当前 CPU 的idle 进程 */
}
/* kernel/cpu.c */

void notify_cpu_starting(unsigned int cpu)
{
	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
	enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
	int ret;

	st->booted_once = true;
	/* 进入 CPU CPUHP_AP_ONLINE 态的所有 @cpu 热插拔状态的工作:
	 * 触发状态区间 [CPUHP_BRINGUP_CPU + 1, CPUHP_AP_ONLINE] 回
	 * 调。
	 */
	while (st->state < target) {
		st->state++;
		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); /* 如 gic_starting_cpu(), ... */
		/*
		 * STARTING must not fail!
		 */
		WARN_ON_ONCE(ret);
	}
}

CPU 已启动完毕,然后最终进入 idle 状态:

void cpu_startup_entry(enum cpuhp_state state)
{
	...
	cpuhp_online_idle(state);
	while (1)
		do_idle();
}

void cpuhp_online_idle(enum cpuhp_state state)
{
	...
	st->state = CPUHP_AP_ONLINE_IDLE;
	// bringup_cpu() -> bringup_wait_for_ap(cpu)
	complete_ap_thread(st, true);
}

我们可以看到,系统 BOOT 阶段,非 BOOT CPU 是逐个、按严格的先后顺序启动的:只有前一 CPU 进入 idle 循环后,后一个 CPU 的启动工作,才会开始。

3.2 不支持 PSCI 平台的 CPU 启动流程

如果在不支持或没有实现 PSCI 固件功能的 ARMv7 架构平台,各 CPU 的启动流程稍有不同,下面我们以全志 sun8i SoC 为例,来说明 CPU 的启动流程。

3.2.1 BOOT CPU 启动流程

start_kernel()
	...
	setup_arch(&command_line)
		...
		arm_dt_init_cpu_maps()
	#ifdef CONFIG_SMP
		if (is_smp()) {
			if (!mdesc->smp_init || !mdesc->smp_init()) {
				if (psci_smp_available())
					...
				else if (mdesc->smp)
					smp_set_ops(mdesc->smp); /* sun8i_smp_ops */
			}
		}
		...
	#endif
	...
	sched_init()
	...
	rest_init()
		pid = kernel_thread(kernel_init, NULL, CLONE_FS);
		...
		cpu_startup_entry(CPUHP_ONLINE);

3.2.2 非 BOOT CPU 启动流程

kernel_init()
	kernel_init_freeable()
		smp_init()
			cpu_up(cpu)
				do_cpu_up(cpu, CPUHP_ONLINE)
					_cpu_up(cpu, 0, target)
						...
						bringup_cpu()
							__cpu_up(cpu, idle)
								sun8i_smp_boot_secondary()
static int sun8i_smp_boot_secondary(unsigned int cpu,
				    struct task_struct *idle)
{
	u32 reg;

	if (!(prcm_membase && cpucfg_membase))
		return -EFAULT;

	spin_lock(&cpu_lock);

	/* Set CPU boot address */
	/* 设置非 BOOT CPU 的启动地址为 secondary_startup */
	writel(__pa_symbol(secondary_startup),
	       cpucfg_membase + CPUCFG_PRIVATE0_REG);

	/* Assert the CPU core in reset */
	writel(0, cpucfg_membase + CPUCFG_CPU_RST_CTRL_REG(cpu));

	/* Assert the L1 cache in reset */
	reg = readl(cpucfg_membase + CPUCFG_GEN_CTRL_REG);
	writel(reg & ~BIT(cpu), cpucfg_membase + CPUCFG_GEN_CTRL_REG);

	/* Clear CPU power-off gating */
	reg = readl(prcm_membase + PRCM_CPU_PWROFF_REG);
	writel(reg & ~BIT(cpu), prcm_membase + PRCM_CPU_PWROFF_REG);
	mdelay(1);

	/* Deassert the CPU core reset */
	writel(3, cpucfg_membase + CPUCFG_CPU_RST_CTRL_REG(cpu));

	spin_unlock(&cpu_lock);

	return 0;
}

看看 sun8i_smp_boot_secondary() 的逻辑,就是把 非 BOOT CPU 的启动地址设置为 secondary_startup ,即 CPU 启动时从 secondary_startup 开始执行,后面的流程就和 PSCI 一样了:

secondary_startup
	secondary_start_kernel()
		...
		notify_cpu_starting(cpu)
		...
		cpu_startup_entry(CPUHP_AP_ONLINE_IDLE)

3.3 注册 CPU 设备到系统

start_kernel()
	rest_init()
		pid = kernel_thread(kernel_init, NULL, CLONE_FS)

kernel_init()
	kernel_init_freeable()
		smp_init() /* 启动所有 CPU */
		...
		do_basic_setup()
			...
			do_initcalls()
				for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++)
					do_initcall_level(level)
						for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++)
							do_one_initcall(*fn)
								ret = fn() /* topology_init() */
/* include/linux/cpu.h */

struct cpu { /* CPU 设备抽象*/
	int node_id;		/* The node which contains the CPU */
	/* CPU 是否可以热插拔: BOOT CPU 不允许热插拔 */
	int hotpluggable;	/* creates sysfs control file if hotpluggable */
	struct device dev;
};

/* arch/arm/include/asm/cpu.h */

/*
 * ARM per-CPU description. It embeds the generic struct cpu, which
 * topology_init() registers with the driver core via register_cpu().
 */
struct cpuinfo_arm {
	struct cpu	cpu;		/* generic CPU device abstraction */
	u32		cpuid;
#ifdef CONFIG_SMP
	unsigned int	loops_per_jiffy;
#endif
};

/* arch/arm/kernel/setup.c */

DEFINE_PER_CPU(struct cpuinfo_arm, cpu_data); /* ARM 平台每 CPU 的信息数据 */

static int __init topology_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct cpuinfo_arm *cpuinfo = &per_cpu(cpu_data, cpu);
		cpuinfo->cpu.hotpluggable = platform_can_hotplug_cpu(cpu);
		register_cpu(&cpuinfo->cpu, cpu);
	}

	return 0;
}
/* drivers/base/cpu.c */

static DEFINE_PER_CPU(struct device *, cpu_sys_devices);

struct bus_type cpu_subsys = {
	.name = "cpu",
	.dev_name = "cpu",
	.match = cpu_subsys_match,
#ifdef CONFIG_HOTPLUG_CPU
	/* 用来处理 CPU 热插拔。热插拔细节在章节 4 展开 */
	.online = cpu_subsys_online,
	.offline = cpu_subsys_offline,
#endif
};

int register_cpu(struct cpu *cpu, int num)
{
	int error;
	
	cpu->node_id = cpu_to_node(num);
	memset(&cpu->dev, 0x00, sizeof(struct device));
	cpu->dev.id = num; /* Linux CPU 编号 */
	cpu->dev.bus = &cpu_subsys;
	...

	error = device_register(&cpu->dev); /* 注册 CPU 设备到 driver core */
	...

	per_cpu(cpu_sys_devices, num) = &cpu->dev;
	register_cpu_under_node(num, cpu_to_node(num));
	...

	return 0;
}

4. CPU 热插拔管理

4.1 创建 CPU 热插拔管理线程

4.1.1 创建 BOOT CPU 热插拔管理线程

每个 CPU 都有一个热插拔处理线程,前面的流程中,我们没有仔细分析它们,现在来看一下:

kernel_init()
	kernel_init_freeable()
		smp_init()
			...
			cpuhp_threads_init(); /* 为系统中的所有 CPU 创建每 CPU 的 热插拔管理 内核线程 */
			...
			for_each_present_cpu(cpu) {
				...
				if (!cpu_online(cpu)) /* CPU 尚未启动 */
					cpu_up(cpu); /* 启动 CPU @cpu */
			}
/* kernel/cpu.c */

/* 每 CPU 的热插拔[状态、内核线程等]数据 */
static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
	.fail = CPUHP_INVALID,
};

static struct smp_hotplug_thread cpuhp_threads = {
	/*
	 * store 指向每cpu的热插拔管理数据 cpuhp_state 的 thread: 
	 * smpboot_register_percpu_thread(&cpuhp_threads) 调用过程中,
	 * 会设定到创建的热插拔线程对应的 task_struct 
	 */
	.store			= &cpuhp_state.thread,
	.create			= &cpuhp_create,
	.thread_should_run	= cpuhp_should_run,
	.thread_fn		= cpuhp_thread_fun,
	.thread_comm		= "cpuhp/%u",
	.selfparking		= true,
};

/*
 * Register the per-CPU hotplug thread descriptor (cpuhp_threads) and start
 * the hotplug thread of the CPU this code runs on.
 */
void __init cpuhp_threads_init(void)
{
	/*
	 * Register cpuhp_threads with the smpboot infrastructure. The quoted
	 * smpboot_register_percpu_thread_cpumask() creates threads only for
	 * CPUs that are online at registration time, so at this point only the
	 * boot CPU gets its hotplug thread; the others get theirs when they
	 * are brought up.
	 * NOTE(review): smpboot_register_percpu_thread() is presumably a thin
	 * wrapper around the _cpumask variant - its body is not shown here.
	 */
	BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads));
	kthread_unpark(this_cpu_read(cpuhp_state.thread)); /* start the current CPU's hotplug thread */
}
/* kernel/smpboot.c */

int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
					   const struct cpumask *cpumask)
{
	unsigned int cpu;
	int ret = 0;

	if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
		return -ENOMEM;
	cpumask_copy(plug_thread->cpumask, cpumask); /* 设置所有要创建热插拔管理线程的 CPU 掩码 */

	for_each_online_cpu(cpu) { /* 为当前在线的 CPU 创建内核线程:当前只有 BOOT CPU 在线 */
		/* 创建 @cpu 的热插拔管理内核线程:创建但不启动它 */
		ret = __smpboot_create_thread(plug_thread, cpu);
		...
		if (cpumask_test_cpu(cpu, cpumask)) /* 如果 @cpu 在 @cpumask 中 */
			smpboot_unpark_thread(plug_thread, cpu); /* 则启动 @cpu 的内核线程 */
	}
	list_add(&plug_thread->list, &hotplug_threads);
	...
	return ret;
}

因为当前只有 BOOT CPU 在线,所以只为 BOOT CPU 创建了1个热插拔内核线程。

4.1.2 创建非 BOOT CPU 热插拔管理线程

非 BOOT CPU 启动过程中,在进入 CPUHP_BRINGUP_CPU 状态拉起 CPU 之前,会经过 CPUHP_CREATE_THREADS 状态 ,此时会触发回调 smpboot_create_threads() ,建立当前启动 CPU 的热插拔管理线程:

/* kernel/smpboot.c */

int smpboot_create_threads(unsigned int cpu)
{
	struct smp_hotplug_thread *cur;
	int ret = 0;

	mutex_lock(&smpboot_threads_lock);
	list_for_each_entry(cur, &hotplug_threads, list) {
		/* 创建 @cpu 的各内核线程(包括 @cpu 的热插拔管理内核线程) */
		ret = __smpboot_create_thread(cur, cpu);
		if (ret)
			break;
	}
	mutex_unlock(&smpboot_threads_lock);
	return ret;
}

系统启动后,我们可以查看到各 CPU 热插拔管理内核线程:

root@qemu-ubuntu:~# ps -ef | grep cpuhp | grep -v grep
root        13     2  0 03:00 ?        00:00:00 [cpuhp/0]
root        14     2  0 03:00 ?        00:00:00 [cpuhp/1]
root        20     2  0 03:00 ?        00:00:00 [cpuhp/2]
root        26     2  0 03:00 ?        00:00:00 [cpuhp/3]

4.2 CPU 热插拔过程

本小节给出 CPU 热插拔过程的概述,由于涉及的细节太多,限于篇幅,将不做深入展开。

4.2.1 CPU offline

以一条 shell 命令发起 CPU offline 过程:

# echo 0 > /sys/devices/system/cpu/cpuN/online

这将触发接口 cpu_subsys_offline()

device_offline()
	dev->bus->offline(dev) = cpu_subsys_offline(dev)
		cpu_down(dev->id)
			do_cpu_down(cpu, CPUHP_OFFLINE)
				cpu_down_maps_locked(cpu, target)
					_cpu_down(cpu, 0, target)

BOOT CPU 是不支持 offline 的,我们查看 BOOT CPU 的 sysfs 接口:

root@qemu-ubuntu:~# ls -l /sys/devices/system/cpu/cpu0
total 0
-rw-r--r-- 1 root root 4096 Mar 26 06:04 cpu_capacity
-r-------- 1 root root 4096 Mar 26 06:04 crash_notes
-r-------- 1 root root 4096 Mar 26 06:04 crash_notes_size
drwxr-xr-x 2 root root    0 Mar 26 06:04 hotplug
lrwxrwxrwx 1 root root    0 Mar 26 06:04 of_node -> ../../../../firmware/devicetree/base/cpus/cpu@0
drwxr-xr-x 2 root root    0 Mar 26 06:04 power
lrwxrwxrwx 1 root root    0 Mar 26 06:04 subsystem -> ../../../../bus/cpu
drwxr-xr-x 2 root root    0 Mar 26 06:04 topology
-rw-r--r-- 1 root root 4096 Mar 26 06:04 uevent

我们看到,cpu0 没有 online 属性导出,自然也就不支持 offline/online 操作。

4.2.2 CPU online

以一条 shell 命令发起 CPU online 过程:

# echo 1 > /sys/devices/system/cpu/cpuN/online

这将触发接口 cpu_subsys_online()

device_online()
	dev->bus->online(dev) = cpu_subsys_online(dev)
		cpu_up(cpuid)
			do_cpu_up(cpu, CPUHP_ONLINE)

4.2.3 CPU 热插拔管理线程的唤醒

从上两小节的描述,我们看不出 CPU 热插拔(offline/online)和其热插拔管理线程有什么关系,我们在这里以 offline 过程为例,展开其细节:

device_offline()
	...
	do_cpu_down(cpu, CPUHP_OFFLINE)
		...
		_cpu_down(cpu, 0, target)
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
			   enum cpuhp_state target)
{
	...
	if (st->state > CPUHP_TEARDOWN_CPU) {
		st->target = max((int)target, CPUHP_TEARDOWN_CPU);
		/* 唤醒 CPU 热插拔管理线程,处理 CPU offline 过程前期部分 */
		ret = cpuhp_kick_ap_work(cpu); 
		...

		if (st->state > CPUHP_TEARDOWN_CPU)
			goto out;
		
		st->target = target;
	}

	/* 处理剩余的 CPU offline 过程: 调用状态区间 [..., CPUHP_TEARDOWN_CPU] 各回调 */ 
	ret = cpuhp_down_callbacks(cpu, st, target);
	...

out:
	...
	return ret;
}

随着 CPU 的热插拔管理线程被唤醒,将处理 CPU offline 过程前期部分:

/* kernel/smpboot.c */

static int smpboot_thread_fn(void *data)
{
	struct smpboot_thread_data *td = data;
	struct smp_hotplug_thread *ht = td->ht;

	while (1) {
		...
		
		if (!ht->thread_should_run(td->cpu)) {
			...
		} else {
			__set_current_state(TASK_RUNNING);
			preempt_enable();

			ht->thread_fn(td->cpu); /* cpuhp_thread_fun() */
		}
	}
}
/* kernel/cpu.c */

static void cpuhp_thread_fun(unsigned int cpu)
{
	...

	if (st->single) {
		...
	} else {
		if (bringup) {
			...
		} else {
			state = st->state;
			st->state--; /* 更新状态 */
			st->should_run = (st->state > st->target);
		}
	}

	...
	/* CPU offline 状态回调:每次调用 1 个状态回调,直到达到目标状态 @st->target 为止 */
	st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);

	...
}

CPU online 过程中,唤醒热插拔管理线程的过程类似,在此不再赘述,感兴趣的童鞋可自行阅读代码分析。

5. 参考资料

《DEN0013D_cortex_a_series_PG.pdf》
《learn_the_architecture_-_trustzone_for_aarch64_102418_0101_01_en.pdf》
https://www.kernel.org/doc/Documentation/arm/Booting
https://lwn.net/Articles/557132/
https://github.com/ARM-software/arm-trusted-firmware

你可能感兴趣的:(#,Linux基础,linux,多核CPU启动)