Linux : task work 机制

task work机制可以在内核中向指定的进程添加一些任务函数,这些任务函数会在该进程从内核态返回用户态时(或者进程退出时)执行,使用的是该进程自己的上下文。主要包括下面这些API:

  • task_work_add
  • task_work_cancel
  • task_work_run

进程对象task_struct中有一个task_works字段,用来保存待执行任务链表的表头。链表节点的类型是struct callback_head,它包含一个next指针和一个需要执行的函数指针func。

205 /**

206  * struct callback_head - callback structure for use with RCU and task_work

207  * @next: next update requests in a list

208  * @func: actual update function to call after the grace period.

209  */

210 struct callback_head {

211         struct callback_head *next;

212         void (*func)(struct callback_head *head);

213 };
  4 

  5 static struct callback_head work_exited; /* all we need is ->next == NULL */

  6 

  7 /**

  8  * task_work_add - ask the @task to execute @work->func()

  9  * @task: the task which should run the callback

 10  * @work: the callback to run

 11  * @notify: send the notification if true

 12  *

 13  * Queue @work for task_work_run() below and notify the @task if @notify.

 14  * Fails if the @task is exiting/exited and thus it can't process this @work.

 15  * Otherwise @work->func() will be called when the @task returns from kernel

 16  * mode or exits.

 17  *

 18  * This is like the signal handler which runs in kernel mode, but it doesn't

 19  * try to wake up the @task.

 20  *

 21  * RETURNS:

 22  * 0 if succeeds or -ESRCH.

 23  */

 24 int

 25 task_work_add(struct task_struct *task, struct callback_head *work, bool notify)

 26 {

 27         struct callback_head *head;

 28 

 29         do {

 30                 head = ACCESS_ONCE(task->task_works);

 31                 if (unlikely(head == &work_exited))

 32                         return -ESRCH;

 33                 work->next = head;

 34         } while (cmpxchg(&task->task_works, head, work) != head);

 35 

 36         if (notify)

 37                 set_notify_resume(task);

 38         return 0;

 39 }

主要工作:

1. 通过CAS以无锁的形式添加了一个链表元素(新元素排在原有链表头部)。如果task_works已经等于&work_exited,说明进程正在退出或已经退出,无法再处理新任务,直接返回-ESRCH。

2. 如果notify为真,则调用set_notify_resume函数为指定的进程设置_TIF_NOTIFY_RESUME标记。

task_work_run执行时机

在返回用户态之前会检查当前进程的标记,如果_TIF_DO_NOTIFY_MASK中的任一标记置位,则会调用do_notify_resume

595 int_signal:

596         testl $_TIF_DO_NOTIFY_MASK,%edx

597         jz 1f

598         movq %rsp,%rdi          # &ptregs -> arg1

599         xorl %esi,%esi          # oldset -> arg2

600         call do_notify_resume

601 1:      movl $_TIF_WORK_MASK,%edi

602 int_restore_rest:

603         RESTORE_REST

604         DISABLE_INTERRUPTS(CLBR_NONE)

605         TRACE_IRQS_OFF

606         jmp int_with_check

607         CFI_ENDPROC

608 END(system_call)

以上代码位于entry_64.S,而相关标记定义在头文件thread_info.h中

130 /* work to do on interrupt/exception return */

131 #define _TIF_WORK_MASK                                                  \

132         (0x0000FFFF &                                                   \

133          ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|                       \

134            _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
 70 #define TIF_SYSCALL_TRACE       0       /* syscall trace active */

 71 #define TIF_NOTIFY_RESUME       1       /* callback before returning to user */

 72 #define TIF_SIGPENDING          2       /* signal pending */

 73 #define TIF_NEED_RESCHED        3       /* rescheduling necessary */

 74 #define TIF_SINGLESTEP          4       /* reenable singlestep on user return*/

 75 #define TIF_SYSCALL_EMU         6       /* syscall emulation active */

 76 #define TIF_SYSCALL_AUDIT       7       /* syscall auditing active */

 77 #define TIF_SECCOMP             8       /* secure computing */

 78 #define TIF_MCE_NOTIFY          10      /* notify userspace of an MCE */

 79 #define TIF_USER_RETURN_NOTIFY  11      /* notify kernel of userspace return */

 80 #define TIF_UPROBE              12      /* breakpointed or singlestepping */

 81 #define TIF_NOTSC               16      /* TSC is not accessible in userland */

 82 #define TIF_IA32                17      /* IA32 compatibility process */

 83 #define TIF_FORK                18      /* ret_from_fork */

 84 #define TIF_NOHZ                19      /* in adaptive nohz mode */

 85 #define TIF_MEMDIE              20      /* is terminating due to OOM killer */

 86 #define TIF_POLLING_NRFLAG      21      /* idle is polling for TIF_NEED_RESCHED */

 87 #define TIF_IO_BITMAP           22      /* uses I/O bitmap */

 88 #define TIF_FORCED_TF           24      /* true if TF in eflags artificially */

 89 #define TIF_BLOCKSTEP           25      /* set when we want DEBUGCTLMSR_BTF */

 90 #define TIF_LAZY_MMU_UPDATES    27      /* task is updating the mmu lazily */

 91 #define TIF_SYSCALL_TRACEPOINT  28      /* syscall tracepoint instrumentation */

 92 #define TIF_ADDR32              29      /* 32-bit address space on 64 bits */

 93 #define TIF_X32                 30      /* 32-bit native x86-64 binary */

 94 

即_TIF_WORK_MASK表示低16位(0x0000FFFF)中除开(_TIF_SYSCALL_TRACE, _TIF_SYSCALL_AUDIT, _TIF_SINGLESTEP, _TIF_SECCOMP, _TIF_SYSCALL_EMU)之外的所有标记。TIF_NOTIFY_RESUME是第1位,自然包括在内。需要注意的是,上面汇编中调用do_notify_resume之前实际测试的是_TIF_DO_NOTIFY_MASK,它同样定义在thread_info.h中。

do_notify_resume函数

729 /*

730  * notification of userspace execution resumption

731  * - triggered by the TIF_WORK_MASK flags

732  */

733 __visible void

734 do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)

735 {

736         user_exit();

737 

738 #ifdef CONFIG_X86_MCE

739         /* notify userspace of pending MCEs */

740         if (thread_info_flags & _TIF_MCE_NOTIFY)

741                 mce_notify_process();

742 #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */

743 

744         if (thread_info_flags & _TIF_UPROBE)

745                 uprobe_notify_resume(regs);

746 

747         /* deal with pending signal delivery */

748         if (thread_info_flags & _TIF_SIGPENDING)

749                 do_signal(regs);

750 

751         if (thread_info_flags & _TIF_NOTIFY_RESUME) {

752                 clear_thread_flag(TIF_NOTIFY_RESUME);

753                 tracehook_notify_resume(regs);

754         }

755         if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)

756                 fire_user_return_notifiers();

757 

758         user_enter();

759 }

可以看到,当_TIF_NOTIFY_RESUME置位时,会先清除该标记,再调用tracehook_notify_resume函数。此外该函数还负责处理MCE通知、uprobe、待处理信号的投递以及user return notifier等工作。

tracehook_notify_resume

174 /**

175  * tracehook_notify_resume - report when about to return to user mode

176  * @regs: user-mode registers of @current task

177  *

178  * This is called when %TIF_NOTIFY_RESUME has been set. Now we are

179  * about to return to user mode, and the user state in @regs can be

180  * inspected or adjusted. The caller in arch code has cleared

181  * %TIF_NOTIFY_RESUME before the call. If the flag gets set again

182  * asynchronously, this will be called again before we return to

183  * user mode.

184  *

185  * Called without locks.

186  */

187 static inline void tracehook_notify_resume(struct pt_regs *regs)

188 {

189         /*

190  * The caller just cleared TIF_NOTIFY_RESUME. This barrier

191  * pairs with task_work_add()->set_notify_resume() after

192  * hlist_add_head(task->task_works);

193  */

194         smp_mb__after_atomic();

195         if (unlikely(current->task_works))

196                 task_work_run();

197 }

只有在当前进程(current)的task_works不为NULL的情况下,才说明有任务需要执行,此时才会调用task_work_run。

task_work_run

 77 /**

 78  * task_work_run - execute the works added by task_work_add()

 79  *

 80  * Flush the pending works. Should be used by the core kernel code.

 81  * Called before the task returns to the user-mode or stops, or when

 82  * it exits. In the latter case task_work_add() can no longer add the

 83  * new work after task_work_run() returns.

 84  */

 85 void task_work_run(void)

 86 {

 87         struct task_struct *task = current;

 88         struct callback_head *work, *head, *next;

 89 

 90         for (;;) {

 91                 /*

 92  * work->func() can do task_work_add(), do not set

 93  * work_exited unless the list is empty.

 94  */

 95                 do {

 96                         work = ACCESS_ONCE(task->task_works);

 97                         head = !work && (task->flags & PF_EXITING) ?

 98                                 &work_exited : NULL;

 99                 } while (cmpxchg(&task->task_works, work, head) != work);

100 

101                 if (!work)

102                         break;

103                 /*

104  * Synchronize with task_work_cancel(). It can't remove

105  * the first entry == work, cmpxchg(task_works) should

106  * fail, but it can play with *work and other entries.

107  */

108                 raw_spin_unlock_wait(&task->pi_lock);

109                 smp_mb();

110 

111                 /* Reverse the list to run the works in fifo order */

112                 head = NULL;

113                 do {

114                         next = work->next;

115                         work->next = head;

116                         head = work;

117                         work = next;

118                 } while (work);

119 

120                 work = head;

121                 do {

122                         next = work->next;

123                         work->func(work);

124                         work = next;

125                         cond_resched();

126                 } while (work);

127         }

128 }

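1. 通过CAS,以无锁的方式整体取走task_works链表,并把task_works置为NULL(如果链表为空且进程正在退出,即设置了PF_EXITING,则置为&work_exited,此后task_work_add将返回-ESRCH)

2. 因为原链表是按元素添加到链表的时间逆序排列的(见task_work_add),先把链表反转一遍,以保证按先进先出(FIFO)的顺序执行

3. 反转链表后,遍历链表,执行各个元素的任务函数即work->func(work)。由于work->func()中可能再次调用task_work_add,外层for循环会重复以上步骤,直到取到的链表为空为止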

你可能感兴趣的:(linux)