Introduction
This article introduces process management in the Linux kernel, covering how processes are represented, their states, and how they are created, scheduled, and switched.
Overview
At the kernel level, every process is described by a task_struct. This structure is very large; here is a rough look at its main fields:
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
* For reasons of header soup (see current_thread_info()), this
* must be the first element of task_struct.
*/
struct thread_info thread_info;
#endif
unsigned int __state;
#ifdef CONFIG_PREEMPT_RT
/* saved state for "spinlock sleepers" */
unsigned int saved_state;
#endif
/*
* This begins the randomizable portion of task_struct. Only
* scheduling-critical items should be added above here.
*/
randomized_struct_fields_start
void *stack;
refcount_t usage;
// Scheduling
int on_rq;
int prio;
int static_prio;
int normal_prio;
unsigned int rt_priority;
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
const struct sched_class *sched_class;
// Memory management
struct mm_struct *mm;
struct mm_struct *active_mm;
// Process relationships
pid_t pid;
pid_t tgid;
#ifdef CONFIG_STACKPROTECTOR
/* Canary value for the -fstack-protector GCC feature: */
unsigned long stack_canary;
#endif
/*
* Pointers to the (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
* p->real_parent->pid)
*/
/* Real parent process: */
struct task_struct __rcu *real_parent;
/* Recipient of SIGCHLD, wait4() reports: */
struct task_struct __rcu *parent;
/*
* Children/sibling form the list of natural children:
*/
struct list_head children;
struct list_head sibling;
struct task_struct *group_leader;
// Files and filesystem
/* Filesystem information: */
struct fs_struct *fs;
/* Open file information: */
struct files_struct *files;
// Signals
/* Signal handlers: */
struct signal_struct *signal;
struct sighand_struct __rcu *sighand;
sigset_t blocked;
sigset_t real_blocked;
/* Restored if set_restore_sigmask() was used: */
sigset_t saved_sigmask;
struct sigpending pending;
...
};
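Since everything about a task hangs off this structure, a quick way to get a feel for it is to print a few fields of current from a module. A minimal sketch (the module boilerplate and the choice of fields are mine, not from the snippet above):

#include <linux/module.h>
#include <linux/sched.h>

static int __init peek_init(void)
{
	/* current is the task_struct of whoever is running this code. */
	pr_info("comm=%s pid=%d tgid=%d prio=%d static_prio=%d\n",
		current->comm, current->pid, current->tgid,
		current->prio, current->static_prio);
	return 0;
}

static void __exit peek_exit(void) { }

module_init(peek_init);
module_exit(peek_exit);
MODULE_LICENSE("GPL");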
Kernel threads
A kernel thread is simply a process that runs in kernel mode. It has no user address space of its own, so it can only run in the kernel address space. It is created as follows:
/**
* kthread_create - create a kthread on the current node
* @threadfn: the function to run in the thread
* @data: data pointer for @threadfn()
* @namefmt: printf-style format string for the thread name
* @arg: arguments for @namefmt.
*
* This macro will create a kthread on the current node, leaving it in
* the stopped state. This is just a helper for kthread_create_on_node();
* see the documentation there for more details.
*/
#define kthread_create(threadfn, data, namefmt, arg...) \
kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
/**
* kthread_run - create and wake a thread.
* @threadfn: the function to run until signal_pending(current).
* @data: data ptr for @threadfn.
* @namefmt: printf-style name for the thread.
*
* Description: Convenient wrapper for kthread_create() followed by
* wake_up_process(). Returns the kthread or ERR_PTR(-ENOMEM).
*/
#define kthread_run(threadfn, data, namefmt, ...) \
({ \
struct task_struct *__k \
= kthread_create(threadfn, data, namefmt, ## __VA_ARGS__); \
if (!IS_ERR(__k)) \
wake_up_process(__k); \
__k; \
})
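As a usage sketch, here is a minimal module that starts a kthread with kthread_run() and stops it on unload (the thread function and names are made up for the example):

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/delay.h>

static struct task_struct *worker;

static int worker_fn(void *data)
{
	/* Loop until kthread_stop() is called on us. */
	while (!kthread_should_stop()) {
		pr_info("demo-worker: tick\n");
		msleep(1000);
	}
	return 0;
}

static int __init demo_init(void)
{
	worker = kthread_run(worker_fn, NULL, "demo-worker");
	return PTR_ERR_OR_ZERO(worker);
}

static void __exit demo_exit(void)
{
	kthread_stop(worker);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");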
Process creation
User space creates new processes through fork, vfork, and clone. Inside the kernel they all converge on one implementation, do_fork (renamed kernel_clone in v5.10), whose core is copy_process. The rough flow is:
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
*
* It copies the registers, and all the appropriate
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
static __latent_entropy struct task_struct *copy_process(
struct pid *pid,
int trace,
int node,
struct kernel_clone_args *args) {
...
/*
* Force any signals received before this point to be delivered
* before the fork happens. Collect up signals sent to multiple
* processes that happen during the fork and delay them so that
* they appear to happen after the fork.
*/
sigemptyset(&delayed.signal);
INIT_HLIST_NODE(&delayed.node);
spin_lock_irq(&current->sighand->siglock);
if (!(clone_flags & CLONE_THREAD))
hlist_add_head(&delayed.node, &current->signal->multiprocess);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
retval = -ERESTARTNOINTR;
if (task_sigpending(current))
goto fork_out;
retval = -ENOMEM;
p = dup_task_struct(current, node);
if (!p)
goto fork_out;
p->flags &= ~PF_KTHREAD;
if (args->kthread)
p->flags |= PF_KTHREAD;
if (args->io_thread) {
/*
* Mark us an IO worker, and block any signal that isn't
* fatal or STOP
*/
p->flags |= PF_IO_WORKER;
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
}
...
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
goto bad_fork_cleanup_policy;
retval = perf_event_init_task(p, clone_flags);
if (retval)
goto bad_fork_cleanup_policy;
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_perf;
/* copy all the process information */
shm_init_task(p);
retval = security_task_alloc(p, clone_flags);
if (retval)
goto bad_fork_cleanup_audit;
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_security;
retval = copy_files(clone_flags, p);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
if (retval)
goto bad_fork_cleanup_files;
retval = copy_sighand(clone_flags, p);
if (retval)
goto bad_fork_cleanup_fs;
retval = copy_signal(clone_flags, p);
if (retval)
goto bad_fork_cleanup_sighand;
retval = copy_mm(clone_flags, p);
if (retval)
goto bad_fork_cleanup_signal;
retval = copy_namespaces(clone_flags, p);
if (retval)
goto bad_fork_cleanup_mm;
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
retval = copy_thread(p, args);
if (retval)
goto bad_fork_cleanup_io;
stackleak_task_init(p);
if (pid != &init_struct_pid) {
pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
args->set_tid_size);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread;
}
}
/*
* This has to happen after we've potentially unshared the file
* descriptor table (so that the pidfd doesn't leak into the child
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
O_RDWR | O_CLOEXEC);
if (IS_ERR(pidfile)) {
put_unused_fd(pidfd);
retval = PTR_ERR(pidfile);
goto bad_fork_free_pid;
}
get_pid(pid); /* held by pidfile now */
retval = put_user(pidfd, args->pidfd);
if (retval)
goto bad_fork_put_pidfd;
}
#ifdef CONFIG_BLOCK
p->plug = NULL;
#endif
futex_init_task(p);
/*
* sigaltstack should be cleared when sharing the same VM
*/
if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
sas_ss_reset(p);
/*
* Syscall tracing and stepping should be turned off in the
* child regardless of CLONE_PTRACE.
*/
user_disable_single_step(p);
clear_task_syscall_work(p, SYSCALL_TRACE);
#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
clear_task_syscall_work(p, SYSCALL_EMU);
#endif
clear_tsk_latency_tracing(p);
/* ok, now we should be set up.. */
p->pid = pid_nr(pid);
if (clone_flags & CLONE_THREAD) {
p->group_leader = current->group_leader;
p->tgid = current->tgid;
} else {
p->group_leader = p;
p->tgid = p->pid;
}
p->nr_dirtied = 0;
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
p->dirty_paused_when = 0;
p->pdeath_signal = 0;
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
clear_posix_cputimers_work(p);
#ifdef CONFIG_KRETPROBES
p->kretprobe_instances.first = NULL;
#endif
#ifdef CONFIG_RETHOOK
p->rethooks.first = NULL;
#endif
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted that the new process's css_set can be changed
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
retval = cgroup_can_fork(p, args);
if (retval)
goto bad_fork_put_pidfd;
/*
* Now that the cgroups are pinned, re-clone the parent cgroup and put
* the new task on the correct runqueue. All this *before* the task
* becomes visible.
*
* This isn't part of ->can_fork() because while the re-cloning is
* cgroup specific, it unconditionally needs to place the task on a
* runqueue.
*/
sched_cgroup_fork(p, args);
/*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
* not want user-space to be able to predict the process start-time by
* stalling fork(2) after we recorded the start_time but before it is
* visible to the system.
*/
p->start_time = ktime_get_ns();
p->start_boottime = ktime_get_boottime_ns();
/*
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
*/
write_lock_irq(&tasklist_lock);
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
p->parent_exec_id = current->parent_exec_id;
if (clone_flags & CLONE_THREAD)
p->exit_signal = -1;
else
p->exit_signal = current->group_leader->exit_signal;
} else {
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
p->exit_signal = args->exit_signal;
}
klp_copy_process(p);
sched_core_fork(p);
spin_lock(&current->sighand->siglock);
rv_task_fork(p);
rseq_fork(p, clone_flags);
/* Don't start children in a dying pid namespace */
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
goto bad_fork_cancel_cgroup;
}
/* Let kill terminate clone/fork in the middle */
if (fatal_signal_pending(current)) {
retval = -EINTR;
goto bad_fork_cancel_cgroup;
}
/* No more failure paths after this point. */
/*
* Copy seccomp details explicitly here, in case they were changed
* before holding sighand lock.
*/
copy_seccomp(p);
init_task_pid_links(p);
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
init_task_pid(p, PIDTYPE_PID, pid);
if (thread_group_leader(p)) {
init_task_pid(p, PIDTYPE_TGID, pid);
init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
init_task_pid(p, PIDTYPE_SID, task_session(current));
if (is_child_reaper(pid)) {
ns_of_pid(pid)->child_reaper = p;
p->signal->flags |= SIGNAL_UNKILLABLE;
}
p->signal->shared_pending.signal = delayed.signal;
p->signal->tty = tty_kref_get(current->signal->tty);
/*
* Inherit has_child_subreaper flag under the same
* tasklist_lock with adding child to the process tree
* for propagate_has_child_subreaper optimization.
*/
p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
p->real_parent->signal->is_child_subreaper;
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
attach_pid(p, PIDTYPE_TGID);
attach_pid(p, PIDTYPE_PGID);
attach_pid(p, PIDTYPE_SID);
__this_cpu_inc(process_counts);
} else {
current->signal->nr_threads++;
current->signal->quick_threads++;
atomic_inc(&current->signal->live);
refcount_inc(&current->signal->sigcnt);
task_join_group_stop(p);
list_add_tail_rcu(&p->thread_group,
&p->group_leader->thread_group);
list_add_tail_rcu(&p->thread_node,
&p->signal->thread_head);
}
attach_pid(p, PIDTYPE_PID);
nr_threads++;
}
total_forks++;
hlist_del_init(&delayed.node);
spin_unlock(&current->sighand->siglock);
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
if (pidfile)
fd_install(pidfd, pidfile);
proc_fork_connector(p);
sched_post_fork(p);
cgroup_post_fork(p, args);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
copy_oom_score_adj(clone_flags, p);
...
}
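For contrast with the kernel-side plumbing above, here is the user-space view in its simplest form: fork() returns twice, 0 in the child and the child's PID in the parent.

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {               /* child */
		printf("child: pid=%d ppid=%d\n", getpid(), getppid());
	} else if (pid > 0) {         /* parent */
		printf("parent: pid=%d child=%d\n", getpid(), pid);
		waitpid(pid, NULL, 0);
	} else {
		perror("fork");
		return 1;
	}
	return 0;
}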
Process scheduling
The nice value ranges from -20 to 19; the larger the value, the lower the priority, i.e. the "nicer" the process is to everyone else. Several fields in task_struct are involved in priority:
int prio; // Dynamic priority; the value the scheduler classes actually consult
int static_prio; // Static priority, assigned at start; adjustable via nice() or sched_setscheduler()
int normal_prio; // Priority derived from static_prio and the scheduling policy
unsigned int rt_priority; // Priority for real-time processes
Scheduling policies
The Linux kernel currently implements five scheduler classes by default:
- stop: the highest priority; can preempt any process
- deadline (SCHED_DEADLINE): real-time processes with strict timing requirements
- realtime (SCHED_FIFO, SCHED_RR): ordinary real-time processes
- CFS (SCHED_NORMAL, SCHED_BATCH, SCHED_IDLE): normal processes, scheduled by CFS
- idle: the lowest-priority tasks
POSIX also provides APIs to set the scheduling policy and priority:
int sched_setscheduler(pid_t pid, int policy, const struct sched_param *param);
int sched_getscheduler(pid_t pid);
int sched_setparam(pid_t pid, const struct sched_param *param);
int sched_getparam(pid_t pid, struct sched_param *param);
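For example, a small sketch that moves the calling process onto the SCHED_FIFO real-time policy (priority 10 is an arbitrary example value; this needs root or CAP_SYS_NICE):

#include <stdio.h>
#include <sched.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
		perror("sched_setscheduler");
		return 1;
	}
	printf("policy is now %d (SCHED_FIFO=%d)\n",
	       sched_getscheduler(0), SCHED_FIFO);
	return 0;
}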
The CFS algorithm
The first thing that comes to mind for scheduling is classic time-slicing, but the mainstream algorithm today is CFS. CFS introduces virtual time alongside real time, and on each scheduling decision it picks the runnable process with the smallest virtual runtime. Roughly, virtual time is real time scaled by a weight ratio, with the nice-0 weight as the baseline: a smaller nice value means a larger weight, so virtual time advances more slowly than real time and the task gets to run longer; a larger nice value means a smaller weight, virtual time advances faster than real time, and the task runs less. Let's look at the details. The nice-to-weight conversion is:
/*
* Nice levels are multiplicative, with a gentle 10% change for every
* nice level changed. I.e. when a CPU-bound task goes from nice 0 to
* nice 1, it will get ~10% less CPU time than another CPU-bound task
* that remained on nice 0.
*
* The "10% effect" is relative and cumulative: from _any_ nice level,
* if you go up 1 level, it's -10% CPU usage, if you go down 1 level
* it's 10% CPU usage. (to achieve that we use a multiplier of 1.25.
* If a task goes up by ~10% and another task goes down by ~10% then
* the relative distance between them is ~25%.)
*/
const int sched_prio_to_weight[40] = {
/* -20 */ 88761, 71755, 56483, 46273, 36291,
/* -15 */ 29154, 23254, 18705, 14949, 11916,
/* -10 */ 9548, 7620, 6100, 4904, 3906,
/* -5 */ 3121, 2501, 1991, 1586, 1277,
/* 0 */ 1024, 820, 655, 526, 423,
/* 5 */ 335, 272, 215, 172, 137,
/* 10 */ 110, 87, 70, 56, 45,
/* 15 */ 36, 29, 23, 18, 15,
};
Nice 0 corresponds to weight 1024, and adjacent nice levels differ by a factor of about 1.25, so lowering nice by 1 buys roughly 10% more CPU time. For example, two CPU-bound tasks at nice 0 and nice 5 receive about 1024/(1024+335) ≈ 75% and 335/(1024+335) ≈ 25% of the CPU respectively. Now look at a second table:
/*
* Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
*
* In cases where the weight does not change often, we can use the
* precalculated inverse to speed up arithmetics by turning divisions
* into multiplications:
*/
const u32 sched_prio_to_wmult[40] = {
/* -20 */ 48388, 59856, 76040, 92818, 118348,
/* -15 */ 147320, 184698, 229616, 287308, 360437,
/* -10 */ 449829, 563644, 704093, 875809, 1099582,
/* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
/* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
/* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
This table holds the precomputed inverses (2^32/x) of sched_prio_to_weight. Why keep inverses around at all? That becomes clear from how virtual time is computed:
struct load_weight {
unsigned long weight; // the weight
u32 inv_weight; // inverse of the weight: inv_weight = 2^32 / weight
};
struct sched_entity {
/* For load-balancing: */
struct load_weight load;
struct rb_node run_node;
struct list_head group_node;
unsigned int on_rq;
...
};
static void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;
/*
* SCHED_IDLE tasks get minimal weight:
*/
if (task_has_idle_policy(p)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
return;
}
/*
* SCHED_OTHER tasks have to update their load when changing their
* weight
*/
if (update_load && p->sched_class == &fair_sched_class) {
reweight_task(p, prio);
} else {
load->weight = scale_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
}
}
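The remaining piece is the formula itself: vruntime = delta_exec * nice_0_weight / weight, where delta_exec is the task's real running time. Below is a simplified sketch (mine, not kernel code) of how the kernel's __calc_delta avoids the division by multiplying with the precomputed inverse and shifting; the real function also has overflow handling that this omits:

#define NICE_0_WEIGHT 1024ULL  /* sched_prio_to_weight[20] */

/* delta_exec in ns; inv_weight = 2^32 / weight, from sched_prio_to_wmult. */
static u64 calc_vruntime(u64 delta_exec, u32 inv_weight)
{
	/* delta * nice_0_weight / weight == (delta * nice_0_weight * inv_weight) >> 32 */
	return (delta_exec * NICE_0_WEIGHT * inv_weight) >> 32;
}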
This is exactly why sched_prio_to_wmult exists: it turns a division into a multiply-and-shift to speed up the calculation. The scheduler classes also have an object-oriented flavor: to implement a scheduler, you only need to provide the hook functions it requires, as shown below:
struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
#endif
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
struct task_struct *(*pick_next_task)(struct rq *rq);
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
#ifdef CONFIG_SMP
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
struct task_struct * (*pick_task)(struct rq *rq);
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx);
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
#endif
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
void (*task_fork)(struct task_struct *p);
void (*task_dead)(struct task_struct *p);
/*
* The switched_from() call is allowed to drop rq->lock, therefore we
* cannot assume the switched_from/switched_to pair is serialized by
* rq->lock. They are however serialized by p->pi_lock.
*/
void (*switched_from)(struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
int oldprio);
unsigned int (*get_rr_interval)(struct rq *rq,
struct task_struct *task);
void (*update_curr)(struct rq *rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group)(struct task_struct *p);
#endif
};
Process switching
The process-scheduling APIs are as follows:
extern long schedule_timeout(long timeout);
extern long schedule_timeout_interruptible(long timeout);
extern long schedule_timeout_killable(long timeout);
extern long schedule_timeout_uninterruptible(long timeout);
extern long schedule_timeout_idle(long timeout);
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
asmlinkage void preempt_schedule_irq(void);
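A typical usage pattern, sketched: kernel code sets the task state first, then calls schedule_timeout(), e.g. to sleep interruptibly for roughly one second:

#include <linux/sched.h>
#include <linux/jiffies.h>

static void wait_about_a_second(void)
{
	set_current_state(TASK_INTERRUPTIBLE);
	/* Returns early, with the remaining jiffies, if we are woken or signalled. */
	schedule_timeout(HZ);
}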
What schedule() mainly does is pick the next process and then perform the context switch.
/*
* context_switch - switch to the new MM and the new thread's register state.
*/
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
prepare_task_switch(rq, prev, next);
/*
* For paravirt, this is coupled with an exit in switch_to to
* combine the page table reload and the switch backend into
* one hypercall.
*/
arch_start_context_switch(prev);
/*
* kernel -> kernel lazy transfer active
* user -> kernel lazy mmgrab() active
*
* kernel -> user switch mmdrop() active
* user -> user switch
*/
if (!next->mm) { // to kernel
enter_lazy_tlb(prev->active_mm, next);
next->active_mm = prev->active_mm;
if (prev->mm) // from user
mmgrab(prev->active_mm);
else
prev->active_mm = NULL;
} else { // to user
membarrier_switch_mm(rq, prev->active_mm, next->mm);
/*
* sys_membarrier() requires an smp_mb() between setting
* rq->curr / membarrier_switch_mm() and returning to userspace.
*
* The below provides this either through switch_mm(), or in
* case 'prev->active_mm == next->mm' through
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
lru_gen_use_mm(next->mm);
if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
}
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
barrier();
return finish_task_switch(prev);
}
In other words, it loads the new process's page-table base address into the page-table base register, and it also switches the rest of the hardware context, e.g. flushing the TLB.
A quick word on the TLB: it is a cache for virtual-to-physical address translations; walking the page tables through the MMU on every access costs quite a few more operations than fetching a cached translation directly.
Next, look at switch_to() (here the MIPS implementation):
/*
* For newly created kernel threads switch_to() will return to
* ret_from_kernel_thread, newly created user threads to ret_from_fork.
* That is, everything following resume() will be skipped for new threads.
* So everything that matters to new threads should be placed before resume().
*/
#define switch_to(prev, next, last) \
do { \
__mips_mt_fpaff_switch_to(prev); \
lose_fpu_inatomic(1, prev); \
if (tsk_used_math(next)) \
__sanitize_fcr31(next); \
if (cpu_has_dsp) { \
__save_dsp(prev); \
__restore_dsp(next); \
} \
if (cop2_present) { \
u32 status = read_c0_status(); \
set_c0_status(ST0_CU2); \
if ((KSTK_STATUS(prev) & ST0_CU2)) { \
if (cop2_lazy_restore) \
KSTK_STATUS(prev) &= ~ST0_CU2; \
cop2_save(prev); \
} \
if (KSTK_STATUS(next) & ST0_CU2 && \
!cop2_lazy_restore) { \
cop2_restore(next); \
} \
write_c0_status(status); \
} \
__clear_r5_hw_ll_bit(); \
__clear_software_ll_bit(); \
if (cpu_has_userlocal) \
write_c0_userlocal(task_thread_info(next)->tp_value); \
__restore_watch(next); \
(last) = resume(prev, next, task_thread_info(next)); \
} while (0)
This is what switches the stack. Two questions naturally come up at this point:
- The switch itself seems to need only two parameters, one for the current process and one for the next, i.e. prev and next. Why is a third parameter, last, needed?
- context_switch() still has code after switch_to(), such as finish_task_switch(). Who ends up executing it?
Sorting out these two questions makes process switching essentially clear, so let's walk through it. Suppose there are two processes, A and B, and A is currently running. A decides to switch and picks B as the next process, so switch_to runs with prev = A and next = B. Once it completes, the CPU is executing in B's context: B's hardware context has been loaded, the registers hold B's state, and B's instructions run. The code after switch_to in A's flow is not executed now; the address of those pending instructions is saved in A's context. So when does that code run? Say a while later some process X on another CPU decides to switch, and its target is A. switch_to runs with prev = X and next = A; when it completes, the CPU loads A's hardware context, which still holds the saved address of the instructions after switch_to, so execution resumes right there inside A. But before A runs its own code, it must do cleanup on behalf of the previous process, and that is what last is for: the process we just switched away from is not necessarily the process A originally switched to, so its identity has to be handed back through a third parameter.
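A compact way to picture the scenario above (an illustration, not kernel code):

/*
 * CPU0: A calls switch_to(A, B, last); A's kernel stack freezes right
 *       inside switch_to(), and CPU0 goes off running B.
 * CPU1: later, X calls switch_to(X, A, last); CPU1 resumes A exactly
 *       where it froze. On A's stack, last now holds X, not B.
 *
 * So finish_task_switch(last) cleans up after X, the task that really
 * ran before A on this CPU.
 */

With that picture in mind, let's see what finish_task_switch actually cleans up: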
/**
* finish_task_switch - clean up after a task-switch
* @prev: the thread we just switched away from.
*
* finish_task_switch must be called after the context switch, paired
* with a prepare_task_switch call before the context switch.
* finish_task_switch will reconcile locking set up by prepare_task_switch,
* and do any other architecture-specific cleanup actions.
*
* Note that we may have delayed dropping an mm in context_switch(). If
* so, we finish that here outside of the runqueue lock. (Doing it
* with the lock held can cause deadlocks; see schedule() for
* details.)
*
* The context switch have flipped the stack from under us and restored the
* local variables which were saved when this task called schedule() in the
* past. prev == current is still correct but we need to recalculate this_rq
* because prev may have moved to another CPU.
*/
static struct rq *finish_task_switch(struct task_struct *prev)
__releases(rq->lock)
{
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
unsigned int prev_state;
/*
* The previous task will have left us with a preempt_count of 2
* because it left us after:
*
* schedule()
* preempt_disable(); // 1
* __schedule()
* raw_spin_lock_irq(&rq->lock) // 2
*
* Also, see FORK_PREEMPT_COUNT.
*/
if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
"corrupted preempt_count: %s/%d/0x%xn",
current->comm, current->pid, preempt_count()))
preempt_count_set(FORK_PREEMPT_COUNT);
rq->prev_mm = NULL;
/*
* A task struct has one reference for the use as "current".
* If a task dies, then it sets TASK_DEAD in tsk->state and calls
* schedule one last time. The schedule call will never return, and
* the scheduled task must drop that reference.
*
* We must observe prev->state before clearing prev->on_cpu (in
* finish_task), otherwise a concurrent wakeup can get prev
* running on another CPU and we could rave with its RUNNING -> DEAD
* transition, resulting in a double drop.
*/
prev_state = READ_ONCE(prev->__state);
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
finish_task(prev);
tick_nohz_task_switch();
finish_lock_switch(rq);
finish_arch_post_lock_switch();
kcov_finish_switch(current);
/*
* kmap_local_sched_out() is invoked with rq::lock held and
* interrupts disabled. There is no requirement for that, but the
* sched out code does not have an interrupt enabled section.
* Restoring the maps on sched in does not require interrupts being
* disabled either.
*/
kmap_local_sched_in();
fire_sched_in_preempt_notifiers(current);
/*
* When switching through a kernel thread, the loop in
* membarrier_{private,global}_expedited() may have observed that
* kernel thread and not issued an IPI. It is therefore possible to
* schedule between user->kernel->user threads without passing though
* switch_mm(). Membarrier requires a barrier after storing to
* rq->curr, before returning to userspace, so provide them here:
*
* - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
* provided by mmdrop(),
* - a sync_core for SYNC_CORE.
*/
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
mmdrop_sched(mm);
}
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
/* Task is done with its stack. */
put_task_stack(prev);
put_task_struct_rcu_user(prev);
}
return rq;
}
One more question: are both processes and threads represented by task_struct in the kernel? Yes. I once ran into the thread_struct structure in the kernel and for a while assumed that processes were task_struct and threads were thread_struct; with more experience it became clear that both are task_struct, the only real difference being that threads share the same mm. So what is thread_struct? Let's take a look (here the arm64 definition):
struct thread_struct {
struct cpu_context cpu_context; /* cpu context */
/*
* Whitelisted fields for hardened usercopy:
* Maintainers must ensure manually that this contains no
* implicit padding.
*/
struct {
unsigned long tp_value; /* TLS register */
unsigned long tp2_value;
struct user_fpsimd_state fpsimd_state;
} uw;
enum fp_type fp_type; /* registers FPSIMD or SVE? */
unsigned int fpsimd_cpu;
void *sve_state; /* SVE registers, if any */
void *za_state; /* ZA register, if any */
unsigned int vl[ARM64_VEC_MAX]; /* vector length */
unsigned int vl_onexec[ARM64_VEC_MAX]; /* vl after next exec */
unsigned long fault_address; /* fault info */
unsigned long fault_code; /* ESR_EL1 value */
struct debug_info debug; /* debugging */
#ifdef CONFIG_ARM64_PTR_AUTH
struct ptrauth_keys_user keys_user;
#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
struct ptrauth_keys_kernel keys_kernel;
#endif
#endif
#ifdef CONFIG_ARM64_MTE
u64 mte_ctrl;
#endif
u64 sctlr_user;
u64 svcr;
u64 tpidr2_el0;
};
cpu_context looks like this:
struct cpu_context {
unsigned long x19;
unsigned long x20;
unsigned long x21;
unsigned long x22;
unsigned long x23;
unsigned long x24;
unsigned long x25;
unsigned long x26;
unsigned long x27;
unsigned long x28;
unsigned long fp;
unsigned long sp;
unsigned long pc;
};
This is the process's hardware context, which should make the picture clear: on a context switch, the register state is saved into this structure, and when the task is switched back in, the structure's contents are loaded back into the registers so execution continues where it left off.
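To make that concrete, a pseudo-C sketch of what the low-level switch routine (cpu_switch_to on arm64, really written in assembly) conceptually does; save_regs/restore_regs are hypothetical stand-ins for the real instructions:

/* Hypothetical helpers standing in for the real assembly. */
void save_regs(struct cpu_context *ctx);    /* store x19-x28, fp, sp, pc */
void restore_regs(struct cpu_context *ctx); /* load them back */

struct task_struct *pseudo_cpu_switch_to(struct task_struct *prev,
                                         struct task_struct *next)
{
	save_regs(&prev->thread.cpu_context);    /* prev freezes right here */
	restore_regs(&next->thread.cpu_context); /* resume next's saved pc/sp */
	/* From this point on we are running on next's stack. */
	return prev;
}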
Multi-core scheduling
SMP multi-core processors are the common case; the topology looks roughly like this:
[Figure: SMP processor topology]
Linux describes the scheduling hierarchy with the sched_domain structure and scheduling groups with sched_group; the scheduling group is the smallest unit of load balancing.
Load balancing
How do you measure a CPU's load? The simplest idea is the total weight of the runnable processes on it, but that is inaccurate because it ignores how processes actually use the CPU; some are I/O-bound, some compute-bound. An improved metric is: CPU load = (running time / total time) * total weight of the runqueue.
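In code form, the idea looks like this (a hedged sketch of the concept only, not the kernel's actual PELT implementation):

/* Conceptual CPU load: runqueue weight scaled by CPU utilization. */
static unsigned long cpu_load(u64 running_ns, u64 total_ns,
                              unsigned long rq_weight)
{
	if (!total_ns)
		return 0;
	return (unsigned long)((running_ns * rq_weight) / total_ns);
}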
The SMP load-balancing machinery starts from a registered softirq: on every scheduler tick the kernel checks whether SMP load balancing is due, with rebalance_domains() as the entry point:
/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
* Balancing parameters are set up in init_sched_domains.
*/
static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
int need_serialize, need_decay = 0;
u64 max_cost = 0;
rcu_read_lock();
for_each_domain(cpu, sd) {
/*
* Decay the newidle max times here because this is a regular
* visit to all the domains.
*/
need_decay = update_newidle_cost(sd, 0);
max_cost += sd->max_newidle_lb_cost;
/*
* Stop the load balance at this level. There is another
* CPU in our sched group which is doing load balancing more
* actively.
*/
if (!continue_balancing) {
if (need_decay)
continue;
break;
}
interval = get_sd_balance_interval(sd, busy);
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
if (!spin_trylock(&balancing))
goto out;
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
/*
* The LBF_DST_PINNED logic could have changed
* env->dst_cpu, so we can't know our idle
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
}
if (need_serialize)
spin_unlock(&balancing);
out:
if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
update_next_balance = 1;
}
}
if (need_decay) {
/*
* Ensure the rq-wide value also decays but keep it at a
* reasonable floor to avoid funnies with rq->avg_idle.
*/
rq->max_idle_balance_cost =
max((u64)sysctl_sched_migration_cost, max_cost);
}
rcu_read_unlock();
/*
* next_balance will be updated only when there is a need.
* When the cpu is attached to null domain for ex, it will not be
* updated.
*/
if (likely(update_next_balance))
rq->next_balance = next_balance;
}
In essence, it traverses the scheduling domains bottom-up starting from the current CPU; if the current CPU is idle, it finds the busiest scheduling group and migrates tasks from it onto the current CPU.