linux内核nfs rpc task处理分析

2023-12-21 10:42:45 浏览数 (1)

本文以centos7 3.10.0-957.21.3.el7.x86_64内核为例介绍linux内核nfs v4.0 处理状态为RPC_TASK_ASYNC的async rpc task的工作机制。

一. 抓取rpc调用的相关perf event

内核rpc处理关键函数调用了trace event,因此可以通过perf event来跟踪rpc task的执行流程

代码语言:c复制
# perf list | grep sunrpc | grep task
  sunrpc:rpc_task_begin                              [Tracepoint event]
  sunrpc:rpc_task_complete                           [Tracepoint event]
  sunrpc:rpc_task_run_action                         [Tracepoint event]
  sunrpc:rpc_task_sleep                              [Tracepoint event]
  sunrpc:rpc_task_wakeup                             [Tracepoint event]
  
  
  /*
   * Tracepoint event class "rpc_task_queued".
   *
   * Events of this class take the RPC client, the task and the wait queue
   * involved.  They record the task id (tk_pid), the client id (cl_clid),
   * tk_timeout, tk_runstate, tk_status, tk_flags and the queue name returned
   * by rpc_qname(q), and print them as
   * "task:<task>@<client> flags=... state=... status=... timeout=... queue=...".
   *
   * NOTE(review): the DEFINE_EVENT lines that instantiate this class are not
   * shown here -- presumably rpc_task_sleep and rpc_task_wakeup; confirm
   * against include/trace/events/sunrpc.h.
   */
  DECLARE_EVENT_CLASS(rpc_task_queued,

        TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task, const struct rpc_wait_queue *q),

        TP_ARGS(clnt, task, q),

        TP_STRUCT__entry(
                __field(unsigned int, task_id)
                __field(unsigned int, client_id)
                __field(unsigned long, timeout)
                __field(unsigned long, runstate)
                __field(int, status)
                __field(unsigned short, flags)
                __string(q_name, rpc_qname(q))
                ),

        TP_fast_assign(
                __entry->client_id = clnt->cl_clid;
                __entry->task_id = task->tk_pid;
                __entry->timeout = task->tk_timeout;
                __entry->runstate = task->tk_runstate;
                __entry->status = task->tk_status;
                __entry->flags = task->tk_flags;
                __assign_str(q_name, rpc_qname(q));
                ),

        TP_printk("task:%u@%u flags=%4.4x state=%4.4lx status=%d timeout=%lu queue=%s",
                __entry->task_id, __entry->client_id,
                __entry->flags,
                __entry->runstate,
                __entry->status,
                __entry->timeout,
                __get_str(q_name)
                )
);

可以通过perf 抓取相关event来看下async rpc task是如何被处理的:

代码语言:c复制
下载nfstest测试工具,源码保存到~/nfstest目录下
https://github.com/imp/nfstest.git

挂载nfs v4.0
# mount | grep nfs
10.0.2.48:/ on /data type nfs4 (rw,relatime,vers=4.0,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,clientaddr=10.0.2.32,local_lock=none,addr=10.0.2.48)

启动perf抓取相关rpc event:
perf record -e sunrpc:rpc_task_run_action -e sunrpc:rpc_task_complete -e sunrpc:rpc_task_wakeup -e sunrpc:rpc_task_begin -ag
设置环境变量并运行nfstest_io访问nfs挂载目录/data:
export PYTHONPATH=~/nfstest
cd /root/nfstest/test
./nfstest_io -d /data -v info -s 1234567 -n 40 -e --rdwronly -r 30 --fsizeavg=4k --fsizedev=0 --odgrade=0 --osync=0 --fsync=0 --write=100 --minfiles 100000 --createlog

二. 分析async rpc task的执行流程

执行结束后终止perf运行,并通过perf script分析抓取的perf event,这里以抓取的perf信息 task->tk_pid为task:48955@1

为例来分析介绍async rpc task的处理过程。

1. 用户态进程发起open nfs操作触发async rpc task请求:

代码语言:c复制

                                                                                       
python 19091 [001] 1306842.235106:      sunrpc:rpc_task_begin: task:0@1 flags=4081 state=0000 status=0 action=(nil)f
        ffffffffc03cefc5 rpc_execute 0xa5 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03be786 rpc_run_task 0xf6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc047d1f7 nfs4_run_open_task 0x117 ([nfsv4])
        ffffffffc0484ac1 _nfs4_open_and_get_state 0x71 ([nfsv4])
        ffffffffc0485120 nfs4_do_open 0x1d0 ([nfsv4])
        ffffffffc0485677 nfs4_atomic_open 0xf7 ([nfsv4])
        ffffffffc043c2f7 nfs_atomic_open 0x197 ([nfs])
        ffffffff87e50563 do_last 0xa53 ([kernel.kallsyms])
        ffffffff87e52bb7 path_openat 0xd7 ([kernel.kallsyms])
        ffffffff87e545bd do_filp_open 0x4d ([kernel.kallsyms])
        ffffffff87e40717 do_sys_open 0x137 ([kernel.kallsyms])
        ffffffff87e4083e sys_open 0x1e ([kernel.kallsyms])
        ffffffff8837606b tracesys 0xa3 ([kernel.kallsyms])
            7f8d50ec2efd [unknown] (/usr/lib64/libpthread-2.17.so)
                 13b9010 [unknown] ([unknown])

 
/*
 * When a user process triggers an async RPC task, it enters the kernel
 * through a system call and calls rpc_run_task() to allocate and initialise
 * an rpc task.  A work item whose handler is rpc_async_schedule is set up
 * for the task and added to the rpciod_workqueue work queue; queue_work()
 * then hands it to a kworker, which runs rpc_async_schedule for the task.
 *
 * Abridged excerpt: the lines made only of dots stand for omitted code.
 */

static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
{
       ......
       ......
        /* Create the RPC task and start executing it. */
        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
                return PTR_ERR(task);
        /* Sleep until the task's RPC_TASK_ACTIVE bit is cleared. */
        status = rpc_wait_for_completion_task(task);
        if (status != 0) {
                /* The wait returned non-zero: flag the open as cancelled. */
                data->cancelled = 1;
                smp_wmb();
        } else
                status = data->rpc_status;
        rpc_put_task(task);

        return status;
}
  
/**
 * rpc_run_task - Allocate a new RPC task, then run rpc_execute against it
 * @task_setup_data: pointer to task initialisation data
 *
 * Returns the new task, or the ERR_PTR value produced by rpc_new_task()
 * when allocation fails.
 */
struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
{
        /* rpc_new_task() allocates the task and initialises it via rpc_init_task(). */
        struct rpc_task *new_task = rpc_new_task(task_setup_data);

        if (IS_ERR(new_task))
                return new_task;

        rpc_task_set_client(new_task, task_setup_data->rpc_client);
        rpc_task_set_rpc_message(new_task, task_setup_data->rpc_message);

        /* No initial state chosen during initialisation: use the default start. */
        if (!new_task->tk_action)
                rpc_call_start(new_task);

        /*
         * Take an additional reference before running the task.
         * NOTE(review): presumably this is the reference the caller later
         * drops with rpc_put_task() -- confirm.
         */
        atomic_inc(&new_task->tk_count);
        rpc_execute(new_task);
        return new_task;
}
        
/*
 * Creation and deletion of RPC task structures
 */

/*
 * rpc_init_task - initialise a freshly allocated RPC task
 * @task: task structure to initialise (fully overwritten)
 * @task_setup_data: caller-supplied flags, callbacks, priority, work queue,
 *                   client/transport and credential
 *
 * Zeroes the task, sets its reference count to 1 and copies the setup
 * parameters into it.  When the callback ops provide rpc_call_prepare,
 * the state machine is set to start at rpc_prepare_task.
 */
static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *task_setup_data)
{
        memset(task, 0, sizeof(*task));
        atomic_set(&task->tk_count, 1);
        task->tk_flags  = task_setup_data->flags;
        task->tk_ops = task_setup_data->callback_ops;
        task->tk_calldata = task_setup_data->callback_data;
        INIT_LIST_HEAD(&task->tk_task);

        task->tk_priority = task_setup_data->priority - RPC_PRIORITY_LOW;
        /* Remember the tgid of the process that triggered this RPC task. */
        task->tk_owner = current->tgid;

        /*
         * Initialize workqueue for async tasks.
         *
         * This is the work queue taken from the setup data.  It is distinct
         * from the wait queue the task sleeps on (tk_waitqueue), which
         * changes during the task's life: a task that has not been sent yet
         * waits on the queue named Seqid_waitqueue.
         */
        task->tk_workqueue = task_setup_data->workqueue;

        task->tk_xprt = rpc_task_get_xprt(task_setup_data->rpc_client,
                        xprt_get(task_setup_data->rpc_xprt));

        task->tk_op_cred = get_rpccred(task_setup_data->rpc_op_cred);

        /*
         * Taken for the NFSv4 OPEN task followed in this article: its first
         * traced action is rpc_prepare_task.
         */
        if (task->tk_ops->rpc_call_prepare != NULL)
                task->tk_action = rpc_prepare_task;

        /*
         * rpc_init_task_statistics() stores the current time (ktime_get())
         * in task->tk_start, so the creation time of the task can be
         * recovered from that field.
         */
        rpc_init_task_statistics(task);

        /* The format must end in "\n"; the extracted text had a bare 'n'. */
        dprintk("RPC:       new task initialized, procpid %u\n",
                                task_pid_nr(current));
}
        
/*
 * rpc_execute - mark a task active and start running it
 * @task: task to run
 *
 * The task is marked active and made runnable on rpciod_workqueue.  For an
 * async task, rpc_make_runnable() queues a work item whose handler is
 * rpc_async_schedule, and this function returns without running the state
 * machine itself.  A synchronous task is run inline via __rpc_execute().
 */
void rpc_execute(struct rpc_task *task)
{
        /*
         * Sampled before the task is made runnable.
         * NOTE(review): presumably because the task must not be touched once
         * a worker may be running it -- confirm.
         */
        const bool async = RPC_IS_ASYNC(task);

        /* Sets the RPC_TASK_ACTIVE bit in task->tk_runstate. */
        rpc_set_active(task);
        rpc_make_runnable(rpciod_workqueue, task);

        /* Async tasks never reach the inline __rpc_execute() call below. */
        if (async)
                return;
        __rpc_execute(task);
}
                


/*
 * rpc_set_active - mark a task as active
 *
 * Emits the rpc_task_begin tracepoint (with a NULL queue), calls
 * rpc_task_set_debuginfo() and then sets the RPC_TASK_ACTIVE bit in
 * task->tk_runstate.
 */
static void rpc_set_active(struct rpc_task *task)
{
        /*
         * NOTE(review): the tracepoint fires before rpc_task_set_debuginfo();
         * presumably that is why the rpc_task_begin sample prints "task:0" --
         * confirm that tk_pid is assigned there.
         */
        trace_rpc_task_begin(task->tk_client, task, NULL);

        rpc_task_set_debuginfo(task);
        set_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
}
 
/*
 * rpc_make_runnable - hand a task to whoever will run it next
 * @wq: work queue used for async tasks
 * @task: task to make runnable
 *
 * Marks the task as running and clears its queued bit.  If the task was
 * already marked running, nothing more is done.  Otherwise an async task
 * gets a work item whose handler is rpc_async_schedule, queued on @wq with
 * queue_work() so that a kworker runs it; a synchronous task has the
 * waiter on its RPC_TASK_QUEUED bit woken instead.
 */
static void rpc_make_runnable(struct workqueue_struct *wq,
                struct rpc_task *task)
{
        bool was_running = rpc_test_and_set_running(task);

        rpc_clear_queued(task);
        if (was_running)
                return;

        if (!RPC_IS_ASYNC(task)) {
                wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
                return;
        }

        INIT_WORK(&task->u.tk_work, rpc_async_schedule);
        queue_work(wq, &task->u.tk_work);
}
        
     
/*
 * Once rpc_make_runnable() has used queue_work() to hand the async RPC task
 * triggered by the NFS access to a kworker, the user process returns to
 * nfs4_run_open_task() and calls rpc_wait_for_completion_task().  There it
 * sleeps until that async RPC task has completed and the process is woken.
 */

static inline int rpc_wait_for_completion_task(struct rpc_task *task)
{
        /* NULL selects the default wait action in __rpc_wait_for_completion_task(). */
        return __rpc_wait_for_completion_task(task, NULL);
}
        
/*
 * __rpc_wait_for_completion_task - wait for a task to stop being active
 * @task: task to wait for
 * @action: wait action, or NULL to use rpc_wait_bit_killable
 *
 * Waits, in TASK_KILLABLE state, on the RPC_TASK_ACTIVE bit of
 * task->tk_runstate.  The wait queue is the one keyed by that word and bit,
 * so the sleeper is tied to this particular task.
 */
int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *action)
{
        wait_bit_action_f *wait_fn = action ? action : rpc_wait_bit_killable;

        return out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE,
                        wait_fn, TASK_KILLABLE);
}
/*
 * out_of_line_wait_on_bit - sleep on the wait queue for (word, bit)
 *
 * In the flow described here, the user process blocks in this call until
 * the RPC task it triggered has been processed by the NFS server and the
 * reply has been received; only then is it woken.
 */
int __sched out_of_line_wait_on_bit(void *word, int bit,
                                    wait_bit_action_f *action, unsigned mode)
{
        DEFINE_WAIT_BIT(waiter, word, bit);
        wait_queue_head_t *head = bit_waitqueue(word, bit);

        return __wait_on_bit(head, &waiter, action, mode);
}


                                                                                       

2. 被唤醒的kworker进程处理async rpc task的工作任务,将async rpc task放入名为Seqid_waitqueue

的等待队列中等待发送。

代码语言:c复制
kworker/u4:3 30791 [001] 1306842.241001: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=rpc_prepare_task
        ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
        
       
在前面第1点中提到当进程访问nfs时进入内核调用
nfs4_run_open_task->rpc_run_task ->rpc_execute->rpc_make_runnable(rpciod_workqueue, task)路径
中rpc_make_runnable初始化会给async rpc task 注册一个工作任务处理函数rpc_async_schedule后
添加到rpciod_workqueue工作队列, 然后唤醒内核线程处理rpciod_workqueue工作队列上的async rpc task
工作任务处理函数rpc_async_schedule。

相关调用路径如下,task->tk_action对应rpc_task.tk_action,在前面第1点介绍初始化rpc task时
rpc_task.tk_action被赋值为rpc_prepare_task,最终调用到__rpc_add_wait_queue 将
async task放入到Seqid_waitqueue等待队列中等待发送,__rpc_add_wait_queue还会
调用rpc_set_queued(task)设置rpc_task.tk_runstate的RPC_TASK_QUEUED bit位。
rpc_async_schedule->
       __rpc_execute->
              (task->tk_action):rpc_prepare_task->
                                 nfs4_open_prepare->
                                        nfs_wait_on_sequence->
                                                  rpc_sleep_on->
                                                      __rpc_sleep_on_priority->
                                                                    __rpc_add_wait_queue 
          
          
 /*
  * rpc_async_schedule - work queue handler for async RPC tasks
  * @work: the tk_work item embedded in the task (task->u.tk_work)
  *
  * Recovers the owning rpc_task from the work item and runs its state
  * machine.
  */
 static void rpc_async_schedule(struct work_struct *work)
{
        struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);

        __rpc_execute(task);
}

__rpc_execute重点关注RPC_IS_QUEUED(task)和task->tk_action,这两个在rpc task的处理
过程中是会变化的:
 /*
 * This is the RPC `scheduler' (or rather, the finite state machine).
 *
 * Abridged excerpt: the lines made only of dots stand for omitted code
 * (including the declarations of task_is_async and status).  The comments
 * follow the first pass of the async OPEN task, whose tk_action is
 * rpc_prepare_task.
 */
static void __rpc_execute(struct rpc_task *task)
{
       ......
       ......
        /*
         * rpc_make_runnable() (step 1) already called rpc_clear_queued(task),
         * so RPC_IS_QUEUED(task) is false here and execution continues.
         */
        if (RPC_IS_QUEUED(task))
                return;

        for (;;) {
                void (*do_action)(struct rpc_task *);

                /*
                 * Execute any pending callback first.
                 */
                do_action = task->tk_callback; /* NULL on this pass */
                task->tk_callback = NULL;
                if (do_action == NULL) {
                        /*
                         * Perform the next FSM step.
                         * tk_action may be NULL if the task has been killed.
                         * In particular, note that rpc_killall_tasks may
                         * do this at any time, so beware when dereferencing.
                         */
                        do_action = task->tk_action; /* rpc_prepare_task */
                        if (do_action == NULL)
                                break;
                }
                trace_rpc_task_run_action(task->tk_client, task, task->tk_action);
                do_action(task); /* rpc_prepare_task */

                /*
                 * Lockless check for whether task is sleeping or not.
                 *
                 * rpc_prepare_task ends up in __rpc_add_wait_queue(), which
                 * sets the RPC_TASK_QUEUED bit in tk_runstate.  So
                 * RPC_IS_QUEUED(task) is now true, the condition below is
                 * false, and execution falls through instead of looping.
                 */
                if (!RPC_IS_QUEUED(task))
                        continue;
                 .......
                 .......
                /* An async task leaves __rpc_execute here. */
                if (task_is_async)
                        return;

                .......
                .......
        }

        /* The format must end in "\n"; the extracted text had a bare 'n'. */
        dprintk("RPC: %5u return %d, status %d\n", task->tk_pid, status,
                        task->tk_status);
        /* Release all resources associated with the task */
        rpc_release_task(task);
}
         

3. 当上一个rpc task发送给server端并收到响应时最终会唤醒该被响应的rpc task对应的用户态进程

代码语言:c复制
当rpc task收到server端的响应后会被从xprt_pending等待队列中移除,
并唤醒内核线程处理关联该rpc task的工作任务处理rpc_async_schedule函数
kworker/1:1H   271 [001] 1306842.242928:     sunrpc:rpc_task_wakeup: task:48954@1 flags=4881 state=000e status=0 timeout=60000 queue=xprt_pending
        ffffffffc03ccee6 __rpc_do_wake_up_task_on_wq 0xf6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd570 rpc_wake_up_task_queue_locked 0x30 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd5a2 rpc_wake_up_queued_task 0x22 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03c2ba7 xprt_complete_rqst 0x137 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03c9240 xs_tcp_data_recv 0x740 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff882919db tcp_read_sock 0xab ([kernel.kallsyms])
        ffffffffc03c5b03 xs_tcp_data_receive_workfn 0xb3 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
      

  
/*__rpc_do_wake_up_task_on_wq调用__rpc_remove_wait_queue将已经收到响应的rpc task从
xprt_pending等待队列移出,rpc_make_runnable调用rpc_clear_queued(task)对该rpc task的
rpc_task.tk_runstate的RPC_TASK_QUEUED 位清0,并且为该rpc task注册一个工作任务加入工作队列
rpciod_workqueue中,并调用queue_work唤醒内核线程执行该rpc task的工作任务处理函数rpc_async_schedule。*/
           
 xprt_complete_rqst->
    rpc_wake_up_queued_task->
         rpc_wake_up_task_queue_locked->
                rpc_wake_up_task_on_wq_queue_locked->
                                __rpc_do_wake_up_task_on_wq->
                                     rpc_make_runnable
                                           
 /*
  * xprt_complete_rqst - finish receive-side handling of a reply
  * @task: RPC task whose request has been answered
  * @copied: number of reply bytes received
  *
  * Counts the receive, records the round-trip time, unlinks the request
  * from its list, publishes the reply length and finally wakes the task
  * sleeping on xprt->pending.
  */
 void xprt_complete_rqst(struct rpc_task *task, int copied)
{
        struct rpc_rqst *req = task->tk_rqstp;
        struct rpc_xprt *xprt = req->rq_xprt;

        /* Three conversions for three arguments; the xid is printed as %08x. */
        dprintk("RPC: %5u xid %08x complete (%d bytes received)\n",
                        task->tk_pid, ntohl(req->rq_xid), copied);
        trace_xprt_complete_rqst(xprt, req->rq_xid, copied);

        xprt->stat.recvs++;
        /*
         * rq_rtt is the time from sending the RPC task to receiving the NFS
         * server's reply.  rq_xtime is recorded in xprt_transmit(), the
         * function that sends the task (not shown in this excerpt).
         */
        req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime);
        if (xprt->ops->timer != NULL)
                xprt_update_rtt(task);

        list_del_init(&req->rq_list);
        req->rq_private_buf.len = copied;
        /* Ensure all writes are done before we update */
        /* req->rq_reply_bytes_recvd */
        smp_wmb();
        req->rq_reply_bytes_recvd = copied;
        /*
         * Wake the task on the wait queue named xprt_pending.  A task joins
         * that queue once the client has sent it.  When the server's reply
         * arrives, this call ends up in __rpc_do_wake_up_task_on_wq(), which
         * removes the task from xprt->pending and calls rpc_make_runnable().
         * That clears the RPC_TASK_QUEUED bit in tk_runstate and, for an
         * async task, queues a work item (handler rpc_async_schedule) for a
         * kworker to run.
         *
         * For a task that has been sent, rpc_task.tk_waitqueue and
         * &xprt->pending are the same wait queue.
         */
        rpc_wake_up_queued_task(&xprt->pending, task);
}

 
                
/*
 * Wake up a queued task while the queue lock is being held
 *
 * Forwards to rpc_wake_up_task_on_wq_queue_locked(), supplying
 * rpciod_workqueue as the work queue argument.
 */
static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
{
        rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task);
}

/*
 * Wake up a task on a specific queue
 *
 * Holds queue->lock (taken with spin_lock_bh) around the call to the
 * lock-held wake-up helper.
 */
void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task)
{
        spin_lock_bh(&queue->lock);
        rpc_wake_up_task_queue_locked(queue, task);
        spin_unlock_bh(&queue->lock);
}
 
/*
 * __rpc_do_wake_up_task_on_wq - take a task off a wait queue and run it
 * @wq: work queue passed on to rpc_make_runnable()
 * @queue: wait queue the task is currently on
 * @task: task to wake
 *
 * A task that is not activated is reported with an error message and left
 * untouched.  Otherwise the rpc_task_wakeup tracepoint is emitted, the task
 * is removed from @queue and handed to rpc_make_runnable().
 *
 * All three messages must end in "\n"; the extracted text had a bare 'n'.
 */
static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
                struct rpc_wait_queue *queue,
                struct rpc_task *task)
{
        dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n",
                        task->tk_pid, jiffies);

        /* Has the task been executed yet? If not, we cannot wake it up! */
        if (!RPC_IS_ACTIVATED(task)) {
                printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task);
                return;
        }

        trace_rpc_task_wakeup(task->tk_client, task, queue);

        __rpc_remove_wait_queue(queue, task);

        rpc_make_runnable(wq, task);

        dprintk("RPC:       __rpc_wake_up_task done\n");
}

/*
 * rpc_make_runnable - hand a task to whoever will run it next
 * @wq: work queue used for async tasks
 * @task: task to make runnable
 *
 * Marks the task as running and clears its queued bit.  If the task was
 * already marked running, nothing more is done.  Otherwise an async task
 * gets a work item whose handler is rpc_async_schedule, queued on @wq; a
 * synchronous task has the waiter on its RPC_TASK_QUEUED bit woken instead.
 */
static void rpc_make_runnable(struct workqueue_struct *wq,
                struct rpc_task *task)
{
        bool was_running = rpc_test_and_set_running(task);

        rpc_clear_queued(task);
        if (was_running)
                return;

        if (!RPC_IS_ASYNC(task)) {
                wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
                return;
        }

        INIT_WORK(&task->u.tk_work, rpc_async_schedule);
        queue_work(wq, &task->u.tk_work);
}
                
                
                                

已发送的rpc task收到server端响应后最终会唤醒内核工作线程来执行该rpc task的工作任务
处理函数rpc_async_schedule,其主要任务就是在__rpc_execute循环调用task->tk_action
指向的函数,call_status最终会将task->tk_action设置为call_decode,
call_decode会将task->tk_action改为rpc_exit_task,完成已被响应的rpc task的处理,rpc_exit_task
会将task->tk_action设置为NULL,使得__rpc_execute可以结束for循环。
最后__rpc_execute调用rpc_release_task->rpc_complete_task来唤醒
因访问nfs而触发该rpc task请求的用户进程,前面第1点分析介绍该用户进程阻塞休眠在
以task->tk_runstate和RPC_TASK_ACTIVE定义的等待队列上。进程被唤醒后会触发工作线程
调用rpc_async_schedule处理下一个已在Seqid_waitqueue等待队列的async rpc task。
                                       
 rpc_async_schedule->
          __rpc_execute->
               ->(task->tk_action):call_status
               ->(task->tk_action):call_decode
               ->(task->tk_action):rpc_exit_task
                                     ->(task->tk_ops->rpc_call_done):nfs4_open_done
               ->rpc_release_task
                        ->rpc_complete_task
                       
                         
/*
 * __rpc_execute - run the RPC state machine for a task
 *
 * Abridged excerpt: the lines made only of dots stand for omitted code.
 * The comments follow a task whose reply has been received: it runs
 * call_status, call_decode and rpc_exit_task in turn, then leaves the loop
 * and is released.
 */
static void __rpc_execute(struct rpc_task *task)
{
        struct rpc_wait_queue *queue;
        int task_is_async = RPC_IS_ASYNC(task);
        int status = 0;

        dprintk("RPC: %5u __rpc_execute flags=0x%x\n",
                        task->tk_pid, task->tk_flags);

        WARN_ON_ONCE(RPC_IS_QUEUED(task));
        /*
         * After the reply arrived, the xprt_complete_rqst() path ended in
         * rpc_make_runnable(), which cleared the RPC_TASK_QUEUED bit in
         * tk_runstate.  RPC_IS_QUEUED is therefore false and execution
         * continues.
         */
        if (RPC_IS_QUEUED(task))
                return;

        for (;;) {
                void (*do_action)(struct rpc_task *);

                /*
                 * Execute any pending callback first.
                 */
                do_action = task->tk_callback;
                task->tk_callback = NULL;
                if (do_action == NULL) {
                        /*
                         * Perform the next FSM step.
                         * tk_action may be NULL if the task has been killed.
                         * In particular, note that rpc_killall_tasks may
                         * do this at any time, so beware when dereferencing.
                         */
                        do_action = task->tk_action;
                        /*
                         * Once rpc_exit_task has set task->tk_action to NULL,
                         * this test is true and the loop is left.
                         */
                        if (do_action == NULL)
                                break;
                }
                trace_rpc_task_run_action(task->tk_client, task, task->tk_action);
                /*
                 * A sent-and-answered task runs call_status, call_decode and
                 * rpc_exit_task in turn, until rpc_exit_task sets
                 * task->tk_action to NULL.
                 */
                do_action(task);

                /*
                 * Lockless check for whether task is sleeping or not.
                 * For a task whose reply has arrived, the condition is true
                 * and the loop continues with the next action.
                 */
                if (!RPC_IS_QUEUED(task))
                        continue;
                .......
                .......
        }

        dprintk("RPC: %5u return %d, status %d\n", task->tk_pid, status,
                        task->tk_status);
        /*
         * Release all resources associated with the task.  The path
         * rpc_release_task -> rpc_complete_task -> __wake_up_locked_key
         * wakes the waiting user process.
         */
        rpc_release_task(task);
}


rpc_exit_task会将task->tk_action设置为NULL,
使得跳出__rpc_execute的for循环并执行rpc_release_task后从__rpc_execute返回    

/*
 * rpc_exit_task - final state of the RPC state machine
 * @task: task that is finishing
 *
 * Clears tk_action so that the loop in __rpc_execute() terminates, then
 * invokes the rpc_call_done callback if the task's ops provide one (for the
 * OPEN task followed here that is nfs4_open_done).  If the callback set
 * tk_action again, the RPC slot and buffer memory are released and the
 * task statistics are reset.
 */
void rpc_exit_task(struct rpc_task *task)
{
        task->tk_action = NULL;

        if (task->tk_ops->rpc_call_done == NULL)
                return;
        task->tk_ops->rpc_call_done(task, task->tk_calldata);

        /* The callback left tk_action NULL: nothing more to do. */
        if (task->tk_action == NULL)
                return;

        WARN_ON(RPC_ASSASSINATED(task));
        /* Always release the RPC slot and buffer memory */
        xprt_release(task);
        rpc_reset_task_statistics(task);
}

/*
 * Mark an RPC call as having completed by clearing the 'active' bit
 * and then waking up all tasks that were sleeping.
 */     
static int rpc_complete_task(struct rpc_task *task)
{       
        void *m = &task->tk_runstate;
        wait_queue_head_t *wq = bit_waitqueue(m, RPC_TASK_ACTIVE);
        struct wait_bit_key k = __WAIT_BIT_KEY_INITIALIZER(m, RPC_TASK_ACTIVE);
        unsigned long flags;
        int ret;

        trace_rpc_task_complete(task->tk_client, task, NULL);

        spin_lock_irqsave(&wq->lock, flags);
        clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
        ret = atomic_dec_and_test(&task->tk_count);
        if (waitqueue_active(wq))
                __wake_up_locked_key(wq, TASK_NORMAL, &k);
        spin_unlock_irqrestore(&wq->lock, flags);
        return ret;
}      

4. 前面第1点介绍访问nfs的用户态进程生成一个async rpc task后会阻塞休眠在以task->tk_runstate和RPC_TASK_ACTIVE定义的等待队列上。然后唤醒内核线程,第2点介绍内核线程会调用rpc_prepare_task将 async task放入到名为Seqid_waitqueue的rpc_task.tk_waitqueue等待队列中,那这个rpc task什么时候会被发送出去,并移到名为xprt_pending的已被发送

队列&xprt->pending等待nfs server端响应呢?

第3点介绍当上一个已被发送的rpc task收到响应后,休眠阻塞等待在以task->tk_runstate和RPC_TASK_ACTIVE定义的等待队列的用户态进程会被唤醒,用户态进程被唤醒后就会处理名为Seqid_waitqueue的rpc_task.tk_waitqueue等待队列的下一个

async rpc task。 比如第3点中task:48954@1的rpc task处理完成后用户态进程最终被唤醒,唤醒后会触发工作线程处理下一个

已在Seqid_waitqueue等待队列的async rpc task,将下一个待处理的async rpc task从Seqid_waitqueue队列移出并唤醒内核线程处理工作任务回调函数rpc_async_schedule。

代码语言:c复制

                 
python 19108 [001] 1306842.242959:     sunrpc:rpc_task_wakeup: task:48955@1 flags=4081 state=0006 status=0 timeout=0 queue=Seqid_waitqueue
        ffffffffc03ccee6 __rpc_do_wake_up_task_on_wq 0xf6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd570 rpc_wake_up_task_queue_locked 0x30 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd5a2 rpc_wake_up_queued_task 0x22 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc04988bb nfs_release_seqid 0x5b ([nfsv4])
        ffffffffc04815c2 nfs4_opendata_to_nfs4_state 0x1f2 ([nfsv4])
        ffffffffc0484b24 _nfs4_open_and_get_state 0xd4 ([nfsv4])
        ffffffffc0485120 nfs4_do_open 0x1d0 ([nfsv4])
        ffffffffc0485677 nfs4_atomic_open 0xf7 ([nfsv4])
        ffffffffc043c2f7 nfs_atomic_open 0x197 ([nfs])
        ffffffff87e50563 do_last 0xa53 ([kernel.kallsyms])
        ffffffff87e52bb7 path_openat 0xd7 ([kernel.kallsyms])
        ffffffff87e545bd do_filp_open 0x4d ([kernel.kallsyms])
        ffffffff87e40717 do_sys_open 0x137 ([kernel.kallsyms])
        ffffffff87e4083e sys_open 0x1e ([kernel.kallsyms])
        ffffffff8837606b tracesys 0xa3 ([kernel.kallsyms])
            7f8d50ec2efd [unknown] (/usr/lib64/libpthread-2.17.so)
                 13b9010 [unknown] ([unknown])

task:48954@1的rpc task处理完成后用户态进程最终被唤醒,唤醒后通过如下调用路径将要处理的
下一个async rpc task(这里是task:48955@1)从Seqid_waitqueue等待队列中移出,并通过
rpc_make_runnable为即将要发送的async rpc task注册一个工作任务,工作任务处理函数为rpc_async_schedule
,将该工作任务加入rpciod_workqueue工作队列并唤醒内核线程调用rpc_async_schedule。

代码调用栈路径如下:                             
 _nfs4_open_and_get_state->
       _nfs4_opendata_to_nfs4_state->
              nfs_release_seqid->
                  rpc_wake_up_queued_task->
                        rpc_wake_up_task_queue_locked->
                            rpc_wake_up_task_on_wq_queue_locked->
                                   __rpc_do_wake_up_task_on_wq->
                                        __rpc_remove_wait_queue
                                        rpc_make_runnable  
                                        
 
从nfs_release_seqid函数中可以看到调用rpc_wake_up_queued_task时传入
的第一个参数是&sequence->wait作为rpc_wait_queue等待队列,
实际上未发送的async rpc task其rpc_task.tk_waitqueue跟&sequence->wait指向
的是同一个名为Seqid_waitqueue的队列。

当async rpc task被发送出去后,rpc_task.tk_waitqueue会被修改为名为xprt_pending的
rpc_wait_queue等待队列。&sequence->wait会保持不变还是指向名为Seqid_waitqueue的
rpc_wait_queue等待队列。                            
/*
 * nfs_release_seqid - drop a seqid from its sequence and wake the next waiter
 * @seqid: seqid to release; NULL or an unlinked seqid is ignored
 *
 * Under sequence->lock the seqid is unlinked from the sequence list.  If
 * other seqids are still queued, the task owning the first one is woken on
 * &sequence->wait, the rpc_wait_queue named Seqid_waitqueue.
 */
void nfs_release_seqid(struct nfs_seqid *seqid)
{
        struct nfs_seqid_counter *counter;
        struct nfs_seqid *head;

        if (!seqid || list_empty(&seqid->list))
                return;

        counter = seqid->sequence;
        spin_lock(&counter->lock);
        list_del_init(&seqid->list);
        if (list_empty(&counter->list))
                goto unlock;

        head = list_first_entry(&counter->list, struct nfs_seqid, list);
        rpc_wake_up_queued_task(&counter->wait, head->task);
unlock:
        spin_unlock(&counter->lock);
}
                                        

5. 在第4点中介绍到当上一个rpc task被响应后,已被响应的rpc task关联的进程会被唤醒并为待发送等待队列Seqid_waitqueue里的下一个async rpc task注册一个工作任务并唤醒内核线程处理该即将发送的rpc task。

到这一步内核线程调用rpc_async_schedule完成将async rpc task从client端发送出去。

代码语言:c复制
rpc_async_schedule将rpc task发送出去并将该已发送成功的
rpc task放入名为xprt_pending的rpc_wait_queue等待队列&xprt->pending中,
在__rpc_add_wait_queue函数中会将rpc_task.tk_waitqueue改为指向&xprt->pending。

call_transmit调用xprt_transmit完成rpc task的发送返回call_transmit后
会调用call_transmit_status将task->tk_action设置为call_status,后面第6点
中当该rpc task收到server端的响应后会调用该call_status。


被唤醒的内核线程的调用路径如下:
 rpc_async_schedule->
     __rpc_execute->
       ->(task->tk_action):rpc_prepare_task
              ->nfs4_open_prepare->
                    nfs4_setup_sequence->
                             rpc_call_start->
                                 ->(task->tk_action):call_start 
                                 ->(task->tk_action):call_reserve  
                                 ->(task->tk_action):call_reserveresult 
                                 ->(task->tk_action):call_refresh 
                                 ->(task->tk_action):call_refreshresult 
                                 ->(task->tk_action):call_allocate 
                                 ->(task->tk_action):call_bind 
                                 ->(task->tk_action):call_connect 
                                  ->(task->tk_action):call_transmit//这里会将task->tk_action改为call_status
                                       ->xprt_transmit//发送rpc task并记录一些stat和发送时间rq_xtime
                                             ->xprt->ops->send_request(task):xs_tcp_send_request//发送rpc_task  
                                       //将即将发送的rpc task放入xprt->pending队列       
                                       ->rpc_sleep_on(&xprt->pending, task, xprt_timer);
                                           ->__rpc_sleep_on_priority
                                           
 
 call_transmit调用xprt_transmit发送rpc task并执行__rpc_add_wait_queue调用
 rpc_set_queued(task)设置tk_runstate的RPC_TASK_QUEUED位为1,
 发送完rpc task后返回call_transmit调用call_transmit_status将task->tk_action改为call_status。
 
 
 __rpc_sleep_on_priority会执行将rpc task放入名为xprt_pending的rpc_wait_queue
 等待队列,并设置task->tk_callback 为xprt_timer;
 __rpc_add_timer为已经发送的rpc task设置超时时间为rpc_task.tk_timeout(60000个jiffies)
 的超时定时器并添加到rpc_wait_queue等待队列timer_list链表rpc_wait_queue.timer_list,
 定时器超时回调函数为__rpc_queue_timer_fn
 
 /*
  * __rpc_sleep_on_priority - put a task to sleep on a wait queue
  * @q: wait queue to sleep on
  * @task: task to queue
  * @action: callback stored in task->tk_callback
  * @queue_priority: priority passed to __rpc_add_wait_queue()
  *
  * Emits the rpc_task_sleep tracepoint, adds the task to @q, records
  * @action as the pending callback and arms the task's timer on the queue.
  */
 static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
                struct rpc_task *task,
                rpc_action action,
                unsigned char queue_priority)
{
        /* Inner quotes are escaped and the message ends in "\n". */
        dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n",
                        task->tk_pid, rpc_qname(q), jiffies);

        trace_rpc_task_sleep(task->tk_client, task, q);
        /*
         * __rpc_add_wait_queue() calls rpc_set_queued(task), which sets the
         * RPC_TASK_QUEUED bit in tk_runstate.
         */
        __rpc_add_wait_queue(q, task, queue_priority);

        WARN_ON_ONCE(task->tk_callback != NULL);
        /* On the transmit path described here this is xprt_timer. */
        task->tk_callback = action;
        __rpc_add_timer(q, task);
}

当rpc task长时间没有收到nfs server的响应后,超时定时器回调函数__rpc_queue_timer_fn
会被执行调用rpc_wake_up_task_queue_locked触发重新发送该rpc task。
/*
 * __rpc_queue_timer_fn - timer callback for an rpc_wait_queue
 * @ptr: the struct rpc_wait_queue, passed as an unsigned long
 *
 * Under queue->lock, walks the queue's timer list.  Every task whose expiry
 * time has been reached gets tk_status = -ETIMEDOUT and is woken via
 * rpc_wake_up_task_queue_locked().  For the remaining tasks the earliest
 * expiry is tracked, and the queue timer is re-armed with it if the timer
 * list is not empty afterwards.
 */
static void __rpc_queue_timer_fn(unsigned long ptr)
{
        struct rpc_wait_queue *queue = (struct rpc_wait_queue *)ptr;
        struct rpc_task *task, *n;
        unsigned long expires, now, timeo;

        spin_lock(&queue->lock);
        expires = now = jiffies;
        list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) {
                timeo = task->u.tk_wait.expires;
                if (time_after_eq(now, timeo)) {
                        /* The message must end in "\n", not a literal 'n'. */
                        dprintk("RPC: %5u timeout\n", task->tk_pid);
                        task->tk_status = -ETIMEDOUT;
                        rpc_wake_up_task_queue_locked(queue, task);
                        continue;
                }
                /* expires == now means no pending expiry has been seen yet. */
                if (expires == now || time_after(expires, timeo))
                        expires = timeo;
        }
        if (!list_empty(&queue->timer_list.list))
                rpc_set_queue_timer(queue, expires);
        spin_unlock(&queue->lock);
}
                                          
/*
 * Abridged excerpt of __rpc_execute(); the "......" lines mark code that
 * has been left out. Shown here: the entry check on RPC_IS_QUEUED(task),
 * the loop that runs do_action(task), and the early return taken for
 * async tasks once the task is queued again.
 */
static void __rpc_execute(struct rpc_task *task)
{
    WARN_ON_ONCE(RPC_IS_QUEUED(task));
    /* Between the moment the async rpc task is removed from the
    Seqid_waitqueue wait queue and the moment it has been sent and added to
    the xprt_pending wait queue, RPC_IS_QUEUED is false, so we do not return
    here and execution continues below. */
        if (RPC_IS_QUEUED(task))
                return;
          for (;;) {
              ......
              ......
              do_action(task);

                /*
                 * Lockless check for whether task is sleeping or not.
                 */
                /* When do_action points to call_transmit: call_transmit, while
                calling xprt_transmit, goes through __rpc_add_wait_queue, which
                calls rpc_set_queued(task) and sets the RPC_TASK_QUEUED bit of
                tk_runstate to 1. So after call_transmit returns,
                !RPC_IS_QUEUED(task) is false and the continue is not taken. */
                if (!RPC_IS_QUEUED(task))
                        continue;
                 .......
                 .......
                if (task_is_async)// execution reaches here and leaves __rpc_execute
                        return;
                ........
          }
}
 
// call_transmit finally calls call_transmit_status, which changes
// task->tk_action to call_status.
// Abridged excerpt: the "......" lines mark code that has been left out.
static void
call_transmit_status(struct rpc_task *task)
{
        task->tk_action = call_status;
        ......
        ......
 }
                                                                                                                                                                                                                                                             
                                       
perf 抓取的trace event过滤rpc_task.tk_pid为task:48955@1的rpc task调用过程,其中
action对应的是task->tk_action指向的函数名,可以看到task->tk_action在每个阶段会被修改。  

rpc_prepare_task->nfs40_call_sync_prepare->rpc_call_start
在rpc_call_start中task->tk_action被修改为call_start,call_start将task->tk_action
改为call_reserve,call_reserve将task->tk_action改为call_reserveresult,
call_reserveresult将task->tk_action改为call_refresh,call_refresh将task->tk_action
改为call_refreshresult,call_refreshresult将task->tk_action改为call_allocate,
call_allocate将task->tk_action改为call_bind,call_bind将task->tk_action改为call_connect,
call_connect将task->tk_action改为call_transmit,call_transmit将task->tk_action改为call_status
                                                                        
kworker/u4:3 30791 [000] 1306842.242968: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=rpc_prepare_task
        ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])

kworker/u4:3 30791 [000] 1306842.242969: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_start
        ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])

kworker/u4:3 30791 [000] 1306842.242970: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_reserve
        ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])

kworker/u4:3 30791 [000] 1306842.242971: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_reserveresult
        ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])

kworker/u4:3 30791 [000] 1306842.242972: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_refresh
        ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
        
 kworker/u4:3 30791 [000] 1306842.242973: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_refreshresult
        ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])

kworker/u4:3 30791 [000] 1306842.242974: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_allocate
        ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])

kworker/u4:3 30791 [000] 1306842.242975: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_bind
        ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])

kworker/u4:3 30791 [000] 1306842.242976: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_connect
        ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])  
        
 kworker/u4:3 30791 [000] 1306842.242977: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_transmit
        ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])                 

6. 已发送的rpc task收到server端响应后最终会唤醒内核工作线程来执行该rpc task的工作任务处理函数rpc_async_schedule,其主要任务就是在__rpc_execute循环调用task->tk_action 指向的函数完成已被响应的rpc task的处理,最后调用rpc_complete_task来唤醒访问nfs而触发该rpc task请求的用户进程,前面介绍该用户进程完成rpc task的分配后

阻塞休眠在 以task->tk_runstate和RPC_TASK_ACTIVE定义的等待队列上,详细分析可以参考前面第3点task:48954@1 被响应后的处理分析。

代码语言:c复制
当rpc task收到server端的响应后会被从xprt_pending等待队列中移除,
并唤醒内核线程处理关联该rpc task的工作任务处理rpc_async_schedule函数
        
        
kworker/1:1H   271 [001] 1306842.243140:     sunrpc:rpc_task_wakeup: task:48955@1 flags=4881 state=000e status=0 timeout=60000 queue=xprt_pending
        ffffffffc03ccee6 __rpc_do_wake_up_task_on_wq 0xf6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd570 rpc_wake_up_task_queue_locked 0x30 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd5a2 rpc_wake_up_queued_task 0x22 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03c2ba7 xprt_complete_rqst 0x137 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03c9240 xs_tcp_data_recv 0x740 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff882919db tcp_read_sock 0xab ([kernel.kallsyms])
        ffffffffc03c5b03 xs_tcp_data_receive_workfn 0xb3 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])

                        
已发送的rpc task收到server端响应后最终会唤醒内核工作线程来执行该rpc task的工作任务
处理函数rpc_async_schedule,其主要任务就是在__rpc_execute循环调用task->tk_action
指向的函数,call_status最终会将task->tk_action设置为call_decode,
call_decode会将task->tk_action改为rpc_exit_task,完成已被响应的rpc task的处理,rpc_exit_task
会将task->tk_action设置为NULL使得__rpc_execute可以结束for循环。
最后__rpc_execute调用rpc_release_task->rpc_complete_task来唤醒
因访问nfs而触发该rpc task请求的用户进程,前面第1点分析介绍该用户进程阻塞休眠在
以task->tk_runstate和RPC_TASK_ACTIVE定义的等待队列上。进程被唤醒后会触发工作线程
调用rpc_async_schedule处理下一个已在Seqid_waitqueue等待队列的async rpc task。
                                       
 rpc_async_schedule->
          __rpc_execute->
               ->(task->tk_action):call_status
               ->(task->tk_action):call_decode
               ->(task->tk_action):rpc_exit_task
                                     ->(task->tk_ops->rpc_call_done):nfs4_open_done
               ->rpc_release_task
                        ->rpc_complete_task
kworker/u4:3 30791 [000] 1306842.243148: sunrpc:rpc_task_run_action: task:48955@1 flags=4881 state=0005 status=0 action=call_status
        ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])

kworker/u4:3 30791 [000] 1306842.243149: sunrpc:rpc_task_run_action: task:48955@1 flags=4881 state=0005 status=0 action=call_status
        ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])

kworker/u4:3 30791 [000] 1306842.243150: sunrpc:rpc_task_run_action: task:48955@1 flags=4881 state=0005 status=132 action=call_decode
        ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])

kworker/u4:3 30791 [000] 1306842.243152: sunrpc:rpc_task_run_action: task:48955@1 flags=4881 state=0005 status=0 action=rpc_exit_task
        ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
        ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
        ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
        ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
        ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])

三. nfs async rpc task异步执行流程总结

从前面的分析可以知道async rpc task都是通过rpc_async_schedule调用__rpc_execute

来完成async rpc task的处理,其中__rpc_execute需要重点关注的是rpc_task.tk_runstate的RPC_TASK_QUEUED

标志位和task->tk_action函数指针的变化。 __rpc_execute 会根据RPC_IS_QUEUED(task)做不同的处理判断,

task->tk_action则是在rpc task各个处理阶段会修改指向的函数。

1. 类型rpc_wait_queue的Seqid_waitqueue与xprt_pending 介绍

async rpc task在处理过程中会涉及到两个等待队列,一个是名为Seqid_waitqueue的等待队列,Seqid_waitqueue

等待队列存放的是等待发送的async rpc task,一个是名为xprt_pending的等待队列,xprt_pending存放的是已经发送但未收到nfs server端响应的rpc task(包含async和非async 的rpc task)。

当async rpc task在Seqid_waitqueue队列时,rpc_task.tk_waitqueue 指向Seqid_waitqueue等待队列地址。当async rpc task在xprt_pending队列时,rpc_task.tk_waitqueue 指向xprt_pending等待队列地址。rpc_wait_queue将每个rpc_task通过task->u.tk_wait.list串联起来,具体查看__rpc_add_wait_queue函数。

无论async rpc task是在Seqid_waitqueue等待队列还是xprt_pending等待队列, nfs_seqid_counter.wait 指向的都是

Seqid_waitqueue等待队列地址。rpc_task.tk_calldata为nfs4_opendata类型, 因此可通过rpc_task获取nfs_seqid_counter.wait地址 nfs4_opendata.o_arg.seqid.sequence.wait,具体查看nfs_release_seqid函数.

当async rpc task未发送出去时,rpc_task.tk_rqstp为NULL,当async rpc task已经在xprt_pending等待队列时,

rpc_task.tk_rqstp会被赋值,rpc_task.tk_rqstp.rq_xprt.pending 地址为xprt_pending等待队列地址

2. rpc_task.tk_runstate的RPC_TASK_QUEUED位处理(通过RPC_IS_QUEUED宏检测)

当调用__rpc_do_wake_up_task_on_wq将rpc task从等待队列移除后,在rpc_make_runnable函数中会

对rpc_task.tk_runstate的RPC_TASK_QUEUED位清0.

当调用__rpc_add_wait_queue将rpc task加入等待队列后,会对rpc_task.tk_runstate的RPC_TASK_QUEUED位设置为1.

3. rpc_task.tk_action变化过程:

task->tk_action根据处理过程进行修改,修改过程:

1) 当async rpc task被申请出来后,会将task->tk_action设置为rpc_prepare_task

2)此时task->tk_action还是rpc_prepare_task,内核kworker线程执行rpc task工作任务处理函数rpc_async_schedule将async rpc task加入Seqid_waitqueue待发送等待队列中,调用路径如下:

rpc_async_schedule->__rpc_execute->(task->tk_action):rpc_prepare_task->nfs4_open_prepare->...->__rpc_add_wait_queue

3)此时task->tk_action还是rpc_prepare_task, 内核kworker线程执行rpc task工作任务处理函数rpc_async_schedule将async rpc task发送出去并加入xprt_pending等待队列等待nfs server端响应, rpc task被发送出去后task->tk_action被改为call_status等待收到server端响应后执行。

相关调用路径:

代码语言:c复制
 rpc_async_schedule->
     __rpc_execute->
       ->(task->tk_action):rpc_prepare_task
              ->nfs4_open_prepare->
                    nfs4_setup_sequence->
                             rpc_call_start->
                                 ->(task->tk_action):call_start 
                                 ->(task->tk_action):call_reserve  
                                 ->(task->tk_action):call_reserveresult 
                                 ->(task->tk_action):call_refresh 
                                 ->(task->tk_action):call_refreshresult 
                                 ->(task->tk_action):call_allocate 
                                 ->(task->tk_action):call_bind 
                                 ->(task->tk_action):call_connect 
                                  ->(task->tk_action):call_transmit
                                       ->xprt_transmit//发送rpc task并记录一些stat和发送时间rq_xtime
                                             ->xprt->ops->send_request(task):xs_tcp_send_request//发送rpc_task   
                                        //将即将发送的rpc task放入xprt->pending队列       
                                       ->rpc_sleep_on(&xprt->pending, task, xprt_timer);
                                           ->__rpc_sleep_on_priority
                                           
  call_transmit调用xprt_transmit发送rpc task并执行__rpc_add_wait_queue调用
 rpc_set_queued(task)设置tk_runstate的RPC_TASK_QUEUED位为1,
 发送完rpc task后返回call_transmit调用call_transmit_status将task->tk_action改为call_status。                                          

4) 此时task->tk_action为call_status, 当已发送的rpc task收到nfs server端的响应后会被从xprt_pending移除,

最终内核kworker线程执行rpc task工作任务函数rpc_async_schedule完成rpc task的处理,rpc_exit_task会将rpc_task.tk_action设置为NULL

代码语言:c复制
已发送的rpc task收到server端响应后最终会唤醒内核工作线程来执行该rpc task的工作任务
处理函数rpc_async_schedule,其主要任务就是在__rpc_execute循环调用task->tk_action
指向的函数,call_status最终会将task->tk_action设置为call_decode,
call_decode会将task->tk_action改为rpc_exit_task,完成已被响应的rpc task的处理,rpc_exit_task
会将task->tk_action设置为NULL使得__rpc_execute可以结束for循环。
最后__rpc_execute调用rpc_release_task->rpc_complete_task来唤醒
因访问nfs而触发该rpc task请求的用户进程,前面第1点分析介绍该用户进程阻塞休眠在
以task->tk_runstate和RPC_TASK_ACTIVE定义的等待队列上。进程被唤醒后会触发工作线程
调用rpc_async_schedule处理下一个已在Seqid_waitqueue等待队列的async rpc task。
                                       
 rpc_async_schedule->
          __rpc_execute->
               ->(task->tk_action):call_status
               ->(task->tk_action):call_decode
               ->(task->tk_action):rpc_exit_task
                                     ->(task->tk_ops->rpc_call_done):nfs4_open_done
               ->rpc_release_task
                        ->rpc_complete_task

四. 几个关键指标介绍

1. task->tk_start 记录rpc task分配时间,req->rq_xtime记录rpc task被发送时间,req->rq_rtt

记录rpc task被发送到收到响应的耗时。

相关定义:

struct rpc_task *task;

struct rpc_rqst *req = task->tk_rqstp;

task->tk_start = ktime_get()

req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime);

2. rpc_wait_queue.qlen 记录等待队列中的rpc task数量。rpc_wait_queue

.name名为Seqid_waitqueue与xprt_pending两个rpc_wait_queue类型的等待队列

各自独立统计rpc_wait_queue.qlen

0 人点赞