This article uses the CentOS 7 3.10.0-957.21.3.el7.x86_64 kernel as an example to explain how the Linux kernel NFS v4.0 client handles async rpc tasks, i.e. rpc tasks flagged RPC_TASK_ASYNC.
I. Capturing the perf events for rpc calls
The key rpc-handling functions in the kernel invoke trace events, so the execution flow of an rpc task can be followed through perf events.
# perf list | grep sunrpc | grep task
sunrpc:rpc_task_begin [Tracepoint event]
sunrpc:rpc_task_complete [Tracepoint event]
sunrpc:rpc_task_run_action [Tracepoint event]
sunrpc:rpc_task_sleep [Tracepoint event]
sunrpc:rpc_task_wakeup [Tracepoint event]
DECLARE_EVENT_CLASS(rpc_task_queued,

    TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task, const struct rpc_wait_queue *q),

    TP_ARGS(clnt, task, q),

    TP_STRUCT__entry(
        __field(unsigned int, task_id)
        __field(unsigned int, client_id)
        __field(unsigned long, timeout)
        __field(unsigned long, runstate)
        __field(int, status)
        __field(unsigned short, flags)
        __string(q_name, rpc_qname(q))
        ),

    TP_fast_assign(
        __entry->client_id = clnt->cl_clid;
        __entry->task_id = task->tk_pid;
        __entry->timeout = task->tk_timeout;
        __entry->runstate = task->tk_runstate;
        __entry->status = task->tk_status;
        __entry->flags = task->tk_flags;
        __assign_str(q_name, rpc_qname(q));
        ),

    TP_printk("task:%u@%u flags=%4.4x state=%4.4lx status=%d timeout=%lu queue=%s",
        __entry->task_id, __entry->client_id,
        __entry->flags,
        __entry->runstate,
        __entry->status,
        __entry->timeout,
        __get_str(q_name)
        )
);
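For reference, the sunrpc:rpc_task_sleep and sunrpc:rpc_task_wakeup tracepoints used below are instances of this event class; in the kernel source they are declared roughly as follows (abridged), which is why their perf output format is identical:

/* include/trace/events/sunrpc.h (abridged): both events share the
 * rpc_task_queued class above. */
DEFINE_EVENT(rpc_task_queued, rpc_task_sleep,
    TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task,
             const struct rpc_wait_queue *q),
    TP_ARGS(clnt, task, q)
);

DEFINE_EVENT(rpc_task_queued, rpc_task_wakeup,
    TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task,
             const struct rpc_wait_queue *q),
    TP_ARGS(clnt, task, q)
);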
We can capture these events with perf to see how an async rpc task is processed.
Download the nfstest test tool and save the source under ~/nfstest:
https://github.com/imp/nfstest.git
Mount NFS v4.0:
# mount | grep nfs
10.0.2.48:/ on /data type nfs4 (rw,relatime,vers=4.0,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,clientaddr=10.0.2.32,local_lock=none,addr=10.0.2.48)
Start perf to capture the relevant rpc events:
perf record -e sunrpc:rpc_task_run_action -e sunrpc:rpc_task_complete -e sunrpc:rpc_task_wakeup -e sunrpc:rpc_task_begin -ag
Set the environment variable and run nfstest_io against the NFS mount point /data:
export PYTHONPATH=~/nfstest
cd /root/nfstest/test
./nfstest_io -d /data -v info -s 1234567 -n 40 -e --rdwronly -r 30 --fsizeavg=4k --fsizedev=0 --odgrade=0 --osync=0 --fsync=0 --write=100 --minfiles 100000 --createlog
II. Analyzing the execution flow of an async rpc task
After the run completes, stop perf and analyze the captured events with perf script. The walkthrough below follows the task whose task->tk_pid appears in the perf output as task:48955@1 to illustrate how an async rpc task is processed.
1. A user-space process opens a file on NFS, triggering an async rpc task:
python 19091 [001] 1306842.235106: sunrpc:rpc_task_begin: task:0@1 flags=4081 state=0000 status=0 action=(nil)
ffffffffc03cefc5 rpc_execute 0xa5 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03be786 rpc_run_task 0xf6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc047d1f7 nfs4_run_open_task 0x117 ([nfsv4])
ffffffffc0484ac1 _nfs4_open_and_get_state 0x71 ([nfsv4])
ffffffffc0485120 nfs4_do_open 0x1d0 ([nfsv4])
ffffffffc0485677 nfs4_atomic_open 0xf7 ([nfsv4])
ffffffffc043c2f7 nfs_atomic_open 0x197 ([nfs])
ffffffff87e50563 do_last 0xa53 ([kernel.kallsyms])
ffffffff87e52bb7 path_openat 0xd7 ([kernel.kallsyms])
ffffffff87e545bd do_filp_open 0x4d ([kernel.kallsyms])
ffffffff87e40717 do_sys_open 0x137 ([kernel.kallsyms])
ffffffff87e4083e sys_open 0x1e ([kernel.kallsyms])
ffffffff8837606b tracesys 0xa3 ([kernel.kallsyms])
7f8d50ec2efd [unknown] (/usr/lib64/libpthread-2.17.so)
13b9010 [unknown] ([unknown])
/* When a user-space process triggers an async rpc task, it enters the kernel
   through a system call and first calls rpc_run_task to allocate and
   initialize an rpc task. A work item whose handler is rpc_async_schedule is
   registered for the rpc task and added to the rpciod_workqueue workqueue;
   queue_work then wakes a kworker to run rpc_async_schedule for this task. */
static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
{
    ......
    ......
    task = rpc_run_task(&task_setup_data);
    if (IS_ERR(task))
        return PTR_ERR(task);
    status = rpc_wait_for_completion_task(task);
    if (status != 0) {
        data->cancelled = 1;
        smp_wmb();
    } else
        status = data->rpc_status;
    rpc_put_task(task);
    return status;
}
/**
* rpc_run_task - Allocate a new RPC task, then run rpc_execute against it
* @task_setup_data: pointer to task initialisation data
*/
struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
{
    struct rpc_task *task;

    //rpc_new_task allocates an rpc task and then calls rpc_init_task to initialize it
    task = rpc_new_task(task_setup_data);
    if (IS_ERR(task))
        goto out;

    rpc_task_set_client(task, task_setup_data->rpc_client);
    rpc_task_set_rpc_message(task, task_setup_data->rpc_message);

    if (task->tk_action == NULL)
        rpc_call_start(task);

    atomic_inc(&task->tk_count);
    rpc_execute(task);
out:
    return task;
}
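rpc_new_task itself is short. A sketch of its logic in this kernel generation (abridged from memory; details may differ slightly in this exact build):

static struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data)
{
    struct rpc_task *task = setup_data->task;
    unsigned short flags = 0;

    if (task == NULL) {
        task = rpc_alloc_task();    /* kmem_cache allocation */
        if (task == NULL) {
            rpc_release_calldata(setup_data->callback_ops,
                    setup_data->callback_data);
            return ERR_PTR(-ENOMEM);
        }
        flags = RPC_TASK_DYNAMIC;   /* task must be freed on release */
    }

    rpc_init_task(task, setup_data);
    task->tk_flags |= flags;
    return task;
}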
/*
* Creation and deletion of RPC task structures
*/
static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *task_setup_data)
{
    memset(task, 0, sizeof(*task));
    atomic_set(&task->tk_count, 1);
    task->tk_flags = task_setup_data->flags;
    task->tk_ops = task_setup_data->callback_ops;
    task->tk_calldata = task_setup_data->callback_data;
    INIT_LIST_HEAD(&task->tk_task);

    task->tk_priority = task_setup_data->priority - RPC_PRIORITY_LOW;
    task->tk_owner = current->tgid;//record the pid (tgid) of the process that triggered this rpc task

    //The wait queue an async rpc task sits on changes dynamically over its
    //lifetime: before the async rpc task has been sent by the client it is
    //kept on the rpc_wait_queue whose rpc_wait_queue.name is Seqid_waitqueue
    /* Initialize workqueue for async tasks */
    task->tk_workqueue = task_setup_data->workqueue;

    task->tk_xprt = rpc_task_get_xprt(task_setup_data->rpc_client,
            xprt_get(task_setup_data->rpc_xprt));

    task->tk_op_cred = get_rpccred(task_setup_data->rpc_op_cred);

    if (task->tk_ops->rpc_call_prepare != NULL)
        task->tk_action = rpc_prepare_task;//this branch is taken here

    //rpc_init_task_statistics executes task->tk_start = ktime_get(), storing
    //the current time in task->tk_start; it records when the rpc task was created
    rpc_init_task_statistics(task);

    dprintk("RPC: new task initialized, procpid %u\n",
            task_pid_nr(current));
}
void rpc_execute(struct rpc_task *task)
{
    bool is_async = RPC_IS_ASYNC(task);

    rpc_set_active(task);//set the RPC_TASK_ACTIVE bit of task->tk_runstate to 1
    //Register a work item for this rpc task with handler rpc_async_schedule,
    //add the work item to the rpciod_workqueue workqueue, and let queue_work
    //wake a kworker to run rpc_async_schedule for this rpc task.
    rpc_make_runnable(rpciod_workqueue, task);
    if (!is_async)
        __rpc_execute(task);//an async rpc task never reaches this
}
static void rpc_set_active(struct rpc_task *task)
{
    trace_rpc_task_begin(task->tk_client, task, NULL);

    rpc_task_set_debuginfo(task);
    set_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
}
/*Register a work item for the rpc task with handler rpc_async_schedule, add
  the work item to the rpciod_workqueue workqueue, and wake a kworker via
  queue_work to run rpc_async_schedule for this async rpc task*/
static void rpc_make_runnable(struct workqueue_struct *wq,
        struct rpc_task *task)
{
    bool need_wakeup = !rpc_test_and_set_running(task);

    rpc_clear_queued(task);
    if (!need_wakeup)
        return;
    if (RPC_IS_ASYNC(task)) {
        INIT_WORK(&task->u.tk_work, rpc_async_schedule);
        queue_work(wq, &task->u.tk_work);
    } else
        wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
}
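The INIT_WORK/queue_work pair above is the standard kernel workqueue API. As a minimal, self-contained sketch of the same pattern (demo_wq, demo_work and demo_fn are illustrative names, not part of sunrpc):

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;
static struct work_struct demo_work;

/* Runs in kworker context, just like rpc_async_schedule does. */
static void demo_fn(struct work_struct *work)
{
    pr_info("demo work executed by a kworker\n");
}

static int __init demo_init(void)
{
    demo_wq = alloc_workqueue("demo_wq", WQ_MEM_RECLAIM, 0);
    if (!demo_wq)
        return -ENOMEM;
    INIT_WORK(&demo_work, demo_fn);
    /* queue_work wakes a kworker to execute demo_fn, which is exactly
     * what rpc_make_runnable does for an async rpc task. */
    queue_work(demo_wq, &demo_work);
    return 0;
}

static void __exit demo_exit(void)
{
    flush_workqueue(demo_wq);
    destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");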
/*After the user process has called rpc_make_runnable, which queues the work
  item and wakes a kernel kworker to handle the async rpc task triggered by
  its NFS access, it returns to nfs4_run_open_task and calls
  rpc_wait_for_completion_task to sleep until the async rpc task it issued
  completes and it is woken up*/
static inline int rpc_wait_for_completion_task(struct rpc_task *task)
{
    return __rpc_wait_for_completion_task(task, NULL);
}
/*Each rpc task waits on a bit waitqueue keyed by the address of its own
  task->tk_runstate and the RPC_TASK_ACTIVE bit*/
int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *action)
{
    if (action == NULL)
        action = rpc_wait_bit_killable;
    return out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE,
            action, TASK_KILLABLE);
}
/*The user process sleeps here until the rpc task triggered by its NFS access
  has been handled by the nfs server and the reply has been received*/
int __sched out_of_line_wait_on_bit(void *word, int bit,
        wait_bit_action_f *action, unsigned mode)
{
    wait_queue_head_t *wq = bit_waitqueue(word, bit);
    DEFINE_WAIT_BIT(wait, word, bit);

    return __wait_on_bit(wq, &wait, action, mode);
}
2. The woken kworker runs the async rpc task's work item and places the task on the wait queue named Seqid_waitqueue, where it waits to be transmitted.
kworker/u4:3 30791 [001] 1306842.241001: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=rpc_prepare_task
ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
As noted in step 1, when a process accesses NFS it enters the kernel along the path nfs4_run_open_task->rpc_run_task->rpc_execute->rpc_make_runnable(rpciod_workqueue, task), where rpc_make_runnable registers a work item with handler rpc_async_schedule for the async rpc task, adds it to the rpciod_workqueue workqueue, and wakes a kernel thread to run rpc_async_schedule for the task.
The call path is shown below. task->tk_action here is the rpc_task.tk_action pointer, which was set to rpc_prepare_task during task initialization in step 1. The chain ends in __rpc_add_wait_queue, which puts the async task on the Seqid_waitqueue wait queue to await transmission; __rpc_add_wait_queue also calls rpc_set_queued(task) to set the RPC_TASK_QUEUED bit of rpc_task.tk_runstate.
rpc_async_schedule->
__rpc_execute->
(task->tk_action):rpc_prepare_task->
nfs4_open_prepare->
nfs_wait_on_sequence->
rpc_sleep_on->
__rpc_sleep_on_priority->
__rpc_add_wait_queue
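nfs_wait_on_sequence is what serializes these open-state operations on Seqid_waitqueue: only the task at the head of the sequence list proceeds, everyone else sleeps. A sketch of its logic in this kernel generation (abridged):

int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
{
    struct nfs_seqid_counter *sequence = seqid->sequence;
    int status = 0;

    spin_lock(&sequence->lock);
    seqid->task = task;
    if (list_empty(&seqid->list))
        list_add_tail(&seqid->list, &sequence->list);
    /* The task at the head of sequence->list may proceed ... */
    if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid)
        goto unlock;
    /* ... everyone else sleeps on the Seqid_waitqueue rpc_wait_queue. */
    rpc_sleep_on(&sequence->wait, task, NULL);
    status = -EAGAIN;
unlock:
    spin_unlock(&sequence->lock);
    return status;
}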
static void rpc_async_schedule(struct work_struct *work)
{
    __rpc_execute(container_of(work, struct rpc_task, u.tk_work));
}
In __rpc_execute the key things to watch are RPC_IS_QUEUED(task) and task->tk_action; both change as the rpc task is processed:
/*
 * This is the RPC `scheduler' (or rather, the finite state machine).
 */
static void __rpc_execute(struct rpc_task *task)
{
    ......
    ......
    //rpc_make_runnable executed rpc_clear_queued(task) in step 1, so
    //RPC_IS_QUEUED(task) is false here and execution continues
    if (RPC_IS_QUEUED(task))
        return;

    for (;;) {
        void (*do_action)(struct rpc_task *);

        /*
         * Execute any pending callback first.
         */
        do_action = task->tk_callback;//task->tk_callback is NULL at this point
        task->tk_callback = NULL;
        if (do_action == NULL) {
            /*
             * Perform the next FSM step.
             * tk_action may be NULL if the task has been killed.
             * In particular, note that rpc_killall_tasks may
             * do this at any time, so beware when dereferencing.
             */
            do_action = task->tk_action;//rpc_prepare_task
            if (do_action == NULL)
                break;
        }
        trace_rpc_task_run_action(task->tk_client, task, task->tk_action);
        do_action(task);//rpc_prepare_task

        /*
         * Lockless check for whether task is sleeping or not.
         */
        //rpc_prepare_task eventually reaches __rpc_add_wait_queue, which sets
        //the RPC_TASK_QUEUED bit of rpc_task.tk_runstate, so RPC_IS_QUEUED(task)
        //now returns 1, the if condition is false, continue is skipped, and
        //execution falls through
        if (!RPC_IS_QUEUED(task))
            continue;
        .......
        .......
        if (task_is_async)//for an async rpc task this return exits __rpc_execute
            return;
        .......
        .......
    }

    dprintk("RPC: %5u return %d, status %d\n", task->tk_pid, status,
            task->tk_status);
    /* Release all resources associated with the task */
    rpc_release_task(task);
}
3. When the previously sent rpc task receives its reply from the server, the user-space process that issued that rpc task is eventually woken up
When an rpc task receives the server's reply, it is removed from the xprt_pending wait queue and a kernel thread is woken to run the task's work-item handler rpc_async_schedule:
kworker/1:1H 271 [001] 1306842.242928: sunrpc:rpc_task_wakeup: task:48954@1 flags=4881 state=000e status=0 timeout=60000 queue=xprt_pending
ffffffffc03ccee6 __rpc_do_wake_up_task_on_wq 0xf6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd570 rpc_wake_up_task_queue_locked 0x30 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd5a2 rpc_wake_up_queued_task 0x22 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03c2ba7 xprt_complete_rqst 0x137 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03c9240 xs_tcp_data_recv 0x740 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff882919db tcp_read_sock 0xab ([kernel.kallsyms])
ffffffffc03c5b03 xs_tcp_data_receive_workfn 0xb3 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
/*__rpc_do_wake_up_task_on_wq calls __rpc_remove_wait_queue to remove the rpc
  task whose reply has arrived from the xprt_pending wait queue;
  rpc_make_runnable then calls rpc_clear_queued(task) to clear the
  RPC_TASK_QUEUED bit of rpc_task.tk_runstate, registers a work item for the
  task on the rpciod_workqueue workqueue, and calls queue_work to wake a
  kernel thread to run the task's work-item handler rpc_async_schedule.*/
xprt_complete_rqst->
rpc_wake_up_queued_task->
rpc_wake_up_task_queue_locked->
rpc_wake_up_task_on_wq_queue_locked->
__rpc_do_wake_up_task_on_wq->
rpc_make_runnable
void xprt_complete_rqst(struct rpc_task *task, int copied)
{
    struct rpc_rqst *req = task->tk_rqstp;
    struct rpc_xprt *xprt = req->rq_xprt;

    dprintk("RPC: %5u xid %08x complete (%d bytes received)\n",
            task->tk_pid, ntohl(req->rq_xid), copied);
    trace_xprt_complete_rqst(xprt, req->rq_xid, copied);

    xprt->stat.recvs++;
    /*rq_rtt records the time from the rpc task being sent to the nfs server's
      reply being received; rq_xtime is recorded in xprt_transmit when the rpc
      task is sent*/
    req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime);
    if (xprt->ops->timer != NULL)
        xprt_update_rtt(task);

    list_del_init(&req->rq_list);
    req->rq_private_buf.len = copied;
    /* Ensure all writes are done before we update */
    /* req->rq_reply_bytes_recvd */
    smp_wmb();
    req->rq_reply_bytes_recvd = copied;
    /*Wake the task sleeping on the rpc_wait_queue named xprt_pending. Once an
      rpc task has been sent by the client it joins this wait queue; when the
      server's reply for the task arrives, this path ends up in
      __rpc_do_wake_up_task_on_wq, which removes the rpc task from the
      xprt->pending wait queue, then rpc_make_runnable clears the
      RPC_TASK_QUEUED bit of rpc_task.tk_runstate. rpc_make_runnable also
      registers a new work item for the rpc task, with handler
      rpc_async_schedule, and wakes a kernel thread to execute it.*/
    /*For a sent rpc task, rpc_task.tk_waitqueue and &xprt->pending refer to
      the same wait queue*/
    rpc_wake_up_queued_task(&xprt->pending, task);
}
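xprt_update_rtt, called above, feeds the measured rq_rtt into the client's RTT estimator (used for adaptive timeouts, mainly for RPC over UDP). A sketch of the 3.10-era function (abridged from memory):

void xprt_update_rtt(struct rpc_task *task)
{
    struct rpc_rqst *req = task->tk_rqstp;
    struct rpc_rtt *rtt = task->tk_client->cl_rtt;
    unsigned int timer = task->tk_msg.rpc_proc->p_timer;
    long m = usecs_to_jiffies(ktime_to_us(req->rq_rtt));

    if (timer) {
        /* Only feed the estimator when the request was not retransmitted. */
        if (req->rq_ntrans == 1)
            rpc_update_rtt(rtt, timer, m);
        rpc_set_timeo(rtt, timer, req->rq_ntrans - 1);
    }
}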
/*
 * Wake up a queued task while the queue lock is being held
 */
static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
{
    rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task);
}

/*
 * Wake up a task on a specific queue
 */
void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task)
{
    spin_lock_bh(&queue->lock);
    rpc_wake_up_task_queue_locked(queue, task);
    spin_unlock_bh(&queue->lock);
}
static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
        struct rpc_wait_queue *queue,
        struct rpc_task *task)
{
    dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n",
            task->tk_pid, jiffies);

    /* Has the task been executed yet? If not, we cannot wake it up! */
    if (!RPC_IS_ACTIVATED(task)) {
        printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task);
        return;
    }

    trace_rpc_task_wakeup(task->tk_client, task, queue);

    __rpc_remove_wait_queue(queue, task);

    rpc_make_runnable(wq, task);

    dprintk("RPC: __rpc_wake_up_task done\n");
}
static void rpc_make_runnable(struct workqueue_struct *wq,
        struct rpc_task *task)
{
    bool need_wakeup = !rpc_test_and_set_running(task);

    rpc_clear_queued(task);
    if (!need_wakeup)
        return;
    if (RPC_IS_ASYNC(task)) {
        INIT_WORK(&task->u.tk_work, rpc_async_schedule);
        queue_work(wq, &task->u.tk_work);
    } else
        wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
}
After a sent rpc task receives the server's reply, a kernel worker thread is eventually woken to run the task's work-item handler rpc_async_schedule. Its main job is the loop in __rpc_execute that keeps calling the function pointed to by task->tk_action: call_status eventually sets task->tk_action to call_decode, call_decode changes it to rpc_exit_task, which completes the handling of the replied rpc task, and rpc_exit_task sets task->tk_action to NULL so that __rpc_execute can leave its for loop.
Finally, __rpc_execute calls rpc_release_task->rpc_complete_task to wake the user process whose NFS access triggered this rpc task; as analyzed in step 1, that process is sleeping on the bit waitqueue defined by task->tk_runstate and RPC_TASK_ACTIVE. Once woken, the process in turn causes a worker thread to run rpc_async_schedule for the next async rpc task already waiting on the Seqid_waitqueue wait queue.
rpc_async_schedule->
__rpc_execute->
->(task->tk_action):call_status
->(task->tk_action):call_decode
->(task->tk_action):rpc_exit_task
->(task->tk_ops->rpc_call_done):nfs4_open_done
->rpc_release_task
->rpc_complete_task
static void __rpc_execute(struct rpc_task *task)
{
    struct rpc_wait_queue *queue;
    int task_is_async = RPC_IS_ASYNC(task);
    int status = 0;

    dprintk("RPC: %5u __rpc_execute flags=0x%x\n",
            task->tk_pid, task->tk_flags);

    WARN_ON_ONCE(RPC_IS_QUEUED(task));
    /*After the sent rpc task received its reply, the xprt_complete_rqst path
      ended in rpc_make_runnable, which cleared the RPC_TASK_QUEUED bit of
      rpc_task.tk_runstate, so RPC_IS_QUEUED is false here and execution
      continues*/
    if (RPC_IS_QUEUED(task))
        return;

    for (;;) {
        void (*do_action)(struct rpc_task *);

        /*
         * Execute any pending callback first.
         */
        do_action = task->tk_callback;
        task->tk_callback = NULL;
        if (do_action == NULL) {
            /*
             * Perform the next FSM step.
             * tk_action may be NULL if the task has been killed.
             * In particular, note that rpc_killall_tasks may
             * do this at any time, so beware when dereferencing.
             */
            do_action = task->tk_action;
            //once rpc_exit_task has set task->tk_action to NULL this
            //condition holds and we break out of the for loop
            if (do_action == NULL)
                break;
        }
        trace_rpc_task_run_action(task->tk_client, task, task->tk_action);
        /*A sent rpc task whose reply has arrived executes
          call_status->call_decode->rpc_exit_task in turn, until rpc_exit_task
          sets task->tk_action to NULL*/
        do_action(task);

        /*
         * Lockless check for whether task is sleeping or not.
         */
        if (!RPC_IS_QUEUED(task))//true for an rpc task whose reply has arrived, so continue
            continue;
        .......
        .......
    }

    dprintk("RPC: %5u return %d, status %d\n", task->tk_pid, status,
            task->tk_status);
    /* Release all resources associated with the task */
    rpc_release_task(task);//calls rpc_complete_task->__wake_up_locked_key to wake the user process
}
rpc_exit_task sets task->tk_action to NULL, which breaks the for loop in __rpc_execute; rpc_release_task then runs before __rpc_execute returns.
void rpc_exit_task(struct rpc_task *task)
{
    task->tk_action = NULL;
    if (task->tk_ops->rpc_call_done != NULL) {
        task->tk_ops->rpc_call_done(task, task->tk_calldata);//nfs4_open_done
        if (task->tk_action != NULL) {
            WARN_ON(RPC_ASSASSINATED(task));
            /* Always release the RPC slot and buffer memory */
            xprt_release(task);
            rpc_reset_task_statistics(task);
        }
    }
}
/*
 * Mark an RPC call as having completed by clearing the 'active' bit
 * and then waking up all tasks that were sleeping.
 */
static int rpc_complete_task(struct rpc_task *task)
{
    void *m = &task->tk_runstate;
    wait_queue_head_t *wq = bit_waitqueue(m, RPC_TASK_ACTIVE);
    struct wait_bit_key k = __WAIT_BIT_KEY_INITIALIZER(m, RPC_TASK_ACTIVE);
    unsigned long flags;
    int ret;

    trace_rpc_task_complete(task->tk_client, task, NULL);

    spin_lock_irqsave(&wq->lock, flags);
    clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
    ret = atomic_dec_and_test(&task->tk_count);
    if (waitqueue_active(wq))
        __wake_up_locked_key(wq, TASK_NORMAL, &k);
    spin_unlock_irqrestore(&wq->lock, flags);
    return ret;
}
4. Step 1 showed that after generating an async rpc task, the user process accessing NFS sleeps on the bit waitqueue defined by task->tk_runstate and RPC_TASK_ACTIVE, and a kernel thread is woken; step 2 showed that the kernel thread calls rpc_prepare_task to put the async task on the rpc_task.tk_waitqueue wait queue named Seqid_waitqueue. So when does this rpc task actually get transmitted and moved to the in-flight queue &xprt->pending (named xprt_pending) to wait for the nfs server's reply?
Step 3 showed that when a previously sent rpc task receives its reply, the user process sleeping on the task->tk_runstate/RPC_TASK_ACTIVE waitqueue is woken; once awake, it handles the next async rpc task on the Seqid_waitqueue rpc_task.tk_waitqueue wait queue. For example, once the rpc task task:48954@1 from step 3 completes and its user process is finally woken, the woken process removes the next pending async rpc task from the Seqid_waitqueue queue and wakes a kernel thread to run the work-item callback rpc_async_schedule for it.
python 19108 [001] 1306842.242959: sunrpc:rpc_task_wakeup: task:48955@1 flags=4081 state=0006 status=0 timeout=0 queue=Seqid_waitqueue
ffffffffc03ccee6 __rpc_do_wake_up_task_on_wq 0xf6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd570 rpc_wake_up_task_queue_locked 0x30 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd5a2 rpc_wake_up_queued_task 0x22 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc04988bb nfs_release_seqid 0x5b ([nfsv4])
ffffffffc04815c2 nfs4_opendata_to_nfs4_state 0x1f2 ([nfsv4])
ffffffffc0484b24 _nfs4_open_and_get_state 0xd4 ([nfsv4])
ffffffffc0485120 nfs4_do_open 0x1d0 ([nfsv4])
ffffffffc0485677 nfs4_atomic_open 0xf7 ([nfsv4])
ffffffffc043c2f7 nfs_atomic_open 0x197 ([nfs])
ffffffff87e50563 do_last 0xa53 ([kernel.kallsyms])
ffffffff87e52bb7 path_openat 0xd7 ([kernel.kallsyms])
ffffffff87e545bd do_filp_open 0x4d ([kernel.kallsyms])
ffffffff87e40717 do_sys_open 0x137 ([kernel.kallsyms])
ffffffff87e4083e sys_open 0x1e ([kernel.kallsyms])
ffffffff8837606b tracesys 0xa3 ([kernel.kallsyms])
7f8d50ec2efd [unknown] (/usr/lib64/libpthread-2.17.so)
13b9010 [unknown] ([unknown])
After task:48954@1 completed, its user process was finally woken. The woken process follows the call path below to remove the next async rpc task to handle (here task:48955@1) from the Seqid_waitqueue wait queue; rpc_make_runnable then registers a work item with handler rpc_async_schedule for the task about to be sent, adds it to the rpciod_workqueue workqueue, and wakes a kernel thread to call rpc_async_schedule.
The call stack path is as follows:
_nfs4_open_and_get_state->
_nfs4_opendata_to_nfs4_state->
nfs_release_seqid->
rpc_wake_up_queued_task->
rpc_wake_up_task_queue_locked->
rpc_wake_up_task_on_wq_queue_locked->
__rpc_do_wake_up_task_on_wq->
__rpc_remove_wait_queue
rpc_make_runnable
In nfs_release_seqid, the first argument passed to rpc_wake_up_queued_task is &sequence->wait, used as the rpc_wait_queue. For an async rpc task that has not yet been sent, rpc_task.tk_waitqueue and &sequence->wait point to the same queue, the one named Seqid_waitqueue.
Once the async rpc task has been sent, rpc_task.tk_waitqueue is changed to the rpc_wait_queue named xprt_pending, while &sequence->wait keeps pointing at the Seqid_waitqueue rpc_wait_queue.
void nfs_release_seqid(struct nfs_seqid *seqid)
{
    struct nfs_seqid_counter *sequence;

    if (seqid == NULL || list_empty(&seqid->list))
        return;
    sequence = seqid->sequence;
    spin_lock(&sequence->lock);
    list_del_init(&seqid->list);
    if (!list_empty(&sequence->list)) {
        struct nfs_seqid *next;

        next = list_first_entry(&sequence->list,
                struct nfs_seqid, list);
        rpc_wake_up_queued_task(&sequence->wait, next->task);
    }
    spin_unlock(&sequence->lock);
}
5. Step 4 showed that when the previous rpc task is answered, the process associated with it is woken, a work item is registered for the next async rpc task on the Seqid_waitqueue pending queue, and a kernel thread is woken to handle the task about to be sent. In this step, the kernel thread calls rpc_async_schedule to actually transmit the async rpc task from the client.
rpc_async_schedule transmits the rpc task and puts the successfully sent task on the rpc_wait_queue named xprt_pending, i.e. &xprt->pending; in __rpc_add_wait_queue, rpc_task.tk_waitqueue is changed to point at &xprt->pending.
call_transmit calls xprt_transmit to send the rpc task; after returning to call_transmit, call_transmit_status sets task->tk_action to call_status, which runs in step 6 below once the rpc task receives the server's reply.
The woken kernel thread follows this call path:
rpc_async_schedule->
__rpc_execute->
->(task->tk_action):rpc_prepare_task
->nfs4_open_prepare->
nfs4_setup_sequence->
rpc_call_start->
->(task->tk_action):call_start
->(task->tk_action):call_reserve
->(task->tk_action):call_reserveresult
->(task->tk_action):call_refresh
->(task->tk_action):call_refreshresult
->(task->tk_action):call_allocate
->(task->tk_action):call_bind
->(task->tk_action):call_connect
->(task->tk_action):call_transmit//this changes task->tk_action to call_status
->xprt_transmit//transmits the rpc task and records stats and the send time rq_xtime
->xprt->ops->send_request(task):xs_tcp_send_request//sends the rpc_task
//put the rpc task being sent on the xprt->pending queue
->rpc_sleep_on(&xprt->pending, task, xprt_timer);
->__rpc_sleep_on_priority
call_transmit calls xprt_transmit to send the rpc task; on the way, __rpc_add_wait_queue calls rpc_set_queued(task) to set the RPC_TASK_QUEUED bit of tk_runstate to 1. After the rpc task has been sent, control returns to call_transmit, which calls call_transmit_status to change task->tk_action to call_status.
__rpc_sleep_on_priority puts the rpc task on the rpc_wait_queue named xprt_pending and sets task->tk_callback to xprt_timer.
__rpc_add_timer arms a timeout timer of rpc_task.tk_timeout (60000 jiffies here) for the sent rpc task and adds it to the wait queue's timer list rpc_wait_queue.timer_list; the timer's callback is __rpc_queue_timer_fn.
static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
        struct rpc_task *task,
        rpc_action action,
        unsigned char queue_priority)
{
    dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n",
            task->tk_pid, rpc_qname(q), jiffies);

    trace_rpc_task_sleep(task->tk_client, task, q);

    /*__rpc_add_wait_queue calls rpc_set_queued(task) to set the
      RPC_TASK_QUEUED bit of tk_runstate to 1*/
    __rpc_add_wait_queue(q, task, queue_priority);

    WARN_ON_ONCE(task->tk_callback != NULL);
    task->tk_callback = action;//points to xprt_timer
    __rpc_add_timer(q, task);
}
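For reference, a sketch of __rpc_add_timer in this kernel generation (abridged from memory): it only arms the queue timer when tk_timeout is non-zero, and keeps the queue timer pointed at the earliest expiry on rpc_wait_queue.timer_list.

static void __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
{
    if (!task->tk_timeout)
        return;

    dprintk("RPC: %5u setting alarm for %lu ms\n",
            task->tk_pid, task->tk_timeout * 1000 / HZ);

    task->u.tk_wait.expires = jiffies + task->tk_timeout;
    /* Re-arm the queue timer if this task expires earlier than the rest. */
    if (list_empty(&queue->timer_list.list) ||
        time_before(task->u.tk_wait.expires, queue->timer_list.expires))
        rpc_set_queue_timer(queue, task->u.tk_wait.expires);
    list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list);
}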
If the rpc task goes a long time without a reply from the nfs server, the timer callback __rpc_queue_timer_fn runs and calls rpc_wake_up_task_queue_locked, which triggers retransmission of the rpc task.
static void __rpc_queue_timer_fn(unsigned long ptr)
{
    struct rpc_wait_queue *queue = (struct rpc_wait_queue *)ptr;
    struct rpc_task *task, *n;
    unsigned long expires, now, timeo;

    spin_lock(&queue->lock);
    expires = now = jiffies;
    list_for_each_entry_safe(task, n, &queue->timer_list.list, u.tk_wait.timer_list) {
        timeo = task->u.tk_wait.expires;
        if (time_after_eq(now, timeo)) {
            dprintk("RPC: %5u timeout\n", task->tk_pid);
            task->tk_status = -ETIMEDOUT;
            rpc_wake_up_task_queue_locked(queue, task);
            continue;
        }
        if (expires == now || time_after(expires, timeo))
            expires = timeo;
    }
    if (!list_empty(&queue->timer_list.list))
        rpc_set_queue_timer(queue, expires);
    spin_unlock(&queue->lock);
}
static void __rpc_execute(struct rpc_task *task)
{
    WARN_ON_ONCE(RPC_IS_QUEUED(task));
    /*Between the async rpc task being removed from the Seqid_waitqueue wait
      queue and it being sent and added to the xprt_pending wait queue,
      RPC_IS_QUEUED is false, so we do not return here and keep going*/
    if (RPC_IS_QUEUED(task))
        return;

    for (;;) {
        ......
        ......
        do_action(task);

        /*
         * Lockless check for whether task is sleeping or not.
         */
        /*When do_action points to call_transmit: while calling xprt_transmit,
          call_transmit goes through __rpc_add_wait_queue, which calls
          rpc_set_queued(task) to set the RPC_TASK_QUEUED bit of tk_runstate,
          so after call_transmit returns, !RPC_IS_QUEUED(task) is false and
          continue is not executed*/
        if (!RPC_IS_QUEUED(task))
            continue;
        .......
        .......
        if (task_is_async)//reaching here, the async task exits __rpc_execute
            return;
        ........
    }
}
//call_transmit finally calls call_transmit_status, which changes task->tk_action to call_status
static void
call_transmit_status(struct rpc_task *task)
{
    task->tk_action = call_status;
    ......
    ......
}
Filtering the captured perf trace events for the rpc task with rpc_task.tk_pid task:48955@1 shows the task's progress; the action field is the name of the function task->tk_action points to, and you can see task->tk_action being repointed at every stage.
rpc_prepare_task->nfs40_call_sync_prepare->rpc_call_start:
rpc_call_start changes task->tk_action to call_start; call_start changes it to call_reserve; call_reserve to call_reserveresult; call_reserveresult to call_refresh; call_refresh to call_refreshresult; call_refreshresult to call_allocate; call_allocate to call_bind; call_bind to call_connect; call_connect to call_transmit; and call_transmit to call_status.
kworker/u4:3 30791 [000] 1306842.242968: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=rpc_prepare_task
ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
kworker/u4:3 30791 [000] 1306842.242969: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_start
ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
kworker/u4:3 30791 [000] 1306842.242970: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_reserve
ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
kworker/u4:3 30791 [000] 1306842.242971: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_reserveresult
ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
kworker/u4:3 30791 [000] 1306842.242972: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_refresh
ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
kworker/u4:3 30791 [000] 1306842.242973: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_refreshresult
ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
kworker/u4:3 30791 [000] 1306842.242974: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_allocate
ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
kworker/u4:3 30791 [000] 1306842.242975: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_bind
ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
kworker/u4:3 30791 [000] 1306842.242976: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_connect
ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
kworker/u4:3 30791 [000] 1306842.242977: sunrpc:rpc_task_run_action: task:48955@1 flags=4081 state=0005 status=0 action=call_transmit
ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
6. After the sent rpc task receives the server's reply, a kernel worker thread is eventually woken to run the task's work-item handler rpc_async_schedule; its main job is the loop in __rpc_execute that calls the functions pointed to by task->tk_action to finish handling the replied rpc task, and it finally calls rpc_complete_task to wake the user process whose NFS access triggered the task. As described earlier, that process has been sleeping, since allocating the rpc task, on the bit waitqueue defined by task->tk_runstate and RPC_TASK_ACTIVE; see the detailed analysis of task:48954@1 in step 3.
When the rpc task receives the server's reply, it is removed from the xprt_pending wait queue and a kernel thread is woken to run the task's work-item handler rpc_async_schedule:
kworker/1:1H 271 [001] 1306842.243140: sunrpc:rpc_task_wakeup: task:48955@1 flags=4881 state=000e status=0 timeout=60000 queue=xprt_pending
ffffffffc03ccee6 __rpc_do_wake_up_task_on_wq 0xf6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd570 rpc_wake_up_task_queue_locked 0x30 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd5a2 rpc_wake_up_queued_task 0x22 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03c2ba7 xprt_complete_rqst 0x137 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03c9240 xs_tcp_data_recv 0x740 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff882919db tcp_read_sock 0xab ([kernel.kallsyms])
ffffffffc03c5b03 xs_tcp_data_receive_workfn 0xb3 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
As in step 3, the woken worker thread runs rpc_async_schedule, whose loop in __rpc_execute calls the functions pointed to by task->tk_action: call_status sets task->tk_action to call_decode, call_decode changes it to rpc_exit_task, and rpc_exit_task (which also invokes nfs4_open_done) sets task->tk_action to NULL so the for loop ends. __rpc_execute then calls rpc_release_task->rpc_complete_task to wake the user process sleeping on the task->tk_runstate/RPC_TASK_ACTIVE waitqueue; once woken, that process triggers a worker thread to run rpc_async_schedule for the next async rpc task waiting on Seqid_waitqueue.
rpc_async_schedule->
__rpc_execute->
->(task->tk_action):call_status
->(task->tk_action):call_decode
->(task->tk_action):rpc_exit_task
->(task->tk_ops->rpc_call_done):nfs4_open_done
->rpc_release_task
->rpc_complete_task
kworker/u4:3 30791 [000] 1306842.243148: sunrpc:rpc_task_run_action: task:48955@1 flags=4881 state=0005 status=0 action=call_status
ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
kworker/u4:3 30791 [000] 1306842.243149: sunrpc:rpc_task_run_action: task:48955@1 flags=4881 state=0005 status=0 action=call_status
ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
kworker/u4:3 30791 [000] 1306842.243150: sunrpc:rpc_task_run_action: task:48955@1 flags=4881 state=0005 status=132 action=call_decode
ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
kworker/u4:3 30791 [000] 1306842.243152: sunrpc:rpc_task_run_action: task:48955@1 flags=4881 state=0005 status=0 action=rpc_exit_task
ffffffffc03cd696 __rpc_execute 0xe6 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffffc03cd9e2 rpc_async_schedule 0x12 (/lib/modules/3.10.0-957.21.3.el7.x86_64/kernel/net/sunrpc/sunrpc.ko.xz)
ffffffff87cb9ebf process_one_work 0x17f ([kernel.kallsyms])
ffffffff87cbaf56 worker_thread 0x126 ([kernel.kallsyms])
ffffffff87cc1da1 kthread 0xd1 ([kernel.kallsyms])
ffffffff88375c1d ret_from_fork_nospec_begin 0x7 ([kernel.kallsyms])
III. Summary of the nfs async rpc task execution flow
As the analysis above shows, every async rpc task is processed by rpc_async_schedule calling __rpc_execute. Within __rpc_execute the things to watch are the RPC_TASK_QUEUED bit of rpc_task.tk_runstate and the task->tk_action function pointer: __rpc_execute branches on RPC_IS_QUEUED(task), and task->tk_action is repointed at each stage of the rpc task's processing.
1. The Seqid_waitqueue and xprt_pending rpc_wait_queue queues
Processing an async rpc task involves two wait queues. The one named Seqid_waitqueue holds async rpc tasks waiting to be sent; the one named xprt_pending holds rpc tasks that have been sent but have not yet received the nfs server's reply (both async and non-async rpc tasks).
While an async rpc task is on the Seqid_waitqueue queue, rpc_task.tk_waitqueue points at the Seqid_waitqueue wait queue; while it is on the xprt_pending queue, rpc_task.tk_waitqueue points at the xprt_pending wait queue. An rpc_wait_queue links its rpc_tasks together through task->u.tk_wait.list; see __rpc_add_wait_queue, sketched below.
Whichever queue the async rpc task is on, nfs_seqid_counter.wait always points at the Seqid_waitqueue wait queue. rpc_task.tk_calldata is of type nfs4_opendata, so the address of nfs_seqid_counter.wait can be reached from the rpc_task as nfs4_opendata.o_arg.seqid.sequence.wait; see nfs_release_seqid.
While an async rpc task has not been sent, rpc_task.tk_rqstp is NULL; once the task is on the xprt_pending wait queue, rpc_task.tk_rqstp is set, and rpc_task.tk_rqstp.rq_xprt.pending is the address of the xprt_pending wait queue.
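A sketch of __rpc_add_wait_queue from this kernel generation (abridged; the priority-queue variant is omitted) shows tk_waitqueue being repointed and qlen being incremented as a task is queued:

static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
        struct rpc_task *task,
        unsigned char queue_priority)
{
    WARN_ON_ONCE(RPC_IS_QUEUED(task));
    rpc_set_queued(task);           /* set RPC_TASK_QUEUED in tk_runstate */

    if (RPC_IS_PRIORITY(queue))
        __rpc_add_wait_queue_priority(queue, task, queue_priority);
    else if (RPC_IS_SWAPPER(task))
        list_add(&task->u.tk_wait.list, &queue->tasks[0]);
    else
        list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]);
    task->tk_waitqueue = queue;     /* tk_waitqueue now points at this queue */
    queue->qlen++;                  /* per-queue task count, see section IV */

    dprintk("RPC: %5u added to queue %p \"%s\"\n",
            task->tk_pid, queue, rpc_qname(queue));
}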
2. Handling of the RPC_TASK_QUEUED bit in rpc_task.tk_runstate
After __rpc_do_wake_up_task_on_wq removes an rpc task from a wait queue, rpc_make_runnable clears the RPC_TASK_QUEUED bit of rpc_task.tk_runstate.
When __rpc_add_wait_queue puts an rpc task on a wait queue, the RPC_TASK_QUEUED bit of rpc_task.tk_runstate is set to 1. The bit helpers are sketched below.
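These helpers live in include/linux/sunrpc/sched.h; sketched from the 3.10-era header (abridged, memory-barrier details may differ across kernel updates):

#define RPC_IS_QUEUED(t)    test_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)
#define rpc_set_queued(t)   set_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)
#define rpc_clear_queued(t) \
    do { \
        smp_mb__before_clear_bit(); \
        clear_bit(RPC_TASK_QUEUED, &(t)->tk_runstate); \
        smp_mb__after_clear_bit(); \
    } while (0)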
3. How rpc_task.tk_action changes:
task->tk_action is repointed as processing proceeds:
1) When the async rpc task is allocated, task->tk_action is set to rpc_prepare_task.
2) With task->tk_action still rpc_prepare_task, a kernel kworker runs the task's work-item handler rpc_async_schedule, which puts the async rpc task on the Seqid_waitqueue pending queue, along this path:
rpc_async_schedule->__rpc_execute->(task->tk_action):rpc_prepare_task->nfs4_open_prepare->...->__rpc_add_wait_queue
3) With task->tk_action still rpc_prepare_task, a kernel kworker runs rpc_async_schedule again, transmits the async rpc task, and puts it on the xprt_pending wait queue to await the nfs server's reply; after the task is sent, task->tk_action is changed to call_status, to run once the server's reply arrives.
The relevant call path:
rpc_async_schedule->
__rpc_execute->
->(task->tk_action):rpc_prepare_task
->nfs4_open_prepare->
nfs4_setup_sequence->
rpc_call_start->
->(task->tk_action):call_start
->(task->tk_action):call_reserve
->(task->tk_action):call_reserveresult
->(task->tk_action):call_refresh
->(task->tk_action):call_refreshresult
->(task->tk_action):call_allocate
->(task->tk_action):call_bind
->(task->tk_action):call_connect
->(task->tk_action):call_transmit
->xprt_transmit//transmits the rpc task and records stats and the send time rq_xtime
->xprt->ops->send_request(task):xs_tcp_send_request//sends the rpc_task
//put the rpc task being sent on the xprt->pending queue
->rpc_sleep_on(&xprt->pending, task, xprt_timer);
->__rpc_sleep_on_priority
call_transmit calls xprt_transmit to send the rpc task; on the way, __rpc_add_wait_queue calls rpc_set_queued(task) to set the RPC_TASK_QUEUED bit of tk_runstate to 1. After the rpc task is sent, control returns to call_transmit, and call_transmit_status changes task->tk_action to call_status.
4) task->tk_action is now call_status. When the sent rpc task receives the nfs server's reply it is removed from xprt_pending, and a kernel kworker finally runs the task's work-item handler rpc_async_schedule to finish processing it; rpc_exit_task sets rpc_task.tk_action to NULL.
As analyzed in steps 3 and 6, the woken worker thread's rpc_async_schedule loops in __rpc_execute through call_status->call_decode->rpc_exit_task (which also calls nfs4_open_done and finally sets task->tk_action to NULL), then rpc_release_task->rpc_complete_task wakes the user process sleeping on the task->tk_runstate/RPC_TASK_ACTIVE waitqueue; the woken process then triggers a worker thread to run rpc_async_schedule for the next async rpc task on Seqid_waitqueue.
rpc_async_schedule->
__rpc_execute->
->(task->tk_action):call_status
->(task->tk_action):call_decode
->(task->tk_action):rpc_exit_task
->(task->tk_ops->rpc_call_done):nfs4_open_done
->rpc_release_task
->rpc_complete_task
IV. Some key metrics
1. task->tk_start records when the rpc task was allocated; req->rq_xtime records when the rpc task was transmitted; req->rq_rtt records the elapsed time from transmission to reply.
Relevant definitions:
struct rpc_task *task;
struct rpc_rqst *req = task->tk_rqstp;
task->tk_start = ktime_get();
req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime);
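Putting these fields together, a hypothetical debugging helper (rpc_task_latency is not a kernel function, just an illustration of how these fields combine):

/* Hypothetical helper: print how long a task has existed since allocation
 * and its last send-to-reply round-trip time, both in microseconds. */
static void rpc_task_latency(const struct rpc_task *task)
{
    const struct rpc_rqst *req = task->tk_rqstp;
    s64 age_us = ktime_to_us(ktime_sub(ktime_get(), task->tk_start));

    if (req)    /* tk_rqstp is NULL until the task has been given a slot */
        pr_info("task %u: age=%lld us, rtt=%lld us\n",
                task->tk_pid, age_us, ktime_to_us(req->rq_rtt));
}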
2. rpc_wait_queue.qlen records the number of rpc tasks on a wait queue. The two rpc_wait_queue queues whose rpc_wait_queue.name is Seqid_waitqueue and xprt_pending each maintain their own rpc_wait_queue.qlen.