代码语言:javascript复制
na_ofi_fabric_open
prov_name = "tcp;ofi_rxm";
libfabric默认安装即可:
check_header: find_package(OFI 1.9 REQUIRED)
---------- bulk ----------
bulk rdma收发文件
mercury/05_bulk/client.c
./client protocol server_addr filename
main
HG_Set_log_level("debug")
state.hg_class = HG_Init(protocol, HG_FALSE)
state.hg_context = HG_Context_create(state.hg_class)
state.save_rpc_id = MERCURY_REGISTER(state.hg_class, "save", save_in_t, save_out_t, NULL)
access(save_op.filename, F_OK) 检查文件是否存在
HG_Addr_lookup1(state.hg_context, lookup_callback, &save_op, server_address, HG_OP_ID_IGNORE) -> 查地址, 然后回调
lookup_callback
save_operation* save_op = (save_operation*)(callback_info->arg); 回调通过指针传递参数
FILE* file = fopen(save_op->filename,"r");
fseek(file, 0L, SEEK_END)
save_op->size = ftell(file) 文件指针已移到末尾, ftell 返回的偏移量即文件大小(字节)
fseek(file, 0L, SEEK_SET);
save_op->buffer = calloc(1, save_op->size);
fread(save_op->buffer,1,save_op->size,file) // fread和fwrite的用法详解(以数据块的形式读写文件)
HG_Create(state->hg_context, addr, state->save_rpc_id, &handle)
HG_Bulk_create(state->hg_class, 1, (void**) &(save_op->buffer), &(save_op->size), HG_BULK_READ_ONLY, &(save_op->bulk_handle));
HG_Bulk_create (void**) &(save_op->buffer) 传入文件内容buf指针地址(二级指针) -> 客户端创建BULK
hg_bulk malloc/memset 0
hg_atomic_init32 ref_count 1 初始化引用计数
转分段 注册分段 hg_bulk_register_segments 最大静态段=8, HG_BULK_STATIC_MAX
hg_bulk_create_na_mem_descs 单独注册内存描述 individually 分为静态数组static和动态数组dynamic
hg_bulk_register 注册段 传递索引i
NA_Mem_handle_create
mem_handle_create -> na_ofi_mem_handle_create
NA_UNUSED 编译参数 忽略编译告警
calloc
na_ofi_mem_desc s[0] 单段, d 多段
flags & 0xff 全1
NA_Mem_register
mem_register na_ofi_mem_register
requested_key 自增
NA_MEM_READ_ONLY
access = FI_REMOTE_READ | FI_WRITE 远程读和本地写
fi_mr_regv 注册内存向量
rxm_mr_regv
rxm_mr_get_msg_access
fi_mr_regv
vrb_mr_regv
vrb_mr_cache_reg
ofi_mr_rbt_find 如果有缓存就找红黑树
util_mr_cache_create
add_region
vrb_mr_reg_common
ibv_reg_mr 注册内存
md->mr_fid.key = md->mr->rkey 远程键
md->lkey = md->mr->lkey 本地键
vrb_eq_write_event 写完成事件
rxm_mr_init
fi_mr_key 取回内存key
...desc.info.fi_mr_key = 远程键
NA_Mem_handle_get_serialize_size na_ofi_mem_handle_get_serialize_size
sizeof iovec info iovcnt
HG_Forward -> 客户端调用RDMA的发送接口, 服务端调用 HG_Bulk_transfer HG_BULK_PULL 拉取数据
save_completed
while ... do -> 循环
ret = HG_Trigger(state.hg_context, 0, 1, &count)
HG_Progress(state.hg_context, 100)
ret = HG_Context_destroy(state.hg_context)
hg_return_t err = HG_Finalize(state.hg_class)
----------
server:
mercury/05_bulk/server.c
./server addr
main
HG_Init
HG_Addr_self
HG_Addr_to_string
HG_Context_create
HG_Register_data
do
HG_Trigger
HG_Progress
save
HG_Bulk_transfer(stt->hg_context, save_bulk_completed, my_rpc_state, HG_BULK_PULL, info->addr, in.bulk_handle, 0, my_rpc_state->bulk_handle, 0, my_rpc_state->size, HG_OP_ID_IGNORE) origin and local 对齐 -> 服务端通过RDMA的单边读请求, 从客户端DMA拉取数据
HG_Context_destroy
HG_Finalize
---------- bulk ----------
HG_Forward
...
hg_core_handle->ops.forward(hg_core_handle) -> hg_core_forward_na
na_ret = NA_Msg_send_unexpected(hg_core_handle->na_class,
hg_core_handle->na_context, hg_core_send_input_cb, hg_core_handle,
hg_core_handle->core_handle.in_buf, hg_core_handle->in_buf_used,
hg_core_handle->in_buf_plugin_data, hg_core_handle->na_addr,
hg_core_handle->core_handle.info.context_id, hg_core_handle->tag,
hg_core_handle->na_send_op_id);
...
na_ofi_msg_send_unexpected(na_class_t *na_class -> na_ofi_msg_send
rc = fi_senddata(ep, msg_info->buf.const_ptr, msg_info->buf_size,
msg_info->desc, msg_info->tag & NA_OFI_TAG_MASK, msg_info->fi_addr,
context);
ofi+verbs;ofi_rxm://192.169.29.63:55555, class_name:ofi, protocol_name:verbs
HG_Core_init_opt2
hg_core_init
NA_Initialize_opt -> Initialize NA if not provided externally
na_plugin_check_protocol
na_ofi_check_protocol(const char *protocol_name)
uint32_t runtime_version = fi_version()
FI_MAJOR(runtime_version), FI_MINOR(runtime_version)
type = na_ofi_prov_name_to_type(protocol_name) -> static const char *const na_ofi_prov_name[] = {NA_OFI_PROV_TYPES} -> 网络配置,插件, 协议等
na_ofi_getinfo(type, NULL, &providers)
// 设置网络参数(提示信息,过滤提供者)
hints = fi_allocinfo()
hints->mode = FI_ASYNC_IOV -> FI_ASYNC_IOV 模式指示应用程序必须提供 IO 向量所需的缓冲。 设置后,应用程序不得修改长度 > 1 的 IO 向量,包括任何相关的内存描述符数组,直到关联操作完成
hints->ep_attr->type = FI_EP_RDM -> 设置端点类型为: 可靠的无连接数据报(Reliable Datagram Message)类型
hints->caps = FI_MSG | FI_TAGGED | FI_RMA | FI_DIRECTED_RECV
hints->tx_attr->msg_order = FI_ORDER_SAS -> msg_order:保证具有相同标签的消息是有序的。 (FI_ORDER_SAS - 发送后发送(send after send)。如果设置,消息发送操作(包括标记的发送)将按照相对于其他消息发送的提交顺序进行传输。如果未设置,则消息发送可能会不按其提交的顺序进行传输)
hints->tx_attr->op_flags = FI_INJECT_COMPLETE -> 当可以安全地重用缓冲区时生成完成事件
hints->domain_attr->av_type = FI_AV_MAP
hints->domain_attr->resource_mgmt = FI_RM_ENABLED
hints->fabric_attr->prov_name = strdup(na_ofi_prov_name[prov_type]) -> 提供者: verbs;ofi_rxm
hints->mode |= FI_CONTEXT
hints->ep_attr->protocol = (uint32_t) na_ofi_prov_ep_proto[prov_type]
hints->caps |= na_ofi_prov_extra_caps[prov_type]
hints->domain_attr->control_progress = na_ofi_prov_progress[prov_type]
hints->domain_attr->data_progress = na_ofi_prov_progress[prov_type]
rc = fi_getinfo(NA_OFI_VERSION, node, service, flags, hints, fi_info_p)
fi_freeinfo(providers)
ret = na_private_class->na_class.ops->initialize( &na_private_class->na_class, na_info, listen) -> na_ofi_initialize(
na_ofi_prov_addr_format(prov_type, na_init_info.addr_format)
info.thread_mode = ((na_init_info.thread_mode & NA_THREAD_MODE_SINGLE)
na_ofi_class = na_ofi_class_alloc();
ret = na_ofi_verify_info
na_ofi_class->msg_recv_unexpected == na_ofi_msg_recv
na_ofi_class->opt_features |= NA_OPT_MULTI_RECV
na_ofi_class->cq_poll = na_ofi_cq_poll_no_source
na_ofi_fabric_open(prov_type, na_ofi_class->fi_info->fabric_attr, &na_ofi_class->fabric)
na_ofi_domain_open(na_ofi_class->fabric, na_init_info.auth_key,
no_wait, na_ofi_prov_flags[prov_type] & NA_OFI_SEP,
na_ofi_prov_flags[prov_type] & NA_OFI_DOM_SHARED, na_ofi_class->fi_info,
&na_ofi_class->domain)
na_ofi_class->context_max = na_init_info.max_contexts
na_ofi_class->use_sep
na_ofi_endpoint_open(na_ofi_class->fabric, na_ofi_class->domain,
na_ofi_class->no_wait, na_ofi_class->use_sep, na_ofi_class->context_max,
na_init_info.max_unexpected_size, na_init_info.max_expected_size,
na_ofi_class->fi_info, &na_ofi_class->endpoint)
pool_chunk_size
na_ofi_class->send_pool = hg_mem_pool_create(pool_chunk_size
na_ofi_class->recv_pool = hg_mem_pool_create(pool_chunk_size
for (i = 0; i < NA_OFI_ADDR_POOL_COUNT; i++) -> 地址池
struct na_ofi_addr *na_ofi_addr = na_ofi_addr_alloc(na_ofi_class)
HG_QUEUE_PUSH_TAIL(&na_ofi_class->addr_pool.queue, na_ofi_addr, entry)
ret = na_ofi_endpoint_get_src_addr(na_ofi_class)
fi_getinfo
ret = prov->provider->getinfo(version, node, service, flags, hints, &cur); -> static int rxm_getinfo
ofi_is_wildcard_listen_addr
rxm_validate_atomic_hints(hints)
ofix_getinfo(version, node, service, flags, &rxm_util_prov, hints, rxm_info_to_core, rxm_info_to_rxm, info)
...
prov_mode = ofi_mr_get_prov_mode(api_version, user_info, prov_info)
prov_mode = ofi_cap_mr_mode(user_info->caps, prov_mode)
rxm_alter_info(hints, *info)
ofi_set_prov_attr(tail->fabric_attr, prov->provider)
发送数据
fi_senddata -> rxm_senddata
rxm_get_conn(rxm_ep, dest_addr, &rxm_conn)
...
rxm_conn_progress(ep)
ret = fi_eq_read(ep->msg_eq, &event, &cm_entry -> do loop -> vrb_eq_read(struct fid_eq *eq_fid, uint32_t *event, void *buf, size_t len, uint64_t flags)
eq = container_of(eq_fid, struct vrb_eq, eq_fid.fid) -> 事件队列, 参考结构: struct vrb_eq {
ret = vrb_eq_read_event(eq, event, buf, len, flags)
if (dlistfd_empty(&eq->list_head)) -> 读eq队列
rdma_get_cm_event(eq->channel, &cma_event);
vrb_eq_cm_process_event(eq, cma_event, event, buf, len) -> VERBS模块, 从事件队列中处理CM事件
RDMA_CM_EVENT_ROUTE_RESOLVED -> 客户端路由解析事件
ep->state = VRB_CONNECTING
rdma_connect(ep->id, &ep->conn_param) -> 主动发起连接请求(建连) -> int rdma_connect (struct rdma_cm_id *id, struct rdma_conn_param *conn_param) ->
rdma_ack_cm_event(cma_event)
rxm_handle_event(ep, event, &cm_entry, ret)
rxm_process_connect(cm_entry)
domain->flow_ctrl_ops->enable(conn->msg_ep, conn->ep->msg_info->rx_attr->size / 2) -> 流控 -> static int vrb_enable_ep_flow_ctrl
ep->peer_rq_credits = 1 + ep->saved_peer_rq_credits
vrb_ep2_domain(ep)->send_credits
conn->state = RXM_CM_CONNECTED -> 已建连
rxm_send_common -> rxm_send_common(struct rxm_ep *rxm_ep,
data_len = ofi_total_iov_len(iov, count)
assert(count <= rxm_ep->rxm_info->tx_attr->iov_limit) -> 限制4个iov
iface = rxm_mr_desc_to_hmem_iface_dev(desc, count, &device)
if (data_len <= rxm_ep->eager_limit) -> 走 eager 协议(小消息随包直接发送), eager_limit=16KB=16384B, data_len=184
rxm_send_eager(rxm_ep, rxm_conn, iov, desc, count,
fi_send(rxm_conn->msg_ep, &eager_buf->pkt, total_len -> vrb_msg_ep_send
struct ibv_send_wr wr = {
.wr_id = VERBS_COMP(ep, (uintptr_t)context),
.opcode = IBV_WR_SEND,
.send_flags = VERBS_INJECT(ep, len, desc),
};
vrb_send_buf
struct ibv_sge sge = vrb_init_sge(buf, len, desc);
wr->sg_list = &sge;
wr->num_sge = 1;
vrb_post_send(ep, wr, 0)
if (!ep->sq_credits || !ep->peer_rq_credits) -> 流控
vrb_flush_cq(cq)
vrb_poll_cq(cq, &wc)
ret = ibv_poll_cq(cq->cq, 1, wc)
vrb_report_wc(cq, &wc)
(void) ofi_cq_write(&cq->util_cq, (void *) (uintptr_t) wc->wr_id,flags, len, NULL, data, 0)
ofi_cirque_commit(cq->cirq) -> 环形队列(cirque)计数器 +1
ep->sq_credits--
ret = ibv_post_send(ep->ibv_qp, wr, &bad_wr)
buf转sge:
#define vrb_init_sge(buf, len, desc) (struct ibv_sge)
{ .addr = (uintptr_t) buf,
.length = (uint32_t) len,
.lkey = (desc) ? ((struct vrb_mem_desc *) (desc))->lkey : 0 }
static int rxm_send_connect(struct rxm_conn *conn)
static int rxm_open_conn
ret = fi_endpoint(domain->msg_domain, msg_info, &msg_ep, conn)
int vrb_open_ep -> vrb_get_port_space -> return RDMA_PS_TCP
int vrb_create_ep
rdma_create_id
rdma_resolve_addr -> TODO 也将此调用转换为非阻塞(使用事件通道):在运行大型 MPI 作业时,这可能是为了更好地扩展所需要的。 使其成为非阻塞意味着我们无法在 EP 启用时创建 QP。 在使用 rdma_create_qp 创建 QP 之前,我们需要等待 RDMA_CM_EVENT_ADDR_RESOLVED 事件。 它还需要一个SW接收队列来存储启用EP后应用程序发布的recvs
ep->util_ep.ep_fid.fid.ops = &vrb_ep_ops
ep->util_ep.ep_fid.ops = &vrb_ep_base_ops
fi_ep_bind(msg_ep, &ep->msg_eq->fid, 0) -> static int vrb_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
ofi_ep_bind_valid(&vrb_prov, bfid, flags)
rdma_migrate_id(ep->id, ep->eq->channel)
rxm_bind_comp(ep, msg_ep)
fi_ep_bind(msg_ep, &ep->msg_cq->fid, FI_TRANSMIT | FI_RECV)
ret = ofi_ep_bind_cq(&ep->util_ep, &cq->util_cq, flags)
return fid_list_insert(&cq->ep_list
fi_enable(msg_ep); -> static int vrb_ep_enable(struct fid_ep *ep_fid)
vrb_msg_ep_get_qp_attr(ep, &attr)
ret = rdma_create_qp(ep->id, domain->pd, &attr)
conn->flow_ctrl = domain->flow_ctrl_ops->available(msg_ep) -> static bool vrb_flow_ctrl_available
ret = rxm_prepost_recv(ep, msg_ep) -> ep->msg_info->rx_attr->size = 128
rx_buf = ofi_buf_alloc(rxm_ep->rx_pool)
if (ofi_bufpool_grow(pool))
ret = ofi_bufpool_region_alloc(buf_region);
rxm_post_recv(rx_buf)
ret = (int) fi_recv(rx_buf->rx_ep, &rx_buf->pkt -> vrb_msg_ep_recv(struct fid_ep *ep_fid
struct ibv_recv_wr wr = {
.wr_id = (uintptr_t)context,
.num_sge = 1,
.sg_list = &sge,
.next = NULL,
};
ssize_t vrb_post_recv(struct vrb_ep *ep, struct ibv_recv_wr *wr)
ret = ibv_post_recv(ep->ibv_qp, wr, &bad_wr)
conn->msg_ep = msg_ep
ret = rxm_init_connect_data(conn, &cm_data)
ret = fi_getopt(&conn->ep->msg_pep->fid, FI_OPT_ENDPOINT
cm_data->connect.port = ofi_addr_get_port(&conn->ep->addr.sa)
cm_data->connect.client_conn_id = rxm_conn_id(conn->peer->index) -> rx_size=128
fi_connect(conn->msg_ep, info->dest_addr, &cm_data, sizeof(cm_data)) -> vrb_msg_ep_connect(struct fid_ep *ep_fid,
vrb_msg_ep_prepare_cm_data(param, paramlen, cm_hdr)
vrb_ep_prepare_rdma_cm_param(&ep->conn_param,
conn_param->responder_resources = RDMA_MAX_RESP_RES
conn_param->initiator_depth = RDMA_MAX_INIT_DEPTH
conn_param->flow_control = 1;
conn_param->rnr_retry_count = 7; -> 无限重试
ep->conn_param.retry_count = 15
rdma_resolve_route(ep->id, VERBS_RESOLVE_TIMEOUT) -> 触发CM事件: RDMA_CM_EVENT_ROUTE_RESOLVED
rxm_ep_settings_init
rdma write, 单边写,
NA_Put
na_ofi_put
na_ofi_rma fi_writemsg = fi_rma_op .writemsg = -> include/rdma/fi_rma.h -> fi_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags)
ep->rma->writemsg(ep, msg, flags)
.writemsg = vrb_msg_ep_rma_writemsg
vrb_msg_ep_rma_writemsg -> prov/verbs/src/verbs_rma.c
struct ibv_send_wr wr
wr.opcode = IBV_WR_RDMA_WRITE | wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM 写|立即数写
vrb_send_iov(ep, &wr, msg->msg_iov, msg->desc -> ssize_t vrb_send_iov
wr->sg_list = alloca(sizeof(*wr->sg_list) * count)
wr->sg_list[i].addr = (uintptr_t) iov[i].iov_base
wr->sg_list[i].length = iov[i].iov_len;
wr->send_flags = IBV_SEND_INLINE ?IBV_SEND_FENCE
wr->sg_list[0]
wr->num_sge =
vrb_post_send(ep, wr, flags) -> prov/verbs/src/verbs_ep.c -> ssize_t vrb_post_send
ibv_post_send(ep->ibv_qp, wr, &bad_wr)
所有标志(Flags), include/rdma/fabric.h -> #define FI_MSG (1ULL << 1) ...
服务端执行bulk拉取
HG_Bulk_transfer(hg_context_t *context
hg_bulk_transfer_na(hg_bulk_op_t op
case HG_BULK_PULL:
na_bulk_op = hg_bulk_na_get
na_ret = na_bulk_op(hg_bulk_op_id->na_class -> hg_bulk_na_get -> return NA_Get -> na_class->ops->get -> na_ofi_get(na_class_t *na_class
na_ofi_rma_common(NA_OFI_CLASS(na_class), context, NA_CB_GET,
callback, arg, fi_readmsg, "fi_readmsg", FI_COMPLETION,
(struct na_ofi_mem_handle *) local_mem_handle, local_offset,
(struct na_ofi_mem_handle *) remote_mem_handle, remote_offset, length,
(struct na_ofi_addr *) remote_addr, remote_id,
(struct na_ofi_op_id *) op_id) -> 服务端拉取数据(READ)
..
na_ofi_rma_common(struct na_ofi_class *na_ofi_class,
struct iovec *local_iov = NA_OFI_IOV
void *local_desc = fi_mr_desc(na_ofi_mem_handle_local->fi_mr)
na_ofi_iov_translate
na_ofi_rma_iov_translate
na_ofi_rma_post(na_ofi_context->fi_tx, rma_info, &na_ofi_op_id->fi_ctx)
rc = rma_info->fi_rma_op -> rxm_ep_readmsg(struct fid_ep *ep_fid,
return rxm_ep_rma_common(rxm_ep, msg, flags | rxm_ep->util_ep.tx_msg_flags, fi_readmsg, FI_READ);
rxm_ep_rma_reg_iov(rxm_ep, msg_rma.msg_iov, msg_rma.desc, mr_desc, msg_rma.iov_count, comp_flags & (FI_WRITE | FI_READ), rma_buf)
fi_mr_desc(((struct rxm_mr *) desc[i])->msg_mr)
rma_msg(rxm_conn->msg_ep, &msg_rma, flags) -> vrb_msg_ep_rma_readmsg(struct fid_ep *ep_fid
struct ibv_send_wr wr = {
.wr_id = VERBS_COMP_READ_FLAGS(ep, flags, (uintptr_t)msg->context),
.opcode = IBV_WR_RDMA_READ,
.wr.rdma.remote_addr = msg->rma_iov->addr,
.wr.rdma.rkey = (uint32_t)msg->rma_iov->key,
.num_sge = msg->iov_count,
};
vrb_post_send(ep, &wr, 0)
vrb_flush_cq(cq)
ep->sq_credits--
ctx->op_queue = VRB_OP_SQ
ret = ibv_post_send(ep->ibv_qp, wr, &bad_wr) -> 提交WR,将客户端数据读过来(DMA)
wr->wr_id = (uintptr_t) ctx->user_ctx
slist_insert_tail(&ctx->entry, &ep->sq_list) -> 触发 vrb_flush_sq 遍历该发送队列 -> while (!slist_empty(&ep->sq_list))
下刷发送队列调用栈
(gdb) bt
#0 vrb_flush_sq (ep=0xb7a4d0) at prov/verbs/src/verbs_ep.c:456
#1 0x00007ffff7339f90 in vrb_ep_close (fid=0xb7a4d0) at prov/verbs/src/verbs_ep.c:582
#2 0x00007ffff7349804 in fi_close (fid=0xb7a4d0) at ./include/rdma/fabric.h:631
#3 0x00007ffff734a605 in rxm_close_conn (conn=0xb756b8) at prov/rxm/src/rxm_conn.c:88
#4 0x00007ffff734cd88 in rxm_process_shutdown (conn=0xb756b8) at prov/rxm/src/rxm_conn.c:768
#5 0x00007ffff734d170 in rxm_handle_event (ep=0x78b570, event=3, cm_entry=0x7fffffffd3d0, len=16) at prov/rxm/src/rxm_conn.c:830
#6 0x00007ffff734d279 in rxm_conn_progress (ep=0x78b570) at prov/rxm/src/rxm_conn.c:850
#7 0x00007ffff736256e in rxm_ep_do_progress (util_ep=0x78b570) at prov/rxm/src/rxm_cq.c:1843
#8 0x00007ffff736268a in rxm_ep_progress (util_ep=0x78b570) at prov/rxm/src/rxm_cq.c:1863
#9 0x00007ffff72d5b7d in ofi_cq_progress (cq=0x787770) at prov/util/src/util_cq.c:498
#10 0x00007ffff72d51c0 in ofi_cq_readfrom (cq_fid=0x787770, buf=0x0, count=0, src_addr=0x0) at prov/util/src/util_cq.c:257
#11 0x00007ffff72d3a56 in fi_cq_readfrom (cq=0x787770, buf=0x0, count=0, src_addr=0x0) at ./include/rdma/fi_eq.h:400
#12 0x00007ffff72d5207 in ofi_cq_read (cq_fid=0x787770, buf=0x0, count=0) at prov/util/src/util_cq.c:264
#13 0x00007ffff72dbf4f in fi_cq_read (cq=0x787770, buf=0x0, count=0) at ./include/rdma/fi_eq.h:394
#14 0x00007ffff72dc4cc in util_poll_run (poll_fid=0x783390, context=0x7fffffffdb78, count=1) at prov/util/src/util_poll.c:95
#15 0x00007ffff72dc9a0 in fi_poll (pollset=0x783390, context=0x7fffffffdb78, count=1) at ./include/rdma/fi_eq.h:335
#16 0x00007ffff72de0bf in util_wait_fd_try (wait=0x783290) at prov/util/src/util_wait.c:366
#17 0x00007ffff72dd456 in ofi_trywait (fabric=0x781b40, fids=0x7fffffffdc40, count=1) at prov/util/src/util_wait.c:72
#18 0x00000000004266de in fi_trywait (count=1, fids=0x7fffffffdc40, fabric=<optimized out>) at /usr/local/include/rdma/fi_eq.h:323
#19 na_ofi_poll_try_wait (na_class=<optimized out>, context=<optimized out>) at /home/xb/project/mercury/src/na/na_ofi.c:8229
#20 0x0000000000417b4a in hg_core_poll_try_wait (context=0x786c80) at /home/xb/project/mercury/src/mercury_core.c:5058
#21 hg_core_progress (context=context@entry=0x786c80, timeout_ms=100) at /home/xb/project/mercury/src/mercury_core.c:5001
#22 0x000000000042043e in HG_Core_progress (context=0x786c80, timeout_ms=timeout_ms@entry=100) at /home/xb/project/mercury/src/mercury_core.c:6530
#23 0x000000000040bf02 in HG_Progress (context=<optimized out>, timeout=timeout@entry=100) at /home/xb/project/mercury/src/mercury.c:2178
#24 0x0000000000407741 in main (argc=<optimized out>, argv=<optimized out>) at /home/xb/project/mercury/Examples/src/server.c:69
创建RDMA事件通道调用栈
#0 0x00007ffff6c966d0 in rdma_create_event_channel () from /lib64/librdmacm.so.1
#1 0x00007ffff6c967c5 in ucma_alloc_id () from /lib64/librdmacm.so.1
#2 0x00007ffff6c96821 in rdma_create_id2.part.20 () from /lib64/librdmacm.so.1
#3 0x00007ffff6c96476 in ucma_init () from /lib64/librdmacm.so.1
#4 0x00007ffff6c9697a in rdma_create_id () from /lib64/librdmacm.so.1
#5 0x00007ffff7334ebe in vrb_ifa_rdma_info (ifa=0x6784b0, dev_name=0x7fffffffd428, rai=0x7fffffffd420) at prov/verbs/src/verbs_info.c:955
#6 0x00007ffff73359e6 in vrb_getifaddrs (verbs_devs=0x7ffff776aaa0 <verbs_devs>) at prov/verbs/src/verbs_info.c:1177
#7 0x00007ffff7336048 in vrb_init_info (all_infos=0x7ffff776a5e8 <vrb_util_prov+8>) at prov/verbs/src/verbs_info.c:1378
#8 0x00007ffff73379c9 in vrb_getinfo (version=65549, node=0x0, service=0x0, flags=576460752303423488, hints=0x66a4c0, info=0x7fffffffd600) at prov/verbs/src/verbs_info.c:1872
#9 0x00007ffff729730c in fi_getinfo_ (version=65549, node=0x0, service=0x0, flags=576460752303423488, hints=0x66a4c0, info=0x7fffffffd750) at src/fabric.c:1282
#10 0x00007ffff72c70c0 in ofi_get_core_info (version=65549, node=0x0, service=0x0, flags=0, util_prov=0x7ffff776b960 <rxm_util_prov>, util_hints=0x6681e0, base_attr=0x7ffff776b8e0 <rxm_verbs_info>, info_to_core=0x7ffff73448f8 <rxm_info_to_core>, core_info=0x7fffffffd750)
at prov/util/src/util_attr.c:305
#11 0x00007ffff72c71da in ofix_getinfo (version=65549, node=0x0, service=0x0, flags=0, util_prov=0x7ffff776b960 <rxm_util_prov>, hints=0x6681e0, info_to_core=0x7ffff73448f8 <rxm_info_to_core>, info_to_util=0x7ffff7344ff7 <rxm_info_to_rxm>, info=0x7fffffffd8c0)
at prov/util/src/util_attr.c:328
#12 0x00007ffff7345d68 in rxm_getinfo (version=65549, node=0x0, service=0x0, flags=0, hints=0x6681e0, info=0x7fffffffd8c0) at prov/rxm/src/rxm_init.c:557
#13 0x00007ffff729730c in fi_getinfo_ (version=65549, node=0x0, service=0x0, flags=0, hints=0x6681e0, info=0x7fffffffd9c0) at src/fabric.c:1282
--------------------------
#14 0x000000000042a22f in na_ofi_getinfo (prov_type=prov_type@entry=NA_OFI_PROV_VERBS_RXM, info=info@entry=0x0, fi_info_p=fi_info_p@entry=0x7fffffffd9c0) at /home/xb/project/mercury/src/na/na_ofi.c:3247
#15 0x00000000004313ee in na_ofi_check_protocol (protocol_name=protocol_name@entry=0x6681a0 "verbs;ofi_rxm") at /home/xb/project/mercury/src/na/na_ofi.c:6679
#16 0x00000000004225ce in na_plugin_check_protocol (class_ops=0x44a2e0 <na_plugin_static_g>, ops_p=<synthetic pointer>, protocol_name=<optimized out>, class_name=0x668180 "ofi") at /home/xb/project/mercury/src/na/na.c:451
#17 NA_Initialize_opt2 (info_string=info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://175.16.53.61:55555", listen=<optimized out>, version=version@entry=262144, na_init_info=na_init_info@entry=0x7fffffffdaf0) at /home/xb/project/mercury/src/na/na.c:776
#18 0x00000000004229d1 in NA_Initialize_opt (info_string=info_string@entry=0x7fffffffe1db "ofi+verbs;ofi_rxm://175.16.53.61:55555", listen=<optimized out>, na_init_info=na_init_info@entry=0x7fffffffdaf0) at /home/xb/project/mercury/src/na/na.c:734
#19 0x0000000000413378 in hg_core_init (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://175.16.53.61:55555", na_listen=na_listen@entry=1 '