DAOS_分布式存储_mercury_libfabric_rdma_rpc高性能网络_大块数据_bulk传输_中断_轮询自动切换_等笔记

2023-11-21 08:11:28 浏览数 (2)

代码语言:javascript复制
na_ofi_fabric_open

prov_name = "tcp;ofi_rxm";


libfabric默认安装即可:
check_header: find_package(OFI 1.9 REQUIRED)


---------- bulk ----------
bulk rdma收发文件

mercury/05_bulk/client.c
./client protocol server_addr filename
main
  HG_Set_log_level("debug")
  state.hg_class = HG_Init(protocol, HG_FALSE)
  state.hg_context = HG_Context_create(state.hg_class)
  state.save_rpc_id = MERCURY_REGISTER(state.hg_class, "save", save_in_t, save_out_t, NULL)
  access(save_op.filename, F_OK) 检查文件是否存在
  HG_Addr_lookup1(state.hg_context, lookup_callback, &save_op, server_address, HG_OP_ID_IGNORE) -> 查地址, 然后回调
  lookup_callback  
    save_operation* save_op = (save_operation*)(callback_info->arg); 回调通过指针传递参数
    FILE* file = fopen(save_op->filename,"r");
    fseek(file, 0L, SEEK_END)
    save_op->size = ftell(file) 此时文件位置在末尾, ftell 的返回值即文件大小(字节)
    fseek(file, 0L, SEEK_SET);
    save_op->buffer = calloc(1, save_op->size);
    fread(save_op->buffer,1,save_op->size,file)  // fread和fwrite的用法详解(以数据块的形式读写文件)
    HG_Create(state->hg_context, addr, state->save_rpc_id, &handle)
    HG_Bulk_create(state->hg_class, 1, (void**) &(save_op->buffer), &(save_op->size), HG_BULK_READ_ONLY, &(save_op->bulk_handle));
    HG_Bulk_create (void**) &(save_op->buffer) 传入文件内容buf指针地址(二级指针) -> 客户端创建BULK
      hg_bulk malloc/memset 0
      hg_atomic_init32 ref_count 1 初始化引用计数
      转分段 注册分段 hg_bulk_register_segments 最大静态段=8, HG_BULK_STATIC_MAX
      hg_bulk_create_na_mem_descs 单独注册内存描述 individually 分为静态static和动态数组dynamic
        hg_bulk_register 注册段 传递索引i
          NA_Mem_handle_create
            mem_handler_create -> na_ofi_mem_handler_create
              NA_UNUSED 编译参数 忽略编译告警
              calloc
              na_ofi_mem_desc s[0] 单段, d 多段
              flags & 0xff 全1
          NA_Mem_register
            mem_register na_ofi_mem_register
              requested_key 自增
              NA_MEM_READ_ONLY 
              access = FI_REMOTE_READ | FI_WRITE 远程读和本地写
              fi_mr_regv 注册内存向量
                rxm_mr_regv
                  rxm_mr_get_msg_access
                  fi_mr_regv
                    vrb_mr_regv
                      vrb_mr_cache_reg
                        ofi_mr_rbt_find 如果有缓存就找红黑树
                        util_mr_cache_create
                        add_region
                          vrb_mr_reg_common
                            ibv_reg_mr 注册内存
                            md->mr_fid.key = md->mr->rkey 远程键
                            md->lkey = md->mr->lkey 本地键
                            vrb_eq_write_event 写完成事件
                  rxm_mr_init
              fi_mr_key 取回内存key
              ...desc.info.fi_mr_key = 远程键
          NA_Mem_handle_get_serialize_size na_ofi_mem_handle_get_serialize_size
            sizeof iovec info   iovcnt
    HG_Forward -> 客户端调用RDMA的发送接口, 服务端调用 HG_Bulk_transfer HG_BULK_PULL 拉取数据
    save_completed
  while ... do -> 循环
    ret = HG_Trigger(state.hg_context, 0, 1, &count)
    HG_Progress(state.hg_context, 100)
  ret = HG_Context_destroy(state.hg_context)
  hg_return_t err = HG_Finalize(state.hg_class)

----------

server:
mercury/05_bulk/server.c
./server addr
main
  HG_Init
  HG_Addr_self
  HG_Addr_to_string
  HG_Context_create
  HG_Register_data
  do
    HG_Trigger
    HG_Progress
  save
    HG_Bulk_transfer(stt->hg_context, save_bulk_completed, my_rpc_state, HG_BULK_PULL, info->addr, in.bulk_handle, 0, my_rpc_state->bulk_handle, 0, my_rpc_state->size, HG_OP_ID_IGNORE) origin and local 对齐 -> 服务端通过RDMA的单边读请求, 从客户端DMA拉取数据
  HG_Context_destroy
  HG_Finalize
---------- bulk ----------
HG_Forward
  ...
  hg_core_handle->ops.forward(hg_core_handle) -> hg_core_forward_na
    na_ret = NA_Msg_send_unexpected(hg_core_handle->na_class,
        hg_core_handle->na_context, hg_core_send_input_cb, hg_core_handle,
        hg_core_handle->core_handle.in_buf, hg_core_handle->in_buf_used,
        hg_core_handle->in_buf_plugin_data, hg_core_handle->na_addr,
        hg_core_handle->core_handle.info.context_id, hg_core_handle->tag,
        hg_core_handle->na_send_op_id);
      ...
      na_ofi_msg_send_unexpected(na_class_t *na_class -> na_ofi_msg_send
        rc = fi_senddata(ep, msg_info->buf.const_ptr, msg_info->buf_size,
        msg_info->desc, msg_info->tag & NA_OFI_TAG_MASK, msg_info->fi_addr,
        context);



ofi+verbs;ofi_rxm://192.169.29.63:55555, class_name:ofi, protocol_name:verbs
HG_Core_init_opt2
hg_core_init
NA_Initialize_opt -> Initialize NA if not provided externally
  na_plugin_check_protocol
    na_ofi_check_protocol(const char *protocol_name)
      uint32_t runtime_version = fi_version()
      FI_MAJOR(runtime_version), FI_MINOR(runtime_version)
      type = na_ofi_prov_name_to_type(protocol_name) -> static const char *const na_ofi_prov_name[] = {NA_OFI_PROV_TYPES} -> 网络配置,插件, 协议等
      na_ofi_getinfo(type, NULL, &providers)
        // 设置网络参数(提示信息,过滤提供者)
        hints = fi_allocinfo()
        hints->mode = FI_ASYNC_IOV -> FI_ASYNC_IOV 模式指示应用程序必须提供 IO 向量所需的缓冲。 设置后,应用程序不得修改长度 > 1 的 IO 向量,包括任何相关的内存描述符数组,直到关联操作完成
        hints->ep_attr->type = FI_EP_RDM -> 设置端点类型为: 可靠的无连接数据报(Reliable Datagram Message, 可靠 + 无连接, 保持消息边界)
        hints->caps = FI_MSG | FI_TAGGED | FI_RMA | FI_DIRECTED_RECV
        hints->tx_attr->msg_order = FI_ORDER_SAS -> msg_order:保证具有相同标签的消息是有序的。 (FI_ORDER_SAS - 发送后发送(send after send)。如果设置,消息发送操作(包括标记的发送)将按照相对于其他消息发送的提交顺序进行传输。如果未设置,则消息发送可能会不按其提交的顺序进行传输)
        hints->tx_attr->op_flags = FI_INJECT_COMPLETE -> 当可以安全地重用缓冲区时生成完成事件
        hints->domain_attr->av_type = FI_AV_MAP
        hints->domain_attr->resource_mgmt = FI_RM_ENABLED
        hints->fabric_attr->prov_name = strdup(na_ofi_prov_name[prov_type]) -> 提供者: verbs;ofi_rxm
        hints->mode |= FI_CONTEXT
        hints->ep_attr->protocol = (uint32_t) na_ofi_prov_ep_proto[prov_type]
        hints->caps |= na_ofi_prov_extra_caps[prov_type]
        hints->domain_attr->control_progress = na_ofi_prov_progress[prov_type]
        hints->domain_attr->data_progress = na_ofi_prov_progress[prov_type]
        rc = fi_getinfo(NA_OFI_VERSION, node, service, flags, hints, fi_info_p)
      fi_freeinfo(providers)
    ret = na_private_class->na_class.ops->initialize( &na_private_class->na_class, na_info, listen) -> na_ofi_initialize(
      na_ofi_prov_addr_format(prov_type, na_init_info.addr_format)
      info.thread_mode = ((na_init_info.thread_mode & NA_THREAD_MODE_SINGLE)
      na_ofi_class = na_ofi_class_alloc();            
      ret = na_ofi_verify_info
      na_ofi_class->msg_recv_unexpected == na_ofi_msg_recv
      na_ofi_class->opt_features |= NA_OPT_MULTI_RECV
      na_ofi_class->cq_poll = na_ofi_cq_poll_no_source
      na_ofi_fabric_open(prov_type, na_ofi_class->fi_info->fabric_attr, &na_ofi_class->fabric)
      na_ofi_domain_open(na_ofi_class->fabric, na_init_info.auth_key,
        no_wait, na_ofi_prov_flags[prov_type] & NA_OFI_SEP,
        na_ofi_prov_flags[prov_type] & NA_OFI_DOM_SHARED, na_ofi_class->fi_info,
        &na_ofi_class->domain)
      na_ofi_class->context_max = na_init_info.max_contexts
      na_ofi_class->use_sep
      na_ofi_endpoint_open(na_ofi_class->fabric, na_ofi_class->domain,
        na_ofi_class->no_wait, na_ofi_class->use_sep, na_ofi_class->context_max,
        na_init_info.max_unexpected_size, na_init_info.max_expected_size,
        na_ofi_class->fi_info, &na_ofi_class->endpoint)
      pool_chunk_size
      na_ofi_class->send_pool = hg_mem_pool_create(pool_chunk_size
      na_ofi_class->recv_pool = hg_mem_pool_create(pool_chunk_size
      for (i = 0; i < NA_OFI_ADDR_POOL_COUNT; i++) -> 地址池
        struct na_ofi_addr *na_ofi_addr = na_ofi_addr_alloc(na_ofi_class)
        HG_QUEUE_PUSH_TAIL(&na_ofi_class->addr_pool.queue, na_ofi_addr, entry)      
      ret = na_ofi_endpoint_get_src_addr(na_ofi_class)
      



fi_getinfo
ret = prov->provider->getinfo(version, node, service, flags, hints, &cur); -> static int rxm_getinfo
  ofi_is_wildcard_listen_addr
  rxm_validate_atomic_hints(hints)
  ofix_getinfo(version, node, service, flags, &rxm_util_prov, hints, rxm_info_to_core, rxm_info_to_rxm, info)
    ...
    prov_mode = ofi_mr_get_prov_mode(api_version, user_info, prov_info)
      prov_mode = ofi_cap_mr_mode(user_info->caps, prov_mode)
      
  rxm_alter_info(hints, *info)
ofi_set_prov_attr(tail->fabric_attr, prov->provider)

发送数据
fi_senddata -> rxm_senddata
  rxm_get_conn(rxm_ep, dest_addr, &rxm_conn)
    ...
    rxm_conn_progress(ep)
      ret = fi_eq_read(ep->msg_eq, &event, &cm_entry -> do loop -> vrb_eq_read(struct fid_eq *eq_fid, uint32_t *event, void *buf, size_t len, uint64_t flags)
        eq = container_of(eq_fid, struct vrb_eq, eq_fid.fid) -> 事件队列, 参考结构: struct vrb_eq {
        ret = vrb_eq_read_event(eq, event, buf, len, flags)
          if (dlistfd_empty(&eq->list_head)) -> 读eq队列
        rdma_get_cm_event(eq->channel, &cma_event);
        vrb_eq_cm_process_event(eq, cma_event, event, buf, len) -> VERBS模块, 从事件队列中处理CM事件
          RDMA_CM_EVENT_ROUTE_RESOLVED -> 客户端路由解析事件
          ep->state = VRB_CONNECTING
          rdma_connect(ep->id, &ep->conn_param) -> 主动发起连接请求(建连) -> int rdma_connect (struct rdma_cm_id *id, struct rdma_conn_param *conn_param) -> 
          rdma_ack_cm_event(cma_event)
      rxm_handle_event(ep, event, &cm_entry, ret)
        rxm_process_connect(cm_entry)
          domain->flow_ctrl_ops->enable(conn->msg_ep, conn->ep->msg_info->rx_attr->size / 2) -> 流控 -> static int vrb_enable_ep_flow_ctrl
            ep->peer_rq_credits = 1   ep->saved_peer_rq_credits
            vrb_ep2_domain(ep)->send_credits
        conn->state = RXM_CM_CONNECTED -> 已建连
  rxm_send_common -> rxm_send_common(struct rxm_ep *rxm_ep,
    data_len = ofi_total_iov_len(iov, count)
    assert(count <= rxm_ep->rxm_info->tx_attr->iov_limit) -> 限制4个iov
    iface = rxm_mr_desc_to_hmem_iface_dev(desc, count, &device)
    if (data_len <= rxm_ep->eager_limit) -> eager 路径(小消息直接随报文发送), eager_limit=16KB=16384B, 本次 data_len=184
      rxm_send_eager(rxm_ep, rxm_conn, iov, desc, count,
        fi_send(rxm_conn->msg_ep, &eager_buf->pkt, total_len  -> vrb_msg_ep_send
          struct ibv_send_wr wr = {
            .wr_id = VERBS_COMP(ep, (uintptr_t)context),
            .opcode = IBV_WR_SEND,
            .send_flags = VERBS_INJECT(ep, len, desc),
          };
          vrb_send_buf
            struct ibv_sge sge = vrb_init_sge(buf, len, desc);
            wr->sg_list = &sge;
            wr->num_sge = 1;
            vrb_post_send(ep, wr, 0)
              if (!ep->sq_credits || !ep->peer_rq_credits) -> 流控
                vrb_flush_cq(cq)
                  vrb_poll_cq(cq, &wc)
                    ret = ibv_poll_cq(cq->cq, 1, wc)
                  vrb_report_wc(cq, &wc)    
                    (void) ofi_cq_write(&cq->util_cq, (void *) (uintptr_t) wc->wr_id,flags, len, NULL, data, 0)
                      ofi_cirque_commit(cq->cirq) -> 环形队列写计数器 +1
              ep->sq_credits--
              ret = ibv_post_send(ep->ibv_qp, wr, &bad_wr)

buf转sge:
#define vrb_init_sge(buf, len, desc) (struct ibv_sge)	\
	{ .addr = (uintptr_t) buf,			\
	  .length = (uint32_t) len,			\
	  .lkey = (desc) ? ((struct vrb_mem_desc *) (desc))->lkey : 0 }









static int rxm_send_connect(struct rxm_conn *conn)
static int rxm_open_conn
  ret = fi_endpoint(domain->msg_domain, msg_info, &msg_ep, conn)
  int vrb_open_ep -> vrb_get_port_space -> return RDMA_PS_TCP
    int vrb_create_ep
      rdma_create_id
      rdma_resolve_addr -> TODO 也将此调用转换为非阻塞(使用事件通道):在运行大型 MPI 作业时,这可能是为了更好地扩展所需要的。 使其成为非阻塞意味着我们无法在 EP 启用时创建 QP。 在使用 rdma_create_qp 创建 QP 之前,我们需要等待 RDMA_CM_EVENT_ADDR_RESOLVED 事件。 它还需要一个SW接收队列来存储启用EP后应用程序发布的recvs
    ep->util_ep.ep_fid.fid.ops = &vrb_ep_ops
    ep->util_ep.ep_fid.ops = &vrb_ep_base_ops
  fi_ep_bind(msg_ep, &ep->msg_eq->fid, 0) -> static int vrb_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
    ofi_ep_bind_valid(&vrb_prov, bfid, flags)
    rdma_migrate_id(ep->id, ep->eq->channel)
  rxm_bind_comp(ep, msg_ep)  
    fi_ep_bind(msg_ep, &ep->msg_cq->fid, FI_TRANSMIT | FI_RECV)
      ret = ofi_ep_bind_cq(&ep->util_ep, &cq->util_cq, flags)
        return fid_list_insert(&cq->ep_list
  fi_enable(msg_ep); -> static int vrb_ep_enable(struct fid_ep *ep_fid)
    vrb_msg_ep_get_qp_attr(ep, &attr)
    ret = rdma_create_qp(ep->id, domain->pd, &attr)
  conn->flow_ctrl = domain->flow_ctrl_ops->available(msg_ep) -> static bool vrb_flow_ctrl_available
  ret = rxm_prepost_recv(ep, msg_ep) -> ep->msg_info->rx_attr->size = 128
    rx_buf = ofi_buf_alloc(rxm_ep->rx_pool)
      if (ofi_bufpool_grow(pool))
	      ret = ofi_bufpool_region_alloc(buf_region);
    rxm_post_recv(rx_buf)
      ret = (int) fi_recv(rx_buf->rx_ep, &rx_buf->pkt -> vrb_msg_ep_recv(struct fid_ep *ep_fid
        struct ibv_recv_wr wr = {
          .wr_id = (uintptr_t)context,
          .num_sge = 1,
          .sg_list = &sge,
          .next = NULL,
        };
       ssize_t vrb_post_recv(struct vrb_ep *ep, struct ibv_recv_wr *wr)
         ret = ibv_post_recv(ep->ibv_qp, wr, &bad_wr)
  conn->msg_ep = msg_ep
ret = rxm_init_connect_data(conn, &cm_data)
  ret = fi_getopt(&conn->ep->msg_pep->fid, FI_OPT_ENDPOINT
  cm_data->connect.port = ofi_addr_get_port(&conn->ep->addr.sa)
  cm_data->connect.client_conn_id = rxm_conn_id(conn->peer->index) -> rx_size=128
fi_connect(conn->msg_ep, info->dest_addr, &cm_data, sizeof(cm_data)) -> vrb_msg_ep_connect(struct fid_ep *ep_fid,
  vrb_msg_ep_prepare_cm_data(param, paramlen, cm_hdr)
  vrb_ep_prepare_rdma_cm_param(&ep->conn_param,
    conn_param->responder_resources = RDMA_MAX_RESP_RES
    conn_param->initiator_depth = RDMA_MAX_INIT_DEPTH
    conn_param->flow_control = 1;
	  conn_param->rnr_retry_count = 7; -> 无限重试
  ep->conn_param.retry_count = 15
  rdma_resolve_route(ep->id, VERBS_RESOLVE_TIMEOUT) -> 触发CM事件: RDMA_CM_EVENT_ROUTE_RESOLVED

rxm_ep_settings_init




rdma write, 单边写, 
NA_Put
na_ofi_put
na_ofi_rma fi_writemsg = fi_rma_op   .writemsg =    -> include/rdma/fi_rma.h -> fi_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags)
ep->rma->writemsg(ep, msg, flags)
.writemsg = vrb_msg_ep_rma_writemsg
vrb_msg_ep_rma_writemsg -> prov/verbs/src/verbs_rma.c
  struct ibv_send_wr wr
  wr.opcode = IBV_WR_RDMA_WRITE | wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM 写|立即数写
  vrb_send_iov(ep, &wr, msg->msg_iov, msg->desc -> ssize_t vrb_send_iov
    wr->sg_list = alloca(sizeof(*wr->sg_list) * count)
    wr->sg_list[i].addr = (uintptr_t) iov[i].iov_base
    wr->sg_list[i].length = iov[i].iov_len;
    wr->send_flags = IBV_SEND_INLINE ?IBV_SEND_FENCE
    wr->sg_list[0]
    wr->num_sge =
    vrb_post_send(ep, wr, flags) -> prov/verbs/src/verbs_ep.c -> ssize_t vrb_post_send
      ibv_post_send(ep->ibv_qp, wr, &bad_wr)

所有标签(Flags), include/rdma/fabric.h -> #define FI_MSG			(1ULL << 1) ...


服务端执行bulk拉取
HG_Bulk_transfer(hg_context_t *context
  hg_bulk_transfer_na(hg_bulk_op_t op
    case HG_BULK_PULL:
      na_bulk_op = hg_bulk_na_get
      na_ret = na_bulk_op(hg_bulk_op_id->na_class -> hg_bulk_na_get -> return NA_Get -> na_class->ops->get -> na_ofi_get(na_class_t *na_class
        na_ofi_rma_common(NA_OFI_CLASS(na_class), context, NA_CB_GET,
        callback, arg, fi_readmsg, "fi_readmsg", FI_COMPLETION,
        (struct na_ofi_mem_handle *) local_mem_handle, local_offset,
        (struct na_ofi_mem_handle *) remote_mem_handle, remote_offset, length,
        (struct na_ofi_addr *) remote_addr, remote_id,
        (struct na_ofi_op_id *) op_id) -> 服务端拉取数据(READ)
        ..
na_ofi_rma_common(struct na_ofi_class *na_ofi_class,
  struct iovec *local_iov = NA_OFI_IOV
  void *local_desc = fi_mr_desc(na_ofi_mem_handle_local->fi_mr)
  na_ofi_iov_translate        
  na_ofi_rma_iov_translate
  na_ofi_rma_post(na_ofi_context->fi_tx, rma_info, &na_ofi_op_id->fi_ctx)
    rc = rma_info->fi_rma_op -> rxm_ep_readmsg(struct fid_ep *ep_fid,
      return rxm_ep_rma_common(rxm_ep, msg, flags | rxm_ep->util_ep.tx_msg_flags, fi_readmsg, FI_READ);
        rxm_ep_rma_reg_iov(rxm_ep, msg_rma.msg_iov, msg_rma.desc, mr_desc, msg_rma.iov_count, comp_flags & (FI_WRITE | FI_READ), rma_buf)
          fi_mr_desc(((struct rxm_mr *) desc[i])->msg_mr)
          rma_msg(rxm_conn->msg_ep, &msg_rma, flags) -> vrb_msg_ep_rma_readmsg(struct fid_ep *ep_fid
            struct ibv_send_wr wr = {
              .wr_id = VERBS_COMP_READ_FLAGS(ep, flags, (uintptr_t)msg->context),
              .opcode = IBV_WR_RDMA_READ,
              .wr.rdma.remote_addr = msg->rma_iov->addr,
              .wr.rdma.rkey = (uint32_t)msg->rma_iov->key,
              .num_sge = msg->iov_count,
            };
            vrb_post_send(ep, &wr, 0)
              vrb_flush_cq(cq)
              ep->sq_credits--
              ctx->op_queue = VRB_OP_SQ
              ret = ibv_post_send(ep->ibv_qp, wr, &bad_wr) -> 提交WR,将客户端数据读过来(DMA)
              wr->wr_id = (uintptr_t) ctx->user_ctx
              slist_insert_tail(&ctx->entry, &ep->sq_list) -> 触发 vrb_flush_sq 遍历该发送队列 -> while (!slist_empty(&ep->sq_list))


下刷发送队列调用栈
(gdb) bt
#0  vrb_flush_sq (ep=0xb7a4d0) at prov/verbs/src/verbs_ep.c:456
#1  0x00007ffff7339f90 in vrb_ep_close (fid=0xb7a4d0) at prov/verbs/src/verbs_ep.c:582
#2  0x00007ffff7349804 in fi_close (fid=0xb7a4d0) at ./include/rdma/fabric.h:631
#3  0x00007ffff734a605 in rxm_close_conn (conn=0xb756b8) at prov/rxm/src/rxm_conn.c:88
#4  0x00007ffff734cd88 in rxm_process_shutdown (conn=0xb756b8) at prov/rxm/src/rxm_conn.c:768
#5  0x00007ffff734d170 in rxm_handle_event (ep=0x78b570, event=3, cm_entry=0x7fffffffd3d0, len=16) at prov/rxm/src/rxm_conn.c:830
#6  0x00007ffff734d279 in rxm_conn_progress (ep=0x78b570) at prov/rxm/src/rxm_conn.c:850
#7  0x00007ffff736256e in rxm_ep_do_progress (util_ep=0x78b570) at prov/rxm/src/rxm_cq.c:1843
#8  0x00007ffff736268a in rxm_ep_progress (util_ep=0x78b570) at prov/rxm/src/rxm_cq.c:1863
#9  0x00007ffff72d5b7d in ofi_cq_progress (cq=0x787770) at prov/util/src/util_cq.c:498
#10 0x00007ffff72d51c0 in ofi_cq_readfrom (cq_fid=0x787770, buf=0x0, count=0, src_addr=0x0) at prov/util/src/util_cq.c:257
#11 0x00007ffff72d3a56 in fi_cq_readfrom (cq=0x787770, buf=0x0, count=0, src_addr=0x0) at ./include/rdma/fi_eq.h:400
#12 0x00007ffff72d5207 in ofi_cq_read (cq_fid=0x787770, buf=0x0, count=0) at prov/util/src/util_cq.c:264
#13 0x00007ffff72dbf4f in fi_cq_read (cq=0x787770, buf=0x0, count=0) at ./include/rdma/fi_eq.h:394
#14 0x00007ffff72dc4cc in util_poll_run (poll_fid=0x783390, context=0x7fffffffdb78, count=1) at prov/util/src/util_poll.c:95
#15 0x00007ffff72dc9a0 in fi_poll (pollset=0x783390, context=0x7fffffffdb78, count=1) at ./include/rdma/fi_eq.h:335
#16 0x00007ffff72de0bf in util_wait_fd_try (wait=0x783290) at prov/util/src/util_wait.c:366
#17 0x00007ffff72dd456 in ofi_trywait (fabric=0x781b40, fids=0x7fffffffdc40, count=1) at prov/util/src/util_wait.c:72
#18 0x00000000004266de in fi_trywait (count=1, fids=0x7fffffffdc40, fabric=<optimized out>) at /usr/local/include/rdma/fi_eq.h:323
#19 na_ofi_poll_try_wait (na_class=<optimized out>, context=<optimized out>) at /home/xb/project/mercury/src/na/na_ofi.c:8229
#20 0x0000000000417b4a in hg_core_poll_try_wait (context=0x786c80) at /home/xb/project/mercury/src/mercury_core.c:5058
#21 hg_core_progress (context=context@entry=0x786c80, timeout_ms=100) at /home/xb/project/mercury/src/mercury_core.c:5001
#22 0x000000000042043e in HG_Core_progress (context=0x786c80, timeout_ms=timeout_ms@entry=100) at /home/xb/project/mercury/src/mercury_core.c:6530
#23 0x000000000040bf02 in HG_Progress (context=<optimized out>, timeout=timeout@entry=100) at /home/xb/project/mercury/src/mercury.c:2178
#24 0x0000000000407741 in main (argc=<optimized out>, argv=<optimized out>) at /home/xb/project/mercury/Examples/src/server.c:69              




创建RDMA事件通道调用栈
#0  0x00007ffff6c966d0 in rdma_create_event_channel () from /lib64/librdmacm.so.1
#1  0x00007ffff6c967c5 in ucma_alloc_id () from /lib64/librdmacm.so.1
#2  0x00007ffff6c96821 in rdma_create_id2.part.20 () from /lib64/librdmacm.so.1
#3  0x00007ffff6c96476 in ucma_init () from /lib64/librdmacm.so.1
#4  0x00007ffff6c9697a in rdma_create_id () from /lib64/librdmacm.so.1
#5  0x00007ffff7334ebe in vrb_ifa_rdma_info (ifa=0x6784b0, dev_name=0x7fffffffd428, rai=0x7fffffffd420) at prov/verbs/src/verbs_info.c:955
#6  0x00007ffff73359e6 in vrb_getifaddrs (verbs_devs=0x7ffff776aaa0 <verbs_devs>) at prov/verbs/src/verbs_info.c:1177
#7  0x00007ffff7336048 in vrb_init_info (all_infos=0x7ffff776a5e8 <vrb_util_prov+8>) at prov/verbs/src/verbs_info.c:1378
#8  0x00007ffff73379c9 in vrb_getinfo (version=65549, node=0x0, service=0x0, flags=576460752303423488, hints=0x66a4c0, info=0x7fffffffd600) at prov/verbs/src/verbs_info.c:1872
#9  0x00007ffff729730c in fi_getinfo_ (version=65549, node=0x0, service=0x0, flags=576460752303423488, hints=0x66a4c0, info=0x7fffffffd750) at src/fabric.c:1282
#10 0x00007ffff72c70c0 in ofi_get_core_info (version=65549, node=0x0, service=0x0, flags=0, util_prov=0x7ffff776b960 <rxm_util_prov>, util_hints=0x6681e0, base_attr=0x7ffff776b8e0 <rxm_verbs_info>, info_to_core=0x7ffff73448f8 <rxm_info_to_core>, core_info=0x7fffffffd750)
    at prov/util/src/util_attr.c:305
#11 0x00007ffff72c71da in ofix_getinfo (version=65549, node=0x0, service=0x0, flags=0, util_prov=0x7ffff776b960 <rxm_util_prov>, hints=0x6681e0, info_to_core=0x7ffff73448f8 <rxm_info_to_core>, info_to_util=0x7ffff7344ff7 <rxm_info_to_rxm>, info=0x7fffffffd8c0)
    at prov/util/src/util_attr.c:328
#12 0x00007ffff7345d68 in rxm_getinfo (version=65549, node=0x0, service=0x0, flags=0, hints=0x6681e0, info=0x7fffffffd8c0) at prov/rxm/src/rxm_init.c:557
#13 0x00007ffff729730c in fi_getinfo_ (version=65549, node=0x0, service=0x0, flags=0, hints=0x6681e0, info=0x7fffffffd9c0) at src/fabric.c:1282
--------------------------
#14 0x000000000042a22f in na_ofi_getinfo (prov_type=prov_type@entry=NA_OFI_PROV_VERBS_RXM, info=info@entry=0x0, fi_info_p=fi_info_p@entry=0x7fffffffd9c0) at /home/xb/project/mercury/src/na/na_ofi.c:3247
#15 0x00000000004313ee in na_ofi_check_protocol (protocol_name=protocol_name@entry=0x6681a0 "verbs;ofi_rxm") at /home/xb/project/mercury/src/na/na_ofi.c:6679
#16 0x00000000004225ce in na_plugin_check_protocol (class_ops=0x44a2e0 <na_plugin_static_g>, ops_p=<synthetic pointer>, protocol_name=<optimized out>, class_name=0x668180 "ofi") at /home/xb/project/mercury/src/na/na.c:451
#17 NA_Initialize_opt2 (info_string=info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://175.16.53.61:55555", listen=<optimized out>, version=version@entry=262144, na_init_info=na_init_info@entry=0x7fffffffdaf0) at /home/xb/project/mercury/src/na/na.c:776
#18 0x00000000004229d1 in NA_Initialize_opt (info_string=info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://175.16.53.61:55555", listen=<optimized out>, na_init_info=na_init_info@entry=0x7fffffffdaf0) at /home/xb/project/mercury/src/na/na.c:734
#19 0x0000000000413378 in hg_core_init (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://175.16.53.61:55555", na_listen=na_listen@entry=1 '01', version=version@entry=0, hg_init_info_p=hg_init_info_p@entry=0x0, class_p=class_p@entry=0x7fffffffdcb8)
    at /home/xb/project/mercury/src/mercury_core.c:1225
#20 0x000000000041bce1 in HG_Core_init_opt2 (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://175.16.53.61:55555", na_listen=na_listen@entry=1 '01', version=version@entry=0, hg_init_info=hg_init_info@entry=0x0)
    at /home/xb/project/mercury/src/mercury_core.c:5620
#21 0x000000000040967f in HG_Init_opt2 (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://175.16.53.61:55555", na_listen=na_listen@entry=1 '01', version=version@entry=0, hg_init_info_p=hg_init_info_p@entry=0x0) at /home/xb/project/mercury/src/mercury.c:1100
#22 0x000000000040987d in HG_Init (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://175.16.53.61:55555", na_listen=na_listen@entry=1 '01') at /home/xb/project/mercury/src/mercury.c:1041
#23 0x0000000000407681 in main (argc=<optimized out>, argv=<optimized out>) at /home/xb/project/mercury/Examples/src/server.c:42


vrb_getifaddrs
  vrb_ifa_rdma_info
    rdma_getaddrinfo
    rdma_bind_addr
    *dev_name = strdup(ibv_get_device_name(id->verbs->device))




#0  0x00007ffff6c966d0 in rdma_create_event_channel () from /lib64/librdmacm.so.1
#1  0x00007ffff6c967c5 in ucma_alloc_id () from /lib64/librdmacm.so.1
#2  0x00007ffff6c96821 in rdma_create_id2.part.20 () from /lib64/librdmacm.so.1
#3  0x00007ffff7320cc0 in vrb_get_rai_id (node=0x0, service=0x0, flags=576460752303423488, hints=0x787fd0, rai=0x7fffffffd4c8, id=0x7fffffffd4c0) at prov/verbs/src/verbs_init.c:278
#4  0x00007ffff733745a in vrb_handle_sock_addr (node=0x0, service=0x0, flags=576460752303423488, hints=0x787fd0, info=0x7fffffffd650) at prov/verbs/src/verbs_info.c:1770
#5  0x00007ffff733764c in vrb_get_match_infos (version=65549, node=0x0, service=0x0, flags=576460752303423488, hints=0x787fd0, raw_info=0x7ffff776a5e8 <vrb_util_prov+8>, info=0x7fffffffd650) at prov/verbs/src/verbs_info.c:1805
#6  0x00007ffff7337a01 in vrb_getinfo (version=65549, node=0x0, service=0x0, flags=576460752303423488, hints=0x787fd0, info=0x7fffffffd650) at prov/verbs/src/verbs_info.c:1876
#7  0x00007ffff729730c in fi_getinfo_ (version=65549, node=0x0, service=0x0, flags=576460752303423488, hints=0x787fd0, info=0x78b700) at src/fabric.c:1282
#8  0x00007ffff72c70c0 in ofi_get_core_info (version=65549, node=0x0, service=0x0, flags=0, util_prov=0x7ffff776b960 <rxm_util_prov>, util_hints=0x784540, base_attr=0x0, info_to_core=0x7ffff73448f8 <rxm_info_to_core>, core_info=0x78b700) at prov/util/src/util_attr.c:305
#9  0x00007ffff7353f15 in rxm_open_core_res (ep=0x78b560) at prov/rxm/src/rxm_ep.c:1748
#10 0x00007ffff73545b3 in rxm_endpoint (domain=0x788450, info=0x6683b0, ep_fid=0x670ad0, context=0x0) at prov/rxm/src/rxm_ep.c:1925
#11 0x0000000000434563 in fi_endpoint (context=0x0, ep=0x670ad0, info=0x6683b0, domain=<optimized out>) at /usr/local/include/rdma/fi_endpoint.h:178
#12 na_ofi_basic_ep_open (na_ofi_endpoint=0x670ad0, no_wait=false, fi_info=0x6683b0, na_ofi_domain=0x780ca0, na_ofi_fabric=0x7836b0) at /home/xb/project/mercury/src/na/na_ofi.c:4385
#13 na_ofi_endpoint_open (na_ofi_endpoint_p=0x66a4f0, fi_info=0x6683b0, expected_msg_size_max=0, unexpected_msg_size_max=0, max_contexts=<optimized out>, sep=false, no_wait=false, na_ofi_domain=0x780ca0, na_ofi_fabric=0x7836b0) at /home/xb/project/mercury/src/na/na_ofi.c:4357
#14 na_ofi_initialize (na_class=<optimized out>, na_info=<optimized out>, listen=<optimized out>) at /home/xb/project/mercury/src/na/na_ofi.c:6867
#15 0x00000000004227cb in NA_Initialize_opt2 (info_string=info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://172.17.29.63:55555", listen=<optimized out>, version=version@entry=262144, na_init_info=na_init_info@entry=0x7fffffffdaf0) at /home/xb/project/mercury/src/na/na.c:819
#16 0x00000000004229d1 in NA_Initialize_opt (info_string=info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://172.17.29.63:55555", listen=<optimized out>, na_init_info=na_init_info@entry=0x7fffffffdaf0) at /home/xb/project/mercury/src/na/na.c:734
#17 0x0000000000413378 in hg_core_init (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://172.17.29.63:55555", na_listen=na_listen@entry=1 '01', version=version@entry=0, hg_init_info_p=hg_init_info_p@entry=0x0, class_p=class_p@entry=0x7fffffffdcb8)
    at /home/xb/project/mercury/src/mercury_core.c:1225
#18 0x000000000041bce1 in HG_Core_init_opt2 (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://172.17.29.63:55555", na_listen=na_listen@entry=1 '01', version=version@entry=0, hg_init_info=hg_init_info@entry=0x0)
    at /home/xb/project/mercury/src/mercury_core.c:5620
#19 0x000000000040967f in HG_Init_opt2 (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://172.17.29.63:55555", na_listen=na_listen@entry=1 '01', version=version@entry=0, hg_init_info_p=hg_init_info_p@entry=0x0) at /home/xb/project/mercury/src/mercury.c:1100
#20 0x000000000040987d in HG_Init (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://172.17.29.63:55555", na_listen=na_listen@entry=1 '01') at /home/xb/project/mercury/src/mercury.c:1041
#21 0x0000000000407681 in main (argc=<optimized out>, argv=<optimized out>) at /home/xb/project/mercury/Examples/src/server.c:42




rdma_listen: 服务端RDMA监听, 调用栈
#0  0x00007ffff6c95dd0 in rdma_listen () from /lib64/librdmacm.so.1
#1  0x00007ffff7324566 in vrb_pep_listen (pep_fid=0x783dd0) at prov/verbs/src/verbs_cm.c:535
#2  0x00007ffff7349ea9 in fi_listen (pep=0x783dd0) at ./include/rdma/fi_cm.h:91
#3  0x00007ffff734dcee in rxm_start_listen (ep=0x78b560) at prov/rxm/src/rxm_conn.c:1009
#4  0x00007ffff7353959 in rxm_ep_ctrl (fid=0x78b560, command=6, arg=0x0) at prov/rxm/src/rxm_ep.c:1607
#5  0x00000000004346cf in fi_enable (ep=<optimized out>) at /usr/local/include/rdma/fi_endpoint.h:217
#6  na_ofi_basic_ep_open (na_ofi_endpoint=0x670ad0, no_wait=false, fi_info=0x6683b0, na_ofi_domain=<optimized out>, na_ofi_fabric=0x7836b0) at /home/xb/project/mercury/src/na/na_ofi.c:4427
#7  na_ofi_endpoint_open (na_ofi_endpoint_p=0x66a4f0, fi_info=0x6683b0, expected_msg_size_max=0, unexpected_msg_size_max=0, max_contexts=<optimized out>, sep=false, no_wait=false, na_ofi_domain=<optimized out>, na_ofi_fabric=0x7836b0) at /home/xb/project/mercury/src/na/na_ofi.c:4357
#8  na_ofi_initialize (na_class=<optimized out>, na_info=<optimized out>, listen=<optimized out>) at /home/xb/project/mercury/src/na/na_ofi.c:6867
#9  0x00000000004227cb in NA_Initialize_opt2 (info_string=info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://172.17.29.63:55555", listen=<optimized out>, version=version@entry=262144, na_init_info=na_init_info@entry=0x7fffffffdaf0) at /home/xb/project/mercury/src/na/na.c:819
#10 0x00000000004229d1 in NA_Initialize_opt (info_string=info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://172.17.29.63:55555", listen=<optimized out>, na_init_info=na_init_info@entry=0x7fffffffdaf0) at /home/xb/project/mercury/src/na/na.c:734
#11 0x0000000000413378 in hg_core_init (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://172.17.29.63:55555", na_listen=na_listen@entry=1 '01', version=version@entry=0, hg_init_info_p=hg_init_info_p@entry=0x0, class_p=class_p@entry=0x7fffffffdcb8)
    at /home/xb/project/mercury/src/mercury_core.c:1225
#12 0x000000000041bce1 in HG_Core_init_opt2 (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://172.17.29.63:55555", na_listen=na_listen@entry=1 '01', version=version@entry=0, hg_init_info=hg_init_info@entry=0x0)
    at /home/xb/project/mercury/src/mercury_core.c:5620
#13 0x000000000040967f in HG_Init_opt2 (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://172.17.29.63:55555", na_listen=na_listen@entry=1 '01', version=version@entry=0, hg_init_info_p=hg_init_info_p@entry=0x0) at /home/xb/project/mercury/src/mercury.c:1100
#14 0x000000000040987d in HG_Init (na_info_string=na_info_string@entry=0x7fffffffe1db "ofi verbs;ofi_rxm://172.17.29.63:55555", na_listen=na_listen@entry=1 '01') at /home/xb/project/mercury/src/mercury.c:1041
#15 0x0000000000407681 in main (argc=<optimized out>, argv=<optimized out>) at /home/xb/project/mercury/Examples/src/server.c:42



(gdb) 发送端解析地址调用栈
#0  0x00007ffff6c97780 in rdma_resolve_addr () from /lib64/librdmacm.so.1
#1  0x00007ffff73211be in vrb_create_ep (ep=0xa42010, ps=RDMA_PS_TCP, id=0xa42200) at prov/verbs/src/verbs_init.c:346
#2  0x00007ffff733c492 in vrb_open_ep (domain=0x783730, info=0x793e00, ep_fid=0x7fffffffd790, context=0xa3ccb8) at prov/verbs/src/verbs_ep.c:1210
#3  0x00007ffff7349da7 in fi_endpoint (domain=0x783730, info=0x793e00, ep=0x7fffffffd790, context=0xa3ccb8) at ./include/rdma/fi_endpoint.h:178
#4  0x00007ffff734ad73 in rxm_open_conn (conn=0xa3ccb8, msg_info=0x793e00) at prov/rxm/src/rxm_conn.c:189
#5  0x00007ffff734b455 in rxm_send_connect (conn=0xa3ccb8) at prov/rxm/src/rxm_conn.c:289
#6  0x00007ffff734b5fb in rxm_connect (conn=0xa3ccb8) at prov/rxm/src/rxm_conn.c:321
#7  0x00007ffff734bcd3 in rxm_get_conn (ep=0x786300, addr=1, conn=0x7fffffffd928) at prov/rxm/src/rxm_conn.c:466
#8  0x00007ffff7357dba in rxm_senddata (ep_fid=0x786300, buf=0x7ffff7edb030, len=156, desc=0x7934b0, data=1, dest_addr=1, context=0xa3c310) at prov/rxm/src/rxm_msg.c:848
#9  0x000000000042762c in fi_senddata (context=0xa3c310, dest_addr=<optimized out>, data=<optimized out>, desc=<optimized out>, len=<optimized out>, buf=<optimized out>, ep=0x786300) at /usr/local/include/rdma/fi_endpoint.h:343
#10 na_ofi_msg_send (ep=0x786300, msg_info=0xa3c138, context=0xa3c310) at /home/xb/project/mercury/src/na/na_ofi.c:5065
#11 0x000000000042d088 in na_ofi_msg_send_unexpected (na_class=<optimized out>, context=<optimized out>, callback=<optimized out>, arg=0xa3bc50, buf=<optimized out>, buf_size=<optimized out>, plugin_data=<optimized out>, dest_addr=<optimized out>, 
    dest_id=<optimized out>, tag=<optimized out>, op_id=<optimized out>) at /home/xb/project/mercury/src/na/na_ofi.c:7554
#12 0x0000000000414b8e in NA_Msg_send_unexpected (op_id=<optimized out>, tag=<optimized out>, dest_id=<optimized out>, dest_addr=<optimized out>, plugin_data=<optimized out>, buf_size=<optimized out>, buf=<optimized out>, arg=0xa3bc50, 
    callback=0x41a620 <hg_core_send_input_cb>, context=<optimized out>, na_class=<optimized out>) at /home/xb/project/mercury/src/na/na.h:1140
#13 hg_core_forward_na (hg_core_handle=0xa3bc50) at /home/xb/project/mercury/src/mercury_core.c:3949
#14 0x00000000004200d5 in hg_core_forward (payload_size=128, flags=0 '\000', arg=0xa3c6c0, callback=0x408120 <hg_core_forward_cb>, hg_core_handle=0xa3bc50) at /home/xb/project/mercury/src/mercury_core.c:3887
#15 HG_Core_forward (handle=0xa3bc50, callback=callback@entry=0x408120 <hg_core_forward_cb>, arg=arg@entry=0xa3c6c0, flags=<optimized out>, payload_size=128) at /home/xb/project/mercury/src/mercury_core.c:6432
#16 0x000000000040bdad in HG_Forward (handle=0xa3c6c0, callback=callback@entry=0x4080a0 <save_completed>, arg=arg@entry=0x7fffffffdda0, in_struct=in_struct@entry=0x7fffffffdbb0) at /home/xb/project/mercury/src/mercury.c:2111
#17 0x000000000040807b in lookup_callback (callback_info=0x7fffffffdc00) at /home/xb/project/mercury/Examples/src/client.c:132
#18 0x0000000000408245 in hg_core_addr_lookup_cb (callback_info=<optimized out>) at /home/xb/project/mercury/src/mercury.c:437
#19 0x00000000004186d5 in hg_core_trigger_lookup_entry (hg_core_op_id=0x786260) at /home/xb/project/mercury/src/mercury_core.c:5387
#20 hg_core_trigger (context=0x78d750, timeout_ms=timeout_ms@entry=0, max_count=max_count@entry=1, actual_count_p=actual_count_p@entry=0x7fffffffdd7c) at /home/xb/project/mercury/src/mercury_core.c:5339
#21 0x000000000042072f in HG_Core_trigger (context=<optimized out>, timeout=timeout@entry=0, max_count=max_count@entry=1, actual_count_p=actual_count_p@entry=0x7fffffffdd7c) at /home/xb/project/mercury/src/mercury_core.c:6577
#22 0x000000000040c162 in HG_Trigger (context=<optimized out>, timeout=timeout@entry=0, max_count=max_count@entry=1, actual_count_p=actual_count_p@entry=0x7fffffffdd7c) at /home/xb/project/mercury/src/mercury.c:2197
#23 0x0000000000407896 in main (argc=<optimized out>, argv=0x7fffffffdec8) at /home/xb/project/mercury/Examples/src/client.c:78


hg_core_progress -> 中断模式和轮询模式自动切换
  if (hg_core_poll_try_wait(context)) { -> 如果完成队列不为空, 表示有网络事件, 则返回false, 开启轮询模式, 反之开启中断模式
    safe_wait = HG_TRUE; -> 中断模式
  if (safe_wait) {  -> 中断模式
      ret = hg_core_poll_wait(context, poll_timeout, &progressed);
          epoll_wait(poll_set->fd, poll_set->events, max_poll_events, (int) timeout)
      HG_CHECK_SUBSYS_HG_ERROR(poll, error, ret,
          "Could not make blocking progress on context");
  } else { -> 轮询模式
      ret = hg_core_poll(context, poll_timeout, &progressed);
      HG_CHECK_SUBSYS_HG_ERROR(poll, error, ret,
          "Could not make non-blocking progress on context");
  }

  
bulk_多段:
HG_BULK_REGV
...
hg_bulk_create
    hg_bulk->desc.info.flags |= HG_BULK_REGV
    hg_bulk_register_segments
        NA_Mem_handle_create_segments
            mem_handle_create_segments -> na_ofi_mem_handle_create_segments
                if (segment_count > NA_OFI_IOV_STATIC_MAX) -> 超过8段
                    na_ofi_mem_handle->desc.iov.d
                for (i = 0; i < segment_count; i++)
                    iov[i].iov_base = (void *) segments[i].base 

晓兵(ssbandjl)

博客: https://logread.cn | https://blog.csdn.net/ssbandjl | https://cloud.tencent.com/developer/user/5060293/articles

DAOS汇总: https://cloud.tencent.com/developer/article/2344030

晓兵技术杂谈(系列)

https://cloud.tencent.com/developer/user/5060293/video

0 人点赞