Introduction
UCT (Unified Communication Transport) is a transport layer that abstracts the differences between various hardware architectures and provides a low-level API that supports the implementation of communication protocols. The primary goal of this layer is to provide direct and efficient access to hardware network resources with minimal software overhead. To this end, UCT relies on low-level drivers such as uGNI, Verbs, shared memory, ROCm, and CUDA. In addition, the layer provides constructs for communication context management (thread-based and at the application level, e.g. ucs_async_context_create, uct_worker_create) and for allocating and managing device-specific memory, including memory on accelerators. In terms of communication APIs, UCT defines interfaces for immediate (short, e.g. uct_ep_am_short), buffered-copy (bcopy, e.g. uct_ep_am_bcopy), and zero-copy (zcopy, e.g. uct_ep_am_zcopy) operations. Short operations are optimized for small messages that can be posted and completed in place. Bcopy operations are optimized for medium-sized messages, which are typically sent through a so-called bounce buffer. Finally, zcopy operations expose zero-copy memory-to-memory communication semantics.
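For reference, the three active-message send flavors correspond to the following prototypes (paraphrased from src/uct/api/uct.h; consult the header of your UCX version for the authoritative declarations):
/* short: small payload posted inline and completed in place */
ucs_status_t uct_ep_am_short(uct_ep_h ep, uint8_t id, uint64_t header,
                             const void *payload, unsigned length);

/* bcopy: the transport supplies a bounce buffer and the user packs into it via
 * a callback; returns the packed size, or a negative ucs_status_t on error */
ssize_t uct_ep_am_bcopy(uct_ep_h ep, uint8_t id, uct_pack_callback_t pack_cb,
                        void *arg, unsigned flags);

/* zcopy: sends registered user buffer(s) directly; local completion is
 * reported asynchronously through 'comp' */
ucs_status_t uct_ep_am_zcopy(uct_ep_h ep, uint8_t id, const void *header,
                             unsigned header_length, const uct_iov_t *iov,
                             size_t iovcnt, unsigned flags,
                             uct_completion_t *comp);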
Communication Semantics
- Remote memory access (RMA)
- put
- get
- Remote memory atomic operations (AMO)
- add
- fetch-and-add
- swap
- compare-and-swap
- 32/64 bit arguments
- Active messages (AM)
- Flush / fence
Core Functionality
The public UCT API is declared in the header src/uct/api/uct.h. The main operation families are listed below; a short PUT + flush usage sketch follows the list.
PUT operations
uct_ep_put_short
uct_ep_put_bcopy
uct_ep_put_zcopy
GET operations
uct_ep_get_short
uct_ep_get_bcopy
uct_ep_get_zcopy
Active message (AM) operations
uct_ep_am_short
uct_ep_am_short_iov
uct_ep_am_bcopy
uct_ep_am_zcopy
Atomic operations
uct_ep_atomic_cswap64
uct_ep_atomic_cswap32
uct_ep_atomic32_post
uct_ep_atomic64_post
uct_ep_atomic32_fetch
uct_ep_atomic64_fetch
Tag-matching (TAG) operations
uct_ep_tag_eager_short
uct_ep_tag_eager_bcopy
uct_ep_tag_eager_zcopy
uct_ep_tag_rndv_zcopy
uct_ep_tag_rndv_cancel
uct_ep_tag_rndv_request
uct_iface_tag_recv_zcopy
uct_iface_tag_recv_cancel
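To show how these calls are typically driven, here is a minimal sketch (not from the UCX sources) that issues a short PUT and then flushes the endpoint; put_and_flush() is a hypothetical helper, and the endpoint/rkey setup is assumed to have happened earlier:
#include <uct/api/uct.h>

static ucs_status_t put_and_flush(uct_ep_h ep, uct_worker_h worker,
                                  const void *buf, unsigned length,
                                  uint64_t remote_addr, uct_rkey_t rkey)
{
    ucs_status_t status;

    /* Retry while the transport is temporarily out of send resources */
    do {
        status = uct_ep_put_short(ep, buf, length, remote_addr, rkey);
        uct_worker_progress(worker);
    } while (status == UCS_ERR_NO_RESOURCE);

    if (UCS_STATUS_IS_ERR(status)) {
        return status;
    }

    /* Flush waits until all previously issued operations have completed */
    do {
        status = uct_ep_flush(ep, 0, NULL);
        uct_worker_progress(worker);
    } while ((status == UCS_ERR_NO_RESOURCE) || (status == UCS_INPROGRESS));

    return status;
}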
Context and Communication Objects
- uct_md_h - Memory domain object. Supports memory registration and allocation for use with the underlying transport.
typedef struct uct_md *uct_md_h;
struct uct_md {
uct_md_ops_t *ops;
uct_component_t *component;
};
- uct_md_resource_desc_t, uct_tl_resource_desc_t - Structures that hold information about the resources available to the current process, with distinct properties such as bandwidth, latency, message rate, and CPU locality (a discovery sketch follows the structure definitions below).
typedef struct uct_md_resource_desc {
char md_name[UCT_MD_NAME_MAX]; /**< Memory domain name */
} uct_md_resource_desc_t;
// A resource descriptor is an object that represents a network resource. It can represent a
// standalone communication resource (e.g. an HCA port or a network interface), several
// resources (e.g. multiple network interfaces or communication ports), or a virtual
// communication resource defined over a single physical network interface.
typedef struct uct_tl_resource_desc {
char tl_name[UCT_TL_NAME_MAX]; /**< Transport name */
char dev_name[UCT_DEVICE_NAME_MAX]; /**< Hardware device name */
uct_device_type_t dev_type; /**< The device represented by this resource
(e.g. UCT_DEVICE_TYPE_NET for a network interface) */
ucs_sys_device_t sys_device; /**< The identifier associated with the device
bus_id as captured in ucs_sys_bus_id_t struct */
} uct_tl_resource_desc_t;
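The sketch below, loosely modeled on examples/uct_hello_world.c and the ucx_info tool, shows how these resource descriptors are typically discovered; error handling is omitted for brevity:
#include <uct/api/uct.h>
#include <stdio.h>

static void list_tl_resources(void)
{
    uct_component_h *components;
    unsigned num_components, i, j, k;

    uct_query_components(&components, &num_components);
    for (i = 0; i < num_components; ++i) {
        /* First query just the number of memory-domain resources */
        uct_component_attr_t attr = {
            .field_mask = UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT
        };
        uct_component_query(components[i], &attr);

        /* Second query fills in the MD resource names */
        uct_md_resource_desc_t md_rsc[attr.md_resource_count];
        attr.field_mask   = UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES;
        attr.md_resources = md_rsc;
        uct_component_query(components[i], &attr);

        for (j = 0; j < attr.md_resource_count; ++j) {
            uct_md_config_t        *md_config;
            uct_md_h                md;
            uct_tl_resource_desc_t *tl_rsc;
            unsigned                num_tl;

            uct_md_config_read(components[i], NULL, NULL, &md_config);
            uct_md_open(components[i], md_rsc[j].md_name, md_config, &md);
            uct_config_release(md_config);

            uct_md_query_tl_resources(md, &tl_rsc, &num_tl);
            for (k = 0; k < num_tl; ++k) {
                printf("md %s: tl %s on device %s\n", md_rsc[j].md_name,
                       tl_rsc[k].tl_name, tl_rsc[k].dev_name);
            }
            uct_release_tl_resource_list(tl_rsc);
            uct_md_close(md);
        }
    }
    uct_release_component_list(components);
}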
- uct_worker_h - Groups communication resources and owns the progress engine. When driving UCT explicitly, the worker object must be passed in (a creation sketch follows the structure definitions below).
typedef struct uct_worker *uct_worker_h;
typedef struct uct_worker {
ucs_callbackq_t progress_q;
} uct_worker_t;
struct ucs_callbackq {
/**
* Array of fast-path element, the last is reserved as a sentinel to mark
* array end.
*/
ucs_callbackq_elem_t fast_elems[UCS_CALLBACKQ_FAST_COUNT 1];
/**
* Private data, which we don't want to expose in API to avoid pulling
* more header files
*/
char priv[72];
};
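A minimal sketch of creating a worker and driving it, assuming a single-threaded worker and a spinlock-protected async context; application_is_done() is a placeholder for the caller's own termination condition:
#include <uct/api/uct.h>

extern int application_is_done(void); /* placeholder supplied by the caller */

static void run_progress_engine(void)
{
    ucs_async_context_t *async;
    uct_worker_h         worker;

    ucs_async_context_create(UCS_ASYNC_MODE_THREAD_SPINLOCK, &async);
    uct_worker_create(async, UCS_THREAD_MODE_SINGLE, &worker);

    /* ... open interfaces and endpoints on this worker ... */

    while (!application_is_done()) {
        uct_worker_progress(worker); /* drive outstanding operations */
    }

    uct_worker_destroy(worker);
    ucs_async_context_destroy(async);
}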
- uct_iface_h - A communication resource on a specific device, with a specific transport, on a given worker. It has a unique network address and connections can (potentially) be established to it. It also holds the active-message handler table (an interface-open sketch follows the operations table below).
typedef struct uct_iface {
uct_iface_ops_t ops;
} uct_iface_t;
// Table of function operations supported by a network interface
typedef struct uct_iface_ops {
/* endpoint - put */
uct_ep_put_short_func_t ep_put_short;
uct_ep_put_bcopy_func_t ep_put_bcopy;
uct_ep_put_zcopy_func_t ep_put_zcopy;
/* endpoint - get */
uct_ep_get_short_func_t ep_get_short;
uct_ep_get_bcopy_func_t ep_get_bcopy;
uct_ep_get_zcopy_func_t ep_get_zcopy;
/* endpoint - active message */
uct_ep_am_short_func_t ep_am_short;
uct_ep_am_short_iov_func_t ep_am_short_iov;
uct_ep_am_bcopy_func_t ep_am_bcopy;
uct_ep_am_zcopy_func_t ep_am_zcopy;
/* endpoint - atomics */
uct_ep_atomic_cswap64_func_t ep_atomic_cswap64;
uct_ep_atomic_cswap32_func_t ep_atomic_cswap32;
uct_ep_atomic32_post_func_t ep_atomic32_post;
uct_ep_atomic64_post_func_t ep_atomic64_post;
uct_ep_atomic32_fetch_func_t ep_atomic32_fetch;
uct_ep_atomic64_fetch_func_t ep_atomic64_fetch;
/* endpoint - tagged operations */
uct_ep_tag_eager_short_func_t ep_tag_eager_short;
uct_ep_tag_eager_bcopy_func_t ep_tag_eager_bcopy;
uct_ep_tag_eager_zcopy_func_t ep_tag_eager_zcopy;
uct_ep_tag_rndv_zcopy_func_t ep_tag_rndv_zcopy;
uct_ep_tag_rndv_cancel_func_t ep_tag_rndv_cancel;
uct_ep_tag_rndv_request_func_t ep_tag_rndv_request;
/* interface - tagged operations */
uct_iface_tag_recv_zcopy_func_t iface_tag_recv_zcopy;
uct_iface_tag_recv_cancel_func_t iface_tag_recv_cancel;
/* endpoint - pending queue */
uct_ep_pending_add_func_t ep_pending_add;
uct_ep_pending_purge_func_t ep_pending_purge;
/* endpoint - synchronization */
uct_ep_flush_func_t ep_flush;
uct_ep_fence_func_t ep_fence;
uct_ep_check_func_t ep_check;
/* endpoint - connection establishment */
uct_ep_create_func_t ep_create;
uct_ep_connect_func_t ep_connect;
uct_ep_disconnect_func_t ep_disconnect;
uct_cm_ep_conn_notify_func_t cm_ep_conn_notify;
uct_ep_destroy_func_t ep_destroy;
uct_ep_get_address_func_t ep_get_address;
uct_ep_connect_to_ep_func_t ep_connect_to_ep;
uct_iface_accept_func_t iface_accept;
uct_iface_reject_func_t iface_reject;
/* interface - synchronization */
uct_iface_flush_func_t iface_flush;
uct_iface_fence_func_t iface_fence;
/* interface - progress control */
uct_iface_progress_enable_func_t iface_progress_enable;
uct_iface_progress_disable_func_t iface_progress_disable;
uct_iface_progress_func_t iface_progress;
/* interface - events */
uct_iface_event_fd_get_func_t iface_event_fd_get;
uct_iface_event_arm_func_t iface_event_arm;
/* interface - management */
uct_iface_close_func_t iface_close;
uct_iface_query_func_t iface_query;
/* interface - connection establishment */
uct_iface_get_device_address_func_t iface_get_device_address;
uct_iface_get_address_func_t iface_get_address;
uct_iface_is_reachable_func_t iface_is_reachable;
} uct_iface_ops_t;
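A sketch of how such an interface is typically opened on an existing memory domain and worker, based on the flow in examples/uct_hello_world.c (error paths abbreviated):
#include <uct/api/uct.h>

static ucs_status_t open_iface(uct_md_h md, uct_worker_h worker,
                               const char *tl_name, const char *dev_name,
                               uct_iface_h *iface_p)
{
    uct_iface_config_t *config;
    uct_iface_params_t  params;
    ucs_status_t        status;

    /* Read the transport's configuration (honors UCX_* environment variables) */
    status = uct_md_iface_config_read(md, tl_name, NULL, NULL, &config);
    if (status != UCS_OK) {
        return status;
    }

    params.field_mask           = UCT_IFACE_PARAM_FIELD_OPEN_MODE |
                                  UCT_IFACE_PARAM_FIELD_DEVICE;
    params.open_mode            = UCT_IFACE_OPEN_MODE_DEVICE;
    params.mode.device.tl_name  = tl_name;
    params.mode.device.dev_name = dev_name;

    status = uct_iface_open(md, worker, &params, config, iface_p);
    uct_config_release(config);

    if (status == UCS_OK) {
        /* Let the transport progress both send and receive operations */
        uct_iface_progress_enable(*iface_p, UCT_PROGRESS_SEND | UCT_PROGRESS_RECV);
    }
    return status;
}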
- uct_ep_h - A connection to a remote peer. There are two ways to create it: either create an endpoint that is connected to a remote interface (by its address), or create an endpoint and later connect it to a remote endpoint (p2p mode). A transport should support at least one of these methods and indicate which in its capability bits.
Examples:
- RC (reliable connection): the connection is represented by a QP
- UD (unreliable datagram): address handle plus reliability state
- DC (dynamically connected): address handle
- Shared memory: mapped segment
Remote memory access (RMA) can target any virtual address registered within the protection domain on which the remote interface operates.
typedef struct uct_ep {
uct_iface_h iface; // the network interface/port this endpoint is associated with
} uct_ep_t;
Ordering Semantics
- Determined by the endpoint/transport configuration and the hardware capabilities
- The ordering properties are exposed to the upper layer
- A fence operation can be used to enforce ordering between issued commands
Completion Semantics
There are two types of completion: local completion and remote completion.
- Remote completion: the remote side has performed the operation
It is not possible to track remote completion of a specific operation; the only option is to use a blocking or non-blocking flush to wait for remote completion of all operations issued so far. The exact semantics of remote completion depend on the transport and are exposed as part of its capabilities. For example:
- RMA: the remote memory has been written / the data has been scheduled on the PCI bus
- AM: the remote callback has started / has completed
- Local completion: the user buffer can be reused
- Explicit non-blocking: the user receives a handle, and completion is signaled on that handle.
- Implicit non-blocking: the user does not request a handle; local completion is implied by remote completion.
- There is an option to specify a send-completion callback. The thread-safety semantics of the callback are the same as for the network AM handler. After it is invoked, the handle is released by the library.
Operation Handle Allocation
A communication API that may not complete locally right away has the following form:
ucs_status_t uct_OPERATION(... , uct_completion_t *comp)
For example:
ucs_status_t uct_ep_put_zcopy(uct_ep_h ep, const void *buffer, size_t length,
uct_mem_h memh, uint64_t remote_addr,
uct_rkey_t rkey, uct_completion_t *comp)
- comp - A pointer to a callback structure, allocated by the user, used to indicate local completion. The user should initialize the structure with a counter and a callback; UCT decrements the counter on completion and invokes the callback when the counter reaches 0. The same pointer can be passed to multiple communication functions. If NULL, it is ignored, in which case a flush must be used to wait for local completion (a completion-structure sketch follows the usage example below).
- Possible return values:
- UCS_OK - The operation completed locally and the buffer can be reused. No request handle is returned and the callback argument is ignored.
- UCS_INPROGRESS - The operation has started and will complete in the future. If comp != NULL, the callback is invoked once local completion is known.
- UCS_ERR_NO_RESOURCE - The operation cannot be started now but may be started later. It is recommended to call uct_worker_progress() and retry.
Usage example:
status = api_call(..., &my_handle->comp);
if (likely(status == UCS_OK)) {
/* done */
} else if (status == UCS_INPROGRESS) {
/* started */
} else if (status == UCS_ERR_NO_RESOURCE) {
/* cannot be started now */
} else {
/* error */
}
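The explicit non-blocking flavor uses a user-allocated uct_completion_t, sketched below with the uct_ep_put_zcopy() signature quoted above. Note that the layout of uct_completion_t and its callback signature differ between UCX releases; this sketch follows recent versions of uct.h, where the structure holds func, count and status:
static void my_put_done(uct_completion_t *self)
{
    /* counter reached zero: every operation tracked by 'self' completed locally */
}

uct_completion_t comp = {
    .func   = my_put_done,
    .count  = 1,      /* one outstanding operation */
    .status = UCS_OK
};
ucs_status_t status;

status = uct_ep_put_zcopy(ep, buffer, length, memh, remote_addr, rkey, &comp);
if (status == UCS_OK) {
    /* completed in place; the callback will not be invoked */
} else if (status == UCS_INPROGRESS) {
    /* keep calling uct_worker_progress() until my_put_done() fires */
}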
Ordering: callbacks are triggered by the lowest-level interface. The transport may not be locally ordered (i.e. completion of operation X does not imply local completion of operations 0..X-1). Therefore the upper layer/user may want to set a callback for every fragment. In addition, a separate fence operation will be provided.
Active Messages
- The user specifies whether the callback is thread safe. If it is not, the transport must invoke it only during API progress calls, and not from a progress thread (if one exists). A registration sketch follows this list.
- The callback may call any communication function, but must not call progress. Recursion is avoided because the callback is responsible for putting the desired operation on the pending queue in case it cannot be started (UCS_ERR_NO_RESOURCE is returned).
- The callback is allowed to keep the data passed to it and release it later by returning UCS_INPROGRESS (e.g. for unexpected tags).
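A sketch of registering an active-message handler, following the flow of examples/uct_hello_world.c; the AM id 0 and the callback name are arbitrary:
#include <stdio.h>
#include <uct/api/uct.h>

/* Returning UCS_OK tells the transport the data can be released right away;
 * returning UCS_INPROGRESS keeps the descriptor for a later
 * uct_iface_release_desc() call (valid only when UCT_CB_PARAM_FLAG_DESC is set). */
static ucs_status_t hello_am_cb(void *arg, void *data, size_t length,
                                unsigned flags)
{
    printf("received %zu bytes\n", length);
    return UCS_OK; /* data consumed in place */
}

static void register_am_handler(uct_iface_h iface)
{
    /* UCT_CB_FLAG_ASYNC allows the callback to be invoked from a progress thread */
    uct_iface_set_am_handler(iface, 0, hello_am_cb, NULL, UCT_CB_FLAG_ASYNC);
}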
Progress Semantics
- A worker has an explicit progress function.
- RMA and AMO operations do not require explicit progress calls on the destination side. If the transport does not support RMA/AMO in hardware, it should emulate them in software using a progress thread. An event-driven progress sketch follows this list.
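Besides polling uct_worker_progress() in a loop, a transport that supports events lets the caller sleep on a file descriptor. The sketch below shows one plausible arm-and-wait pattern, assuming the interface advertises event support in its capabilities:
#include <poll.h>
#include <uct/api/uct.h>

static void progress_or_sleep(uct_iface_h iface, uct_worker_h worker)
{
    struct pollfd pfd;
    int fd;

    /* Drain all outstanding work first */
    while (uct_worker_progress(worker) != 0) {
    }

    /* Try to arm the interface; any status other than UCS_OK (e.g. UCS_ERR_BUSY,
     * meaning new events already arrived) means the caller should progress again */
    if (uct_iface_event_arm(iface, UCT_EVENT_RECV | UCT_EVENT_SEND_COMP) != UCS_OK) {
        return;
    }

    /* Armed: block until the device signals the event file descriptor */
    uct_iface_event_fd_get(iface, &fd);
    pfd.fd     = fd;
    pfd.events = POLLIN;
    poll(&pfd, 1, -1);
}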
Thread Safety
- All API functions should be thread safe
- Interfaces (uct_iface_h) can be progressed independently from different threads.
- At compile time, one of the following can be selected:
- Not thread safe
- Coarse-grained locking (per context)
- Fine-grained locking (best effort to progress the same context from multiple threads)
- Thread safety of data structures:
- Every data structure has a non-thread-safe version
- Some data structures also have a thread-safe version
- At compile time, unless building the "fine-grained" flavor, the thread-safe versions degrade to non-thread-safe ones.
- When using data structures, developers may use the thread-safe versions as part of the fine-grained-locking build.
- To decide at runtime (a la MPI_Init_thread):
- Option 1: load an alternative library version (e.g. -mt)
- Option 2: add a runtime check to every lock/atomic
Memory Handling
- A memory domain supports allocate/free and register/deregister.
- Registered memory is represented as uct_mem_h
- To allow remote access to a memory region, the user must obtain a packed rkey and send it to the peer using an out-of-band mechanism. The packed rkey buffer is obtained by providing the memory handle (see the flow sketch after this list).
- The side that performs RMA unpacks the buffer and obtains a rkey_bundle_t, which contains the rkey as uct_rkey_t plus an opaque pointer used to track its resource usage.
- The rkey can be used directly for RMA.
- A memory domain may choose to cache registrations to reduce their overhead, or to leverage on-demand paging.
- In UCP, there will be a function that figures out the correct order for registering memory with multiple transports.
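A pseudocode-style sketch of the register/pack/unpack flow described above; error handling and the out-of-band exchange itself are omitted, and 'component' is assumed to be the component from which the MD was opened:
uct_mem_h         memh;
uct_md_attr_t     md_attr;
void             *rkey_buffer;
uct_rkey_bundle_t rkey_ob;

/* 1. Register the local buffer with the memory domain */
uct_md_mem_reg(md, buffer, length, UCT_MD_MEM_ACCESS_ALL, &memh);

/* 2. Pack the rkey into a buffer of md_attr.rkey_packed_size bytes and send it
 *    to the peer out-of-band (e.g. over a TCP socket) */
uct_md_query(md, &md_attr);
rkey_buffer = malloc(md_attr.rkey_packed_size);
uct_md_mkey_pack(md, memh, rkey_buffer);

/* 3. The side performing RMA unpacks the buffer into a uct_rkey_bundle_t and
 *    uses rkey_ob.rkey in its uct_ep_put/get/atomic calls */
uct_rkey_unpack(component, rkey_buffer, &rkey_ob);

/* 4. Release the unpacked rkey and deregister the memory when done */
uct_rkey_release(component, &rkey_ob);
uct_md_mem_dereg(md, memh);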
Data Specification
- Short (inline):
- Buffer, length.
- The maximum supported inline size is exposed.
- Transports must guarantee a minimum size of <CONSTANT> bytes, defined at compile time (approximately 40 bytes).
- Not supported for get()
- Buffered copy (bcopy):
- A "pack" callback, a context argument, and a length (a pack-callback sketch follows this list)
- memcpy() can be passed as the pack callback
- The size limit is defined by the bounce-buffer size and is exposed in the transport attributes.
- Zero copy (zcopy):
- Buffer, length, memory handle
- The data must be sent as zero copy.
- The local key must be valid
- Single-dimension scatter/gather - iovec (can be local or remote)
- An iovec element has: pointer, length, stride, count, key / iovec length
- The key should be obtained from the memory-mapping functions.
- The transport exposes the maximum number of entries in an iovec
- IB implementation note: the TL will post UMRs in the correct order as needed, using temporary memory keys.
- Atomics - arguments are passed directly without a local key, since the cost of copying the result is negligible.
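The pack-callback sketch referenced in the bcopy item above; pack_ctx_t, send_am_bcopy() and the AM id 0 are illustrative only:
#include <string.h>
#include <uct/api/uct.h>

typedef struct {
    const void *data;
    size_t      length;
} pack_ctx_t; /* hypothetical helper context */

/* The transport hands us its bounce buffer; we copy the payload into it and
 * return the number of bytes actually packed */
static size_t pack_cb(void *dest, void *arg)
{
    pack_ctx_t *ctx = arg;
    memcpy(dest, ctx->data, ctx->length);
    return ctx->length;
}

/* uct_ep_am_bcopy() returns the packed length, or a negative ucs_status_t
 * value (e.g. UCS_ERR_NO_RESOURCE) on failure */
static ssize_t send_am_bcopy(uct_ep_h ep, const void *payload, size_t payload_len)
{
    pack_ctx_t ctx    = { .data = payload, .length = payload_len };
    ssize_t    packed = uct_ep_am_bcopy(ep, 0, pack_cb, &ctx, 0);
    if (packed < 0) {
        /* handle (ucs_status_t)packed */
    }
    return packed;
}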
Connection Establishment
- Transports support:
- create_ep(iface) -> uct_ep_t - a local operation
- connect_ep_to_ep(uct_ep_t, remote_iface_addr, remote_ep_addr) - both sides have to call it - most likely a local operation
- connect_ep_to_iface(uct_ep_t, remote_iface_addr) - an optional transport capability - it is enough for one side to call it
- A transport exposes what it supports by setting capability flags (a connection sketch follows this list)
- DC will only use connect_to_iface()
- The active-message callback does not really need to know who the sender is. That is only needed for tag matching, in which case the sender rank is already packed in the message.
- It is possible to create multiple endpoints on the same network context and connect them to multiple endpoints of the same destination network context. Each local endpoint can have a unique "index"/"tag" that is part of its address. This information is exchanged as part of the remote_ep_addr blob.
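A sketch of the connect-endpoint-to-endpoint (p2p) flow described above, loosely following examples/uct_hello_world.c; the out-of-band exchange of the address blobs is assumed and not shown:
#include <stdlib.h>
#include <uct/api/uct.h>

static ucs_status_t connect_p2p(uct_iface_h iface,
                                const uct_device_addr_t *peer_dev_addr,
                                const uct_ep_addr_t *peer_ep_addr,
                                uct_ep_h *ep_p)
{
    uct_iface_attr_t iface_attr;
    uct_ep_params_t  ep_params;
    uct_ep_addr_t   *my_ep_addr;

    uct_iface_query(iface, &iface_attr);

    /* 1. Create a local endpoint on the interface */
    ep_params.field_mask = UCT_EP_PARAM_FIELD_IFACE;
    ep_params.iface      = iface;
    uct_ep_create(&ep_params, ep_p);

    /* 2. Extract the local endpoint address; the caller sends it to the peer
     *    out-of-band (exchange not shown) */
    my_ep_addr = malloc(iface_attr.ep_addr_len);
    uct_ep_get_address(*ep_p, my_ep_addr);
    free(my_ep_addr);

    /* 3. Connect to the peer's endpoint; both sides perform this step */
    return uct_ep_connect_to_ep(*ep_p, peer_dev_addr, peer_ep_addr);
}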
Runtime Environment (RTE)
- Will not be part of the API. The user can use an RTE to provide UCT with the address blobs to connect to.
- Callback table
- Point-to-point semantics (active messages)
- Runtimes to consider: slurm, alps, orte, stci, Hydra, lsf, torque, sge, ssh, rsh, Oracle Grid Engine, PMIx
UCT Server and Client Example
Source location: examples/uct_hello_world.c
Build and run: cd examples; make && ./uct_hello_world
Run the server (specifying the RDMA device/port and zero-copy mode):
/home/xb/project/ucx/examples/.libs/lt-uct_hello_world -d mlx5_0:1 -t rc_verbs -z
Run the client (specifying the RDMA device/port, zero-copy mode, and the server IP):
/home/xb/project/ucx/examples/.libs/lt-uct_hello_world -d mlx5_0:1 -t rc_verbs -n 172.17.29.63 -z
Server log:
export UCX_LOG_LEVEL=debug
/home/xb/project/ucx/examples/.libs/lt-uct_hello_world -d mlx5_0:1 -t rc_verbs
[root@node63 ucx]# ./s_uct.sh
[1696660940.924550] [node63:3375441:0] debug.c:1155 UCX DEBUG using signal stack 0x7faadc17a000 size 141824
[1696660940.925589] [node63:3375441:0] init.c:121 UCX DEBUG /home/xb/project/ucx/src/ucs/.libs/libucs.so.0 loaded at 0x7faadbd15000
[1696660940.925611] [node63:3375441:0] init.c:122 UCX DEBUG cmd line: /home/xb/project/ucx/examples/.libs/lt-uct_hello_world -d mlx5_0:1 -t rc_verbs
[1696660940.925624] [node63:3375441:0] module.c:72 UCX DEBUG ucs library path: /home/xb/project/ucx/src/ucs/.libs/libucs.so.0
[1696660940.925629] [node63:3375441:0] module.c:280 UCX DEBUG loading modules for ucs
INFO: UCT_HELLO_WORLD AM function = uct_ep_am_short server = (null) port = 13337
[1696660940.925665] [node63:3375441:0] module.c:280 UCX DEBUG loading modules for uct
[1696660940.926871] [node63:3375441:0] topo.c:792 UCX DEBUG /sys/class/net/ib98-0: PF sysfs path is '/sys/devices/pci0000:97/0000:97:04.0/0000:98:00.0'
[1696660940.926884] [node63:3375441:0] topo.c:240 UCX DEBUG added sys_dev 0 for bus id 98:00.0
[1696660940.926889] [node63:3375441:0] topo.c:475 UCX DEBUG ib98-0: bdf_name 0000:98:00.0 sys_dev 0
[1696660940.927070] [node63:3375441:0] topo.c:792 UCX DEBUG /sys/class/net/ib17-0: PF sysfs path is '/sys/devices/pci0000:15/0000:15:04.0/0000:17:00.0'
[1696660940.927076] [node63:3375441:0] topo.c:240 UCX DEBUG added sys_dev 1 for bus id 17:00.0
[1696660940.927081] [node63:3375441:0] topo.c:475 UCX DEBUG ib17-0: bdf_name 0000:17:00.0 sys_dev 1
[1696660940.927754] [node63:3375441:0] topo.c:787 UCX DEBUG /sys/class/net/lo: sysfs path undetected
[1696660940.927758] [node63:3375441:0] topo.c:479 UCX DEBUG lo: system device unknown
[1696660940.928680] [node63:3375441:0] topo.c:792 UCX DEBUG /sys/class/net/ethA69-0: PF sysfs path is '/sys/devices/pci0000:68/0000:68:02.0/0000:69:00.0'
[1696660940.928685] [node63:3375441:0] topo.c:240 UCX DEBUG added sys_dev 2 for bus id 69:00.0
[1696660940.928689] [node63:3375441:0] topo.c:475 UCX DEBUG ethA69-0: bdf_name 0000:69:00.0 sys_dev 2
[1696660940.928787] [node63:3375441:0] module.c:280 UCX DEBUG loading modules for uct_ib
[1696660940.933874] [node63:3375441:0] topo.c:792 UCX DEBUG /sys/class/infiniband/mlx5_0: PF sysfs path is '/sys/devices/pci0000:15/0000:15:04.0/0000:17:00.0'
[1696660940.933885] [node63:3375441:0] topo.c:475 UCX DEBUG mlx5_0: bdf_name 0000:17:00.0 sys_dev 1
[1696660940.933906] [node63:3375441:0] ib_device.c:487 UCX DEBUG mlx5_0: vendor_id 0x15b3 device_id 4117
[1696660940.934468] [node63:3375441:0] ib_mlx5dv_md.c:1264 UCX DEBUG mlx5_0: crossing_vhca_mkey is not supported
[1696660940.934661] [node63:3375441:0] ib_mlx5dv_md.c:880 UCX DEBUG mlx5_0: ODP is disabled because version 1 is not supported for DevX QP
[1696660940.934883] [node63:3375441:0] async.c:232 UCX DEBUG added async handler 0x172b320 [id=4 ref 1] ???() to hash
[1696660940.934952] [node63:3375441:0] async.c:494 UCX DEBUG listening to async event fd 4 events 0x1 mode thread_spinlock
[1696660940.934958] [node63:3375441:0] ib_device.c:586 UCX DEBUG initialized device 'mlx5_0' (InfiniBand channel adapter) with 1 ports
[1696660940.934968] [node63:3375441:0] ib_md.c:1115 UCX DEBUG mlx5_0: cuda GPUDirect RDMA is disabled
[1696660940.934974] [node63:3375441:0] ib_md.c:1115 UCX DEBUG mlx5_0: rocm GPUDirect RDMA is disabled
[1696660940.934985] [node63:3375441:0] ib_md.c:1140 UCX DEBUG mlx5_0: dmabuf is supported
[1696660940.934992] [node63:3375441:0] mpool.c:138 UCX DEBUG mpool devx dbrec: align 64, maxelems 4294967295, elemsize 40
[1696660940.935245] [node63:3375441:0] ib_mlx5dv_md.c:1341 UCX DEBUG mlx5_0: opened DEVX md log_max_qp=17
[1696660940.935251] [node63:3375441:0] ib_md.c:1103 UCX DEBUG mlx5_0: relaxed order memory access is disabled
[1696660940.935710] [node63:3375441:0] ib_mlx5dv_md.c:1011 UCX DEBUG created indirect rkey 0x9f00 for remote flush
[1696660940.935715] [node63:3375441:0] ib_md.c:1054 UCX DEBUG mlx5_0: md open by 'uct_ib_mlx5_devx_md_ops' is successful
[1696660940.935750] [node63:3375441:0] ib_device.c:1052 UCX DEBUG no compatible IB ports found for flags 0xc4
[1696660940.935755] [node63:3375441:0] uct_md.c:97 UCX DEBUG failed to query dc_mlx5 resources: No such device
[1696660940.937373] [node63:3375441:0] ib_iface.c:927 UCX DEBUG using pkey[0] 0xffff on mlx5_0:1/RoCE
[1696660940.937429] [node63:3375441:0] ib_device.c:916 UCX DEBUG mlx5_0:1 using gid_index 3
[1696660940.938579] [node63:3375441:0] ib_iface.c:1453 UCX DEBUG created uct_ib_iface_t headroom_ofs 12 payload_ofs 16 hdr_ofs 15 data_sz 8256
[1696660940.938616] [node63:3375441:0] mpool.c:138 UCX DEBUG mpool rc_recv_desc: align 64, maxelems 4294967295, elemsize 8279
[1696660940.938621] [node63:3375441:0] mpool.c:138 UCX DEBUG mpool rc_send_desc: align 64, maxelems 4294967295, elemsize 8328
[1696660940.938705] [node63:3375441:0] mpool.c:138 UCX DEBUG mpool send-ops-mpool: align 64, maxelems 4294967295, elemsize 56
[1696660940.939137] [node63:3375441:0] mpool.c:138 UCX DEBUG mpool pending-ops: align 1, maxelems 4294967295, elemsize 64
[1696660940.939146] [node63:3375441:0] mpool.c:138 UCX DEBUG mpool rc_verbs_short_desc: align 64, maxelems 4294967295, elemsize 200
[1696660940.939623] [node63:3375441:0] ib_iface.c:1052 UCX DEBUG iface=0x1732010: created RC QP 0x1a917 on mlx5_0:1 TX wr:409 sge:5 inl:124 resp:64 RX wr:0 sge:0 resp:64
[1696660940.945048] [node63:3375441:0] mpool.c:282 UCX DEBUG mpool rc_recv_desc: allocated chunk 0x7faad6a00018 of 37748712 bytes with 4537 elements
Using rc_verbs/mlx5_0:1
Waiting for connection...
[1696660984.949828] [node63:3375441:0] ib_iface.c:1052 UCX DEBUG iface=0x1732010: created RC QP 0x1a91b on mlx5_0:1 TX wr:409 sge:5 inl:124 resp:64 RX wr:0 sge:0 resp:64
[1696660984.949850] [node63:3375441:0] rc_ep.c:165 UCX DEBUG created rc ep 0x172ce60
[1696660984.950046] [node63:3375441:0] ib_iface.c:809 UCX DEBUG iface 0x1732010: ah_attr dlid=49152 sl=0 port=1 src_path_bits=0 dgid=::ffff:172.17.29.63 flow_label=0xffffffff sgid_index=3 traffic_class=106
[1696660984.950392] [node63:3375441:0] rc_iface.c:934 UCX DEBUG connected rc qp 0x1a91b on mlx5_0:1/RoCE to lid 49152( 0) sl 0 remote_qp 0x1a91a mtu 1024 timer 18x7 rnr 13x7 rd_atom 16
----- UCT TEST SUCCESS ----
[callback] uct_ep_am_short sent ABCDEFGHIJKLMNO (16 bytes)
---------------------------
----- UCT TEST SUCCESS ----
[main] uct_ep_am_short sent ABCDEFGHIJKLMNO (16 bytes)
---------------------------
[1696660984.951144] [node63:3375441:0] rc_ep.c:185 UCX DEBUG destroy rc ep 0x172ce60
[1696660984.951206] [node63:3375441:a] ib_device.c:468 UCX DEBUG IB Async event on mlx5_0: SRQ-attached QP 0x1a91b was flushed
[1696660984.952967] [node63:3375441:0] mpool.c:194 UCX DEBUG mpool rc_verbs_short_desc destroyed
[1696660984.953322] [node63:3375441:0] mpool.c:194 UCX DEBUG mpool send-ops-mpool destroyed
[1696660984.953327] [node63:3375441:0] mpool.c:194 UCX DEBUG mpool rc_send_desc destroyed
[1696660984.953663] [node63:3375441:0] mpool.c:194 UCX DEBUG mpool rc_recv_desc destroyed
[1696660984.953668] [node63:3375441:0] mpool.c:194 UCX DEBUG mpool pending-ops destroyed
[1696660984.954331] [node63:3375441:0] ib_mlx5dv_md.c:1399 UCX DEBUG mlx5_0: md=0x172d3f0 md->flags=0x3f01a3 flush_rkey=0x9f00
[1696660984.954898] [node63:3375441:0] mpool.c:194 UCX DEBUG mpool devx dbrec destroyed
[1696660984.954907] [node63:3375441:0] ib_device.c:605 UCX DEBUG destroying ib device mlx5_0
[1696660984.954915] [node63:3375441:0] async.c:157 UCX DEBUG removed async handler 0x172b320 [id=4 ref 1] ???() from hash
[1696660984.954919] [node63:3375441:0] async.c:547 UCX DEBUG removing async handler 0x172b320 [id=4 ref 1] ???()
[1696660984.954971] [node63:3375441:0] async.c:172 UCX DEBUG release async handler 0x172b320 [id=4 ref 0] ???()
[root@node63 ucx]#
Client log:
export UCX_LOG_LEVEL=debug
/home/xb/project/ucx/examples/.libs/lt-uct_hello_world -d mlx5_0:1 -t rc_verbs -n 172.17.29.63
[root@node63 ucx]# ./c_uct.sh
[1696660984.917046] [node63:3385663:0] debug.c:1155 UCX DEBUG using signal stack 0x7fc7c70b0000 size 141824
[1696660984.929624] [node63:3385663:0] init.c:121 UCX DEBUG /home/xb/project/ucx/src/ucs/.libs/libucs.so.0 loaded at 0x7fc7c6c4b000
[1696660984.929650] [node63:3385663:0] init.c:122 UCX DEBUG cmd line: /home/xb/project/ucx/examples/.libs/lt-uct_hello_world -d mlx5_0:1 -t rc_verbs -n 172.17.29.63
[1696660984.929662] [node63:3385663:0] module.c:72 UCX DEBUG ucs library path: /home/xb/project/ucx/src/ucs/.libs/libucs.so.0
[1696660984.929668] [node63:3385663:0] module.c:280 UCX DEBUG loading modules for ucs
INFO: UCT_HELLO_WORLD AM function = uct_ep_am_short server = 172.17.29.63 port = 13337
[1696660984.929709] [node63:3385663:0] module.c:280 UCX DEBUG loading modules for uct
[1696660984.930933] [node63:3385663:0] topo.c:792 UCX DEBUG /sys/class/net/ib98-0: PF sysfs path is '/sys/devices/pci0000:97/0000:97:04.0/0000:98:00.0'
[1696660984.930947] [node63:3385663:0] topo.c:240 UCX DEBUG added sys_dev 0 for bus id 98:00.0
[1696660984.930951] [node63:3385663:0] topo.c:475 UCX DEBUG ib98-0: bdf_name 0000:98:00.0 sys_dev 0
[1696660984.931136] [node63:3385663:0] topo.c:792 UCX DEBUG /sys/class/net/ib17-0: PF sysfs path is '/sys/devices/pci0000:15/0000:15:04.0/0000:17:00.0'
[1696660984.931142] [node63:3385663:0] topo.c:240 UCX DEBUG added sys_dev 1 for bus id 17:00.0
[1696660984.931146] [node63:3385663:0] topo.c:475 UCX DEBUG ib17-0: bdf_name 0000:17:00.0 sys_dev 1
[1696660984.931853] [node63:3385663:0] topo.c:787 UCX DEBUG /sys/class/net/lo: sysfs path undetected
[1696660984.931857] [node63:3385663:0] topo.c:479 UCX DEBUG lo: system device unknown
[1696660984.932818] [node63:3385663:0] topo.c:792 UCX DEBUG /sys/class/net/ethA69-0: PF sysfs path is '/sys/devices/pci0000:68/0000:68:02.0/0000:69:00.0'
[1696660984.932824] [node63:3385663:0] topo.c:240 UCX DEBUG added sys_dev 2 for bus id 69:00.0
[1696660984.932830] [node63:3385663:0] topo.c:475 UCX DEBUG ethA69-0: bdf_name 0000:69:00.0 sys_dev 2
[1696660984.932925] [node63:3385663:0] module.c:280 UCX DEBUG loading modules for uct_ib
[1696660984.937340] [node63:3385663:0] topo.c:792 UCX DEBUG /sys/class/infiniband/mlx5_0: PF sysfs path is '/sys/devices/pci0000:15/0000:15:04.0/0000:17:00.0'
[1696660984.937350] [node63:3385663:0] topo.c:475 UCX DEBUG mlx5_0: bdf_name 0000:17:00.0 sys_dev 1
[1696660984.937372] [node63:3385663:0] ib_device.c:487 UCX DEBUG mlx5_0: vendor_id 0x15b3 device_id 4117
[1696660984.937974] [node63:3385663:0] ib_mlx5dv_md.c:1264 UCX DEBUG mlx5_0: crossing_vhca_mkey is not supported
[1696660984.938164] [node63:3385663:0] ib_mlx5dv_md.c:880 UCX DEBUG mlx5_0: ODP is disabled because version 1 is not supported for DevX QP
[1696660984.938384] [node63:3385663:0] async.c:232 UCX DEBUG added async handler 0xa1d320 [id=4 ref 1] ???() to hash
[1696660984.938455] [node63:3385663:0] async.c:494 UCX DEBUG listening to async event fd 4 events 0x1 mode thread_spinlock
[1696660984.938462] [node63:3385663:0] ib_device.c:586 UCX DEBUG initialized device 'mlx5_0' (InfiniBand channel adapter) with 1 ports
[1696660984.938472] [node63:3385663:0] ib_md.c:1115 UCX DEBUG mlx5_0: cuda GPUDirect RDMA is disabled
[1696660984.938478] [node63:3385663:0] ib_md.c:1115 UCX DEBUG mlx5_0: rocm GPUDirect RDMA is disabled
[1696660984.938490] [node63:3385663:0] ib_md.c:1140 UCX DEBUG mlx5_0: dmabuf is supported
[1696660984.938497] [node63:3385663:0] mpool.c:138 UCX DEBUG mpool devx dbrec: align 64, maxelems 4294967295, elemsize 40
[1696660984.938739] [node63:3385663:0] ib_mlx5dv_md.c:1341 UCX DEBUG mlx5_0: opened DEVX md log_max_qp=17
[1696660984.938744] [node63:3385663:0] ib_md.c:1103 UCX DEBUG mlx5_0: relaxed order memory access is disabled
[1696660984.939190] [node63:3385663:0] ib_mlx5dv_md.c:1011 UCX DEBUG created indirect rkey 0xae00 for remote flush
[1696660984.939194] [node63:3385663:0] ib_md.c:1054 UCX DEBUG mlx5_0: md open by 'uct_ib_mlx5_devx_md_ops' is successful
[1696660984.939228] [node63:3385663:0] ib_device.c:1052 UCX DEBUG no compatible IB ports found for flags 0xc4
[1696660984.939233] [node63:3385663:0] uct_md.c:97 UCX DEBUG failed to query dc_mlx5 resources: No such device
[1696660984.940832] [node63:3385663:0] ib_iface.c:927 UCX DEBUG using pkey[0] 0xffff on mlx5_0:1/RoCE
[1696660984.940891] [node63:3385663:0] ib_device.c:916 UCX DEBUG mlx5_0:1 using gid_index 3
[1696660984.941917] [node63:3385663:0] ib_iface.c:1453 UCX DEBUG created uct_ib_iface_t headroom_ofs 12 payload_ofs 16 hdr_ofs 15 data_sz 8256
[1696660984.941951] [node63:3385663:0] mpool.c:138 UCX DEBUG mpool rc_recv_desc: align 64, maxelems 4294967295, elemsize 8279
[1696660984.941955] [node63:3385663:0] mpool.c:138 UCX DEBUG mpool rc_send_desc: align 64, maxelems 4294967295, elemsize 8328
[1696660984.942041] [node63:3385663:0] mpool.c:138 UCX DEBUG mpool send-ops-mpool: align 64, maxelems 4294967295, elemsize 56
[1696660984.942491] [node63:3385663:0] mpool.c:138 UCX DEBUG mpool pending-ops: align 1, maxelems 4294967295, elemsize 64
[1696660984.942502] [node63:3385663:0] mpool.c:138 UCX DEBUG mpool rc_verbs_short_desc: align 64, maxelems 4294967295, elemsize 200
[1696660984.942986] [node63:3385663:0] ib_iface.c:1052 UCX DEBUG iface=0xa24010: created RC QP 0x1a919 on mlx5_0:1 TX wr:409 sge:5 inl:124 resp:64 RX wr:0 sge:0 resp:64
[1696660984.948402] [node63:3385663:0] mpool.c:282 UCX DEBUG mpool rc_recv_desc: allocated chunk 0x7fc7c1a00018 of 37748712 bytes with 4537 elements
Using rc_verbs/mlx5_0:1
[1696660984.949740] [node63:3385663:0] ib_iface.c:1052 UCX DEBUG iface=0xa24010: created RC QP 0x1a91a on mlx5_0:1 TX wr:409 sge:5 inl:124 resp:64 RX wr:0 sge:0 resp:64
[1696660984.949767] [node63:3385663:0] rc_ep.c:165 UCX DEBUG created rc ep 0xa1ee60
[1696660984.950046] [node63:3385663:0] ib_iface.c:809 UCX DEBUG iface 0xa24010: ah_attr dlid=49152 sl=0 port=1 src_path_bits=0 dgid=::ffff:172.17.29.63 flow_label=0xffffffff sgid_index=3 traffic_class=106
[1696660984.950441] [node63:3385663:0] rc_iface.c:934 UCX DEBUG connected rc qp 0x1a91a on mlx5_0:1/RoCE to lid 49152( 0) sl 0 remote_qp 0x1a91b mtu 1024 timer 18x7 rnr 13x7 rd_atom 16
[1696660984.951064] [node63:3385663:a] ib_device.c:468 UCX DEBUG IB Async event on mlx5_0: SRQ-attached QP 0x1a91a was flushed
[1696660984.951190] [node63:3385663:0] rc_ep.c:185 UCX DEBUG destroy rc ep 0xa1ee60
[1696660984.953127] [node63:3385663:0] mpool.c:194 UCX DEBUG mpool rc_verbs_short_desc destroyed
[1696660984.953408] [node63:3385663:0] mpool.c:194 UCX DEBUG mpool send-ops-mpool destroyed
[1696660984.953414] [node63:3385663:0] mpool.c:194 UCX DEBUG mpool rc_send_desc destroyed
[1696660984.953740] [node63:3385663:0] mpool.c:194 UCX DEBUG mpool rc_recv_desc destroyed
[1696660984.953748] [node63:3385663:0] mpool.c:194 UCX DEBUG mpool pending-ops destroyed
[1696660984.954492] [node63:3385663:0] ib_mlx5dv_md.c:1399 UCX DEBUG mlx5_0: md=0xa1f3f0 md->flags=0x3f01a3 flush_rkey=0xae00
[1696660984.955001] [node63:3385663:0] mpool.c:194 UCX DEBUG mpool devx dbrec destroyed
[1696660984.955012] [node63:3385663:0] ib_device.c:605 UCX DEBUG destroying ib device mlx5_0
[1696660984.955021] [node63:3385663:0] async.c:157 UCX DEBUG removed async handler 0xa1d320 [id=4 ref 1] ???() from hash
[1696660984.955027] [node63:3385663:0] async.c:547 UCX DEBUG removing async handler 0xa1d320 [id=4 ref 1] ???()
[1696660984.955073] [node63:3385663:0] async.c:172 UCX DEBUG release async handler 0xa1d320 [id=4 ref 0] ???()
[root@node63 ucx]#
References
UCT design: https://github.com/openucx/ucx/wiki/UCT-Design
UCT documentation: https://openucx.readthedocs.io/en/master/ucx_features.html
Upstream UCX project: https://github.com/openucx/ucx.git
Xiaobing's annotated fork: https://github.com/ssbandjl/ucx
Author: 晓兵 (Xiaobing)
Blog: https://logread.cn | https://blog.csdn.net/ssbandjl | https://cloud.tencent.com/developer/user/5060293/articles