DAOS VOS (Versioned Object Storage): Analysis of NVMe Scheduling, Polling, and Watermark Flows

2023-11-18 07:55:24

Glossary

DF (Durable Format): the persistent on-media layout, e.g. the VOS pool: struct vos_pool_df
Extent: a contiguous range/segment
DAV: interfaces exported by the DAOS-internal Allocator for VOS (DAV)
BIO: Blob I/O, formerly EIO (Extent I/O); renamed EIO (Extent I/O) -> BIO (Blob I/O)

BIO/VOS-related data structures

VEA-related data structures

Reference flow for creating a VOS pool

----- vos -----
tgt_vos_create_one
  path_gen /mnt/daos/93c634a2-d122-4914-b4b3-5ba76c696f68/vos-0 -> create the VOS directory/path
  vos_pool_create -> create the VOS pool
    *xs_ctxt = vos_xsctxt_get() -> how is this context initialized?
    uma.uma_id = UMEM_CLASS_PMEM -> set the memory class
    if (flags & VOS_POF_SMALL) -> small pool: also set the exclusive flag (VOS_POF_EXCL)
    pool_lookup(&ukey, &pool) -> look up the pool by key
      hlink = d_uhash_link_lookup(vos_pool_hhash_get(), ukey, NULL)
        vos_tls_get()->vtl_pool_hhash
      *pool = pool_hlink2ptr(hlink)
    vos_pmemobj_create 0600 -rw- (600) only the owner has read/write permission; the corresponding blob operations when creating/connecting/closing the pool, /mnt/daos/daos_sys/sys_db
    vos_pmemobj_create(path, uuid, VOS_POOL_LAYOUT, scm_sz, nvme_sz, wal_sz, flags, &ph)
      struct bio_xs_context	*xs_ctxt = vos_xsctxt_get() -> one NVMe context per xstream
      store.store_type = umempobj_get_backend_type() -> PM is the default backend, daos_md_backend = DAOS_MD_PMEM
      bio_mc_create(xs_ctxt, pool_id, meta_sz, wal_sz, nvme_sz, mc_flags) -> the metadata blobstore must be configured separately (not configured by default)
        bio_blob_create(uuid, xs_ctxt, nvme_sz)
            spdk_bs_get_cluster_size(bbs->bb_bs) -> default cluster size is 1GB
            spdk_blob_opts_init(&bma.bma_opts, sizeof(bma.bma_opts))
            bma.bma_opts.num_clusters = (blob_sz + cluster_sz - 1) / cluster_sz -> number of clusters (rounded up): ceil(4833741823 / 1073741824) = ceil(4.5) = 5
            smd_pool_get_blob
            blob_cp_arg_init(ba)
            bio_bs_hold(bbs) -> increase the blobstore's reference count
            spdk_thread_send_msg(owner_thread(bbs), blob_msg_create, &bma)
                spdk_bs_create_blob_ext(arg->bma_bs, &arg->bma_opts, blob_create_cb, msg_arg) -> create a new blob in the blobstore with the given blob options
                    ABT_eventual_set(ba->bca_eventual, NULL, 0) -> the completion callback sets the result
            blob_wait_completion(xs_ctxt, ba)
            smd_pool_add_tgt(uuid, xs_ctxt->bxc_tgt_id, ba->bca_id, st, blob_sz)
                pool_add_tgt(pool_id, tgt_id, blob_id, TABLE_POOLS[st], blob_sz) -> record the blob for this target in the SMD pool table so it can be looked up next time
            spdk_bs_create_blob_ext
        blob_wait_completion
            xs_poll_completion
                spdk_thread_poll(ctxt->bxc_thread, 0, 0)
                run one iteration of processing on the thread: expired and continuous pollers as well as messages; returns immediately if the thread has already exited
            ABT_eventual_wait(ba->bca_eventual, NULL) -> or wait for the result in the ULT (coroutine)
      bio_mc_open(xs_ctxt, pool_id, mc_flags, &mc) -> smd_pool_get_blob -> pool_get_blob -> look up the blob of this pool
        db_fetch(struct sys_db *db, char *table, d_iov_t *key, d_iov_t *val)
          db_io_init(&io, table, key, val)
          vos_obj_fetch -> vos_obj_fetch_ex
            vos_fetch_begin
              vos_ioc_create
              vos_dth_set
              vos_ts_set_add -> vos_ts_get_negative -> when the subtree does not exist, a negative entry is needed; in that case the entry is identified by a hash value. This looks up the negative entry and allocates it if necessary, resetting te_create_idx to NULL
        __bio_ioctxt_open(&bio_mc->mc_data, xs_ctxt, pool_id, flags, SMD_DEV_TYPE_DATA, data_blobid)
            bio_blob_open(ctxt, false, flags, st, open_blobid)
                ctxt->bic_io_unit = spdk_bs_get_io_unit_size(bbs->bb_bs) -> get the minimum SPDK I/O unit (512 bytes by default)
                bma->bma_async = async -> async disabled (synchronous by default)
                spdk_thread_send_msg(owner_thread(bbs), blob_msg_open, bma)
                    spdk_bs_open_blob(arg->bma_bs, arg->bma_blob_id, blob_open_cb, msg_arg) -> open the blob
      bio_meta_get_attr(mc, &store.stor_size, &store.stor_blk_size, &store.stor_hdr_blks)
      store.stor_ops = &vos_store_ops -> set the umem store operations to the VOS ops table
      pop = umempobj_create(path, layout, UMEMPOBJ_ENABLE_STATS, scm_sz, 0600, &store)
        pop = pmemobj_create(path, layout_name, poolsize, mode) -> create the persistent memory object pool using the SCM size -> PMDK notes: https://blog.csdn.net/Sylvia_Wu51/article/details/117442789, creating a PM pool: https://manpages.debian.org/experimental/libpmemobj-dev/pmemobj_create.3.en.html, see also: "Programming Persistent Memory: A Comprehensive Guide for Developers"
      pmemobj_ctl_set(pop, "stats.enabled", &enabled) -> enable usage statistics for SCM and NVMe
      register_slabs(umm_pool) -> register_slabs(struct umem_pool *ph_p) -> slab_map[] -> register slabs (caches) of different sizes for the memory pool
        __builtin_ctz(used) -> returns the number of consecutive trailing zero bits in the binary representation of the input (counting from the least-significant bit)
        set_slab_desc(ph_p, slab) -> create the slab
            pmemslab.unit_size = slab->unit_size -> 32
            pmemslab.units_per_block = 1000 -> 8158
            pmemobj_ctl_set(pop, "heap.alloc_class.new.desc", &pmemslab) -> register a new allocation class, ref: https://cloud.tencent.com/developer/article/1747314, official API: http://wlemkows.github.io/pmdk/manpages/linux/v1.6/libpmemobj/pmemobj_ctl_get.3
    pool_df = vos_pool_pop2df(ph) -> convert the PM pool handle to the durable format -> umempobj_get_rootptr
      TOID(struct vos_pool_df) pool_df
      pool_df = POBJ_ROOT(pop, struct vos_pool_df) -> get the root object
        pmemobj_root(PMEMobjpool *pop, size_t size)
    scm_sz = lstat.st_size -> 228MB (240001024 bytes)
    uma.uma_id = umempobj_backend_type2class_id(ph->up_store.store_type)
    umem_class_init(&uma, &umem) -> instantiate a memory class umm from the attributes in uma -> static umem_ops_t	vmem_ops
      umm->umm_ops		= umc->umc_ops -> static umem_ops_t	pmem_ops
      set_offsets(umm) -> compute the required offsets and base address for the pool; zeroed for vmem
        pmemobj_root -> get the root object
        pmemobj_direct -> return a pointer to the root
        umm->umm_base = (uint64_t)root - root_oid.off -> cache the base address of this pool
    umem_tx_begin(&umem, NULL) -> start a transaction
      umm->umm_ops->mo_tx_begin(umm, txd)  .mo_tx_begin		= pmem_tx_begin -> start a PM transaction
        pmemobj_tx_begin(pop, NULL, TX_PARAM_NONE)
        pmemobj_tx_begin(umm->umm_pool, ...) -> variadic arguments
        PMDK transactions: libpmemobj provides transactional semantics for the store
        https://www.cnblogs.com/Kimbing-Ng/p/12738735.html 
        https://pmem.io/pmdk/manpages/linux/v1.4/libpmemobj/libpmemobj.7/  
        https://manpages.debian.org/experimental/libpmemobj-dev/pmemobj_tx_begin.3.en.html
        https://my.oschina.net/fileoptions/blog/1629405
    umem_tx_add_ptr(&umem, pool_df, sizeof(*pool_df))
      mo_tx_add_ptr -> pmem_tx_add_ptr
        pmemobj_tx_add_range_direct(ptr, size) -> add a memory range to the transaction
    memset(pool_df, 0, sizeof(*pool_df))
    dbtree_create_inplace(VOS_BTR_CONT_TABLE, ...) -> create a btree, header file: btree.h
      dbtree_create_inplace_ex
        btr_context_create
          btr_class_init
            umem_class_init(uma, &tins->ti_umm)
            tins->ti_ops = tc->tc_ops -> initialized from the registered btree class: set the flags and the ops table
          btr_context_set_depth
        btr_tx_tree_init
          btr_tx_begin(tcx)
          btr_tree_init(tcx, root)
            btr_root_init(tcx, root, true)
              btr_root_tx_add(tcx)
          btr_tx_end(tcx, rc)
        btr_tcx2hdl
          hdl.cookie = (uint64_t)tcx
    dbtree_close(hdl) -> drop the reference
    pool_df->pd_magic	= POOL_DF_MAGIC -> set the VOS pool attributes
    gc_init_pool(&umem, pool_df) -> initialize VOS garbage collection (GC) for the pool
      GC_MAX = 4
      struct vos_gc_bin_df *bin = &pd->pd_gc_bins[i]
      bin->bin_bag_size  = gc_bag_size -> bag size = 250
      bag_id = umem_zalloc(umm, size)
        mo_tx_alloc -> pmem_tx_alloc (pmemobj_tx_xalloc allocates a new object) | vmem_alloc (calloc|malloc)
    umem_tx_commit(&umem) | umem_tx_abort(&umem, rc)
      mo_tx_commit -> commit the transaction -> pmem_tx_commit
        pmemobj_tx_commit
        pmemobj_tx_end
    bio_nvme_configured // the following applies only to pools configured with an NVMe size; pure-SCM VOS pools skip this logic
    // Format SPDK blob header
    blob_hdr.bbh_blk_sz = VOS_BLK_SZ -> block size is 4K
    vea_format(&umem, vos_txd_get(flags & VOS_POF_SYSDB), &pool_df->pd_vea_df, VOS_BLK_SZ, VOS_BLOB_HDR_BLKS, nvme_sz, vos_blob_format_cb, &blob_hdr, false) -> format the allocator (Versioned Extent Allocator metadata)
        erase_md(umem, md) -> if the magic is present and forced reformat is allowed, destroy the existing metadata
            dbtree_open_inplace
            dbtree_destroy
        (blk_sz && ((blk_sz % VEA_BLK_SZ) != 0 || blk_sz > (1U << 20))) -> the block size must be a multiple of 4K and no larger than 1MB
        rc = cb(cb_data) -> vos_blob_format_cb
        umem_tx_begin(umem, txd)
        umem_tx_add_ptr(umem, md, sizeof(*md))
        dbtree_create_inplace
        dbtree_update(free_btr, &key, &val)
        ...
        return rc ? umem_tx_abort(umem, rc) : umem_tx_commit(umem) -> commit, or abort/roll back the transaction
            pmemobj_tx_commit()
            pmemobj_tx_end()
    pool_open(ph, pool_df, uuid, flags, poh) // open the pool
      pool_alloc(uuid, &pool)
        d_uhash_ulink_init(&pool->vp_hlink, &pool_uuid_hops) uuid hash tab
        D_INIT_LIST_HEAD(&pool->vp_gc_link)
        D_INIT_LIST_HEAD(&pool->vp_gc_cont)
      bio_ioctxt_open(&pool->vp_dummy_ioctxt, vos_xsctxt_get(), pool->vp_id, true)
        __bio_ioctxt_open(pctxt, xs_ctxt, uuid, 0, SMD_DEV_TYPE_DATA, SPDK_BLOBID_INVALID)
      vos_register_slabs(uma) -> register 7 slabs (this logic has been removed in the latest version)
        set_slab_prop(i, slab) -> slab->unit_size: the number of bytes in a single allocation unit. A single allocation can span at most 64 units (or 1 unit when there is no header). If an allocation class with a given unit size is forced to handle a larger size, multiple units are used. For example, an allocation class with a compact header and a 128-byte unit size will, for a 200-byte request, create a memory block of 256 bytes spanning two units; the usable size of that allocation is 240 bytes: 2 * 128 - 16 (header)
          vos_tree_get_overhead(0, tclass, 0, &ovhd) -> returns constants that can be used to estimate the metadata overhead of the persistent-memory disk format
            evt_overhead_get
            dbtree_overhead_get
              hkey_size = btr_hkey_size_const(ops, ofeat) -> looked up from the tree class: enum vos_tree_class
                size = ops->to_hkey_size() oi_hkey_size
              ovhd->to_record_msize = ops->to_rec_msize(alloc_overhead) oi_rec_msize
          pmemobj_ctl_set heap.alloc_class.new.desc -> programmatically execute a write ctl query
      umem_class_init
      dbtree_open_inplace_ex
        btr_context_create
        btr_tcx2hdl
      vos_xsctxt_get
      bio_ioctxt_open -> open the per-VOS I/O context
        D_INIT_LIST_HEAD(&ctxt->bic_link)
        bio_bs_hold -> executed only when NVMe is present
        bio_blob_open(struct bio_io_context *ctxt, bool async, enum bio_mc_flags flags, enum smd_dev_type st, spdk_blob_id open_blobid)
            ctxt->bic_blob = ba->bca_blob
      vea_load -> load the space-tracking information from SCM to initialize the in-memory compound index
      vos_dedup_init
        d_hash_table_create(D_HASH_FT_NOLOCK, 13, /* 8k buckets */ NULL, &dedup_hash_ops, &pool->vp_dedup_hash);
      pool_link
        d_uhash_link_insert(vos_pool_hhash_get(pool->vp_sysdb), ukey, NULL, &pool->vp_hlink)
        vos_pool2hdl
      vos_space_sys_init(pool) -> initialize the space reserved inside VOS (system space)
        POOL_SCM_SYS
        get_frag_overhead
        gc_reserve_space(&pool->vp_space_sys[0]) -> reservation for GC
        agg_reserve_space(&pool->vp_space_sys[0]); -> reservation for aggregation
        tiny pool -> no reservation for tiny pools
      gc_add_pool -> attach a pool for GC; this function also pins the pool in the open hash table. If there is nothing left to collect, GC removes the pool from the open hash once the user has closed it
        d_list_add_tail(&pool->vp_gc_link, &tls->vtl_gc_pools) -> add to the list; entries are retrieved with pool = d_list_entry(tls->vtl_gc_pools.next
        vos_gc_run -> GC is executed by other ULTs
          pool = d_list_entry(pools->next, struct vos_pool, vp_gc_link)
          gc_reclaim_pool -> start garbage collection for the pool
      lock_pool_memory
        getrlimit(RLIMIT_MEMLOCK, &rlim) -> get the soft and hard limits, i.e. the maximum number of bytes that may be locked into RAM. RLIM_INFINITY means the resource is unlimited (both in the structure returned by getrlimit() and in the one passed to setrlimit()); in practice there is usually a limit. Printing with the PRIuMAX macro works, and since C99 the j length modifier can also print intmax_t and uintmax_t values:
        printf("Soft limit: %" PRIuMAX " bytes\n", (uintmax_t)cur_bytes);
        printf("Soft limit: %ju bytes\n", (uintmax_t)cur_bytes);
        mlock((void *)pool->vp_umm.umm_base, pool->vp_pool_df->pd_scm_sz) -> lock the memory, guaranteeing that all whole pages mapped in [ADDR, ADDR+LEN) stay resident in RAM (no swapping)
    vos_pmemobj_close(ph)
        umempobj_close(pop)
            pmemobj_close(pop)
        bio_mc_close(store.stor_priv)
            bio_ioctxt_close(bio_mc->mc_data)
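
The blob create/open steps above all follow the same pattern: package the request, hand it to the blobstore's owner SPDK thread with spdk_thread_send_msg(), and block the calling ULT on an Argobots eventual until the SPDK callback fires (in DAOS, blob_wait_completion() may also drive spdk_thread_poll() itself, as in xs_poll_completion). Below is a minimal sketch of that pattern, assuming some other context keeps polling the owner thread; names such as my_blob_msg/my_blob_create are hypothetical, only the SPDK and Argobots calls are real:

/* Hedged sketch of the "send to owner thread, wait on eventual" pattern
 * used by bio_blob_create()/bio_blob_open() in the trace above. */
#include <abt.h>
#include <spdk/thread.h>
#include <spdk/blob.h>

struct my_blob_msg {
	struct spdk_blob_store	*bs;
	spdk_blob_id		 blob_id;
	int			 rc;
	ABT_eventual		 eventual;	/* completion event */
};

static void
my_blob_done(void *cb_arg, spdk_blob_id blobid, int bserrno)
{
	struct my_blob_msg *msg = cb_arg;

	msg->blob_id = blobid;
	msg->rc = bserrno;
	/* wake up the ULT blocked in ABT_eventual_wait() */
	ABT_eventual_set(msg->eventual, NULL, 0);
}

static void
my_blob_create_msg(void *arg)
{
	struct my_blob_msg	*msg = arg;
	struct spdk_blob_opts	 opts;

	/* runs on the blobstore owner thread */
	spdk_blob_opts_init(&opts, sizeof(opts));
	spdk_bs_create_blob_ext(msg->bs, &opts, my_blob_done, msg);
}

static int
my_blob_create(struct spdk_thread *owner, struct spdk_blob_store *bs,
	       spdk_blob_id *blob_id)
{
	struct my_blob_msg	msg = { .bs = bs };
	int			rc;

	rc = ABT_eventual_create(0, &msg.eventual);
	if (rc != ABT_SUCCESS)
		return -1;

	/* hand the request over to the SPDK owner thread ... */
	spdk_thread_send_msg(owner, my_blob_create_msg, &msg);
	/* ... and yield this ULT until the SPDK callback sets the eventual */
	ABT_eventual_wait(msg.eventual, NULL);

	ABT_eventual_free(&msg.eventual);
	*blob_id = msg.blob_id;
	return msg.rc;
}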

rsvc: replicated service
Architecture: DAOS VOS, the DAOS metadata store, the core and cornerstone of DAOS
[ pool_svc, cont_svc, ... ]
[ ds_rsvc ]
[                rdb                ]
[ raft ]
[                vos                ]
umm
pmdk/mem/bio/vea

VOS-related parameters and flow snippets

DAOS I/O record extent (recx) list, with epochs
struct daos_recx_ep_list {
	/** #valid items in re_items array */
	uint32_t		 re_nr;
	/** #total items (capacity) in re_items array */
	uint32_t		 re_total;
	/** recovery from snapshot flag */
	bool			 re_snapshot;
	/** epoch valid flag, re_items' re_ep can be ignored when it is false */
	bool			 re_ep_valid;
	struct daos_recx_ep	*re_items;
};
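
re_nr counts the valid entries while re_total is the allocated capacity of re_items. A hedged sketch of the usual append-with-grow handling of such a pair, assuming the full definition of struct daos_recx_ep from the DAOS headers is in scope; recx_ep_list_append() is a hypothetical helper, not the DAOS API:

/* Hedged sketch: appending to a daos_recx_ep_list, growing re_items
 * (capacity re_total) as needed and bumping the valid count re_nr. */
#include <stdlib.h>

static int
recx_ep_list_append(struct daos_recx_ep_list *list, const struct daos_recx_ep *item)
{
	if (list->re_nr == list->re_total) {
		/* grow the capacity, e.g. by doubling */
		uint32_t		 new_total = list->re_total ? list->re_total * 2 : 8;
		struct daos_recx_ep	*items;

		items = realloc(list->re_items, sizeof(*items) * new_total);
		if (items == NULL)
			return -1;
		list->re_items = items;
		list->re_total = new_total;
	}
	/* re_nr is the number of valid entries in re_items */
	list->re_items[list->re_nr++] = *item;
	return 0;
}
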
DAOS-2448 vos: Add shadow extent/epoch for EC degraded fetch (#2108). Adds VOS-level shadow handling for EC degraded fetch, used as follows:
1. The client sends the degraded fetch request to the parity target.
2. On the parity server, the parity space is queried and the existing parity extents are taken as the shadow. The data space is then fetched together with the shadow list obtained in the previous step; the data covered by the shadow is returned, and recx_list@shadow is sent back in the reply.
3. If recx_list@shadow is non-empty, the client needs to recover/rebuild the lost data.


Block I/O descriptor
struct bio_desc {
    ...
    struct bio_sglist	 bd_sgls[0];
};
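
bd_sgls[0] is a trailing zero-length array: the descriptor and its scatter-gather lists are carved out of a single allocation sized by the SGL count. A minimal sketch of that allocation pattern with illustrative stand-in types (my_desc/my_sgl), not the real bio_desc layout:

/* Hedged sketch of allocating a struct with a trailing zero-length array,
 * as bio_desc does with bd_sgls[0]. */
#include <stdlib.h>

struct my_sgl {
	unsigned int	 nr;	/* number of iovs */
	void		*iovs;
};

struct my_desc {
	unsigned int	 sgl_cnt;	/* how many entries follow the header */
	struct my_sgl	 sgls[0];	/* storage allocated right after the header */
};

static struct my_desc *
my_desc_alloc(unsigned int sgl_cnt)
{
	struct my_desc *desc;

	/* one allocation covers the header plus all trailing sgls */
	desc = calloc(1, sizeof(*desc) + sizeof(desc->sgls[0]) * sgl_cnt);
	if (desc != NULL)
		desc->sgl_cnt = sgl_cnt;
	return desc;
}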

I/O context associated with each VOS instance
/* Per VOS instance I/O context */
struct bio_io_context {
	d_list_t		 bic_link; /* link to bxb_io_ctxts */
	struct spdk_blob	*bic_blob; /* SPDK blob */
	struct bio_xs_blobstore	*bic_xs_blobstore;
	struct bio_xs_context	*bic_xs_ctxt;
	uint32_t		 bic_inflight_dmas;
	uint32_t		 bic_io_unit;
	uuid_t			 bic_pool_id;
	unsigned int		 bic_opening:1,
				 bic_closing:1,
				 bic_dummy:1;
};



Memory module: src/common/mem.c

Memory class types: 
typedef enum {
	/** volatile memory */
	UMEM_CLASS_VMEM,
	/** persistent memory */
	UMEM_CLASS_PMEM,
	/** persistent memory but ignore PMDK snapshot */
	UMEM_CLASS_PMEM_NO_SNAP,
	/** blob backed memory */
	UMEM_CLASS_BMEM,
	/** ad-hoc memory */
	UMEM_CLASS_ADMEM,
	/** unknown */
	UMEM_CLASS_UNKNOWN,
} umem_class_id_t;


In-memory metadata context:
/* In-memory Meta context, exported as opaque data structure */
struct bio_meta_context {
	struct bio_io_context	*mc_data;	/* Data blob I/O context */
	struct bio_io_context	*mc_meta;	/* Meta blob I/O context */
	struct bio_io_context	*mc_wal;	/* WAL blob I/O context */
	struct meta_header	 mc_meta_hdr;	/* Meta blob header */
	struct wal_super_info	 mc_wal_info;	/* WAL blob super information */
	struct hash_ft		*mc_csum_algo;
	void			*mc_csum_ctx;
};

The global default blobstore cluster size is 1GB
nvme_glb.bd_bs_opts.cluster_sz = (1UL << 30);	/* 1GB */
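
A blob's size is rounded up to whole clusters, which is what the num_clusters computation in bio_blob_create() above does; e.g. a 4833741823-byte blob needs ceil(4833741823 / 2^30) = 5 clusters. A small sketch of that rounding:

/* Hedged sketch: round a blob size up to whole blobstore clusters,
 * mirroring the bma_opts.num_clusters computation in the trace. */
#include <stdint.h>
#include <stdio.h>

static uint64_t
blob_num_clusters(uint64_t blob_sz, uint64_t cluster_sz)
{
	/* ceiling division: a partial cluster still consumes a full cluster */
	return (blob_sz + cluster_sz - 1) / cluster_sz;
}

int main(void)
{
	uint64_t cluster_sz = 1ULL << 30;	/* 1GB default */

	/* e.g. a ~4.5GB blob occupies 5 clusters */
	printf("%lu\n", (unsigned long)blob_num_clusters(4833741823ULL, cluster_sz));
	return 0;
}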

The maximum number of in-flight blob I/Os per I/O channel is 4096, used when initializing SPDK
/* Max in-flight blob IOs per io channel */
#define BIO_BS_MAX_CHANNEL_OPS	(4096)

When more than 2048 blob I/Os are queued on an I/O channel, schedule an NVMe poll
/* Schedule a NVMe poll when so many blob IOs queued for an io channel */
#define BIO_BS_POLL_WATERMARK	(2048) -> bio_need_nvme_poll
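
This watermark feeds the poll decision shown in the sched_run/bio_need_nvme_poll trace below: once the queued blob I/O count passes it, the scheduler inserts an extra NVMe poll. A simplified sketch of that check; only BIO_BS_POLL_WATERMARK (defined above) and the bxb_blob_rw counter come from the source, the rest is illustrative:

/* Hedged sketch of the poll-watermark check. */
#include <stdbool.h>
#include <stdint.h>

struct my_xs_blobstore {
	uint32_t	bxb_blob_rw;	/* queued/in-flight blob read-write I/Os */
};

static bool
my_need_nvme_poll(const struct my_xs_blobstore *bxb)
{
	/* schedule an extra NVMe poll once the queue passes the watermark */
	return bxb != NULL && bxb->bxb_blob_rw > BIO_BS_POLL_WATERMARK;
}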

Recent optimization of the ULT (coroutine) scheduler:
DAOS-3745 iosrv: Improve server scheduler (#1646). Improvements to the iosrv scheduler:
- Redefine the ABT pools by ULT job type; ULTs with the same priority are essentially placed in the same ABT pool;
- In the original scheduler, rebuild ULTs always had a 30% chance of being scheduled regardless of the current I/O workload; this patch fixes that;
- The scheduler can now control the frequency of network and NVMe polling, and dynamically throttle certain ULT types based on request/I/O statistics or space pressure;
Code cleanup:
- Move the scheduler code into sched.c;
- Move the thread collective & ULT creation code into ult.c;
- Move the ABT pool definitions from daos_server.h to srv_internal.h;
TODO:
- Change dss_srv_handler() into a pure network-polling ULT;
- Adjust the hardware polling frequency based on request/I/O statistics;
- Throttle I/O ULTs under space pressure;

...
dss_sched_init(struct dss_xstream *dx)
    .run	= sched_run, -> sched_run(ABT_sched sched)
        while (1)
            unit = sched_pop_net_poll(data, pool)
            unit = sched_pop_nvme_poll(data, pool)
                if (!need_nvme_poll(dx, cycle)) -> need_nvme_poll(struct dss_xstream *dx, struct sched_cycle *cycle)
                    return bio_need_nvme_poll(dmi->dmi_nvme_ctxt)
                       if (bxb && bxb->bxb_blob_rw > BIO_BS_POLL_WATERMARK) -> if the number of in-flight blob read/write I/Os on the blobstore of the current xstream exceeds 2048, start NVMe polling
                ret = ABT_pool_pop(pool, &unit)    
            ...

When queued blob I/Os exceed 4000, stop issuing new blob I/Os
/* Stop issuing new IO when queued blob IOs reach a threshold */
#define BIO_BS_STOP_WATERMARK	(4000) -> drain_inflight_ios

...
nvme_rw(struct bio_desc *biod, struct bio_rsrvd_region *rg) -> read/write NVMe
    while (pg_cnt > 0)
        drain_inflight_ios(xs_ctxt, bxb) -> before issuing a read/write I/O, if more than 4000 blob I/Os are queued, drive SPDK polling or periodically yield the CPU until the queue drops below the watermark
            do {
                if (ctxt->bxc_self_polling)
                    spdk_thread_poll(ctxt->bxc_thread, 0, 0);
                else
                    bio_yield(NULL);
            } while (bxb->bxb_blob_rw >= BIO_BS_STOP_WATERMARK);
...


Meta blob header:
/* Meta blob header */
struct meta_header {
	uint32_t	mh_magic;
	uint32_t	mh_version;
	uuid_t		mh_meta_devid;		/* Meta SSD device ID */
	uuid_t		mh_wal_devid;		/* WAL SSD device ID */
	uuid_t		mh_data_devid;		/* Data SSD device ID */
	uint64_t	mh_meta_blobid;		/* Meta blob ID */
	uint64_t	mh_wal_blobid;		/* WAL blob ID */
	uint64_t	mh_data_blobid;		/* Data blob ID */
	uint32_t	mh_blk_bytes;		/* Block size for meta, in bytes */
	uint32_t	mh_hdr_blks;		/* Meta blob header size, in blocks */
	uint64_t	mh_tot_blks;		/* Meta blob capacity, in blocks */
	uint32_t	mh_vos_id;		/* Associated per-engine target ID */
	uint32_t	mh_flags;		/* Meta header flags */
	uint32_t	mh_padding[5];		/* Reserved */
	uint32_t	mh_csum;		/* Checksum of this header */
};
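
mh_csum is a trailing checksum over the header; a common convention (assumed here, not confirmed by the source) is to compute it with the checksum field zeroed, using whatever algorithm mc_csum_algo selects. A hedged sketch of verification, with zlib crc32 as a stand-in algorithm and relying on the struct meta_header definition shown above:

/* Hedged sketch: verify a trailing header checksum such as mh_csum. */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <uuid/uuid.h>
#include <zlib.h>

static bool
meta_hdr_csum_ok(const struct meta_header *hdr)
{
	struct meta_header	tmp;
	uint32_t		csum;

	memcpy(&tmp, hdr, sizeof(tmp));
	tmp.mh_csum = 0;	/* exclude the checksum field itself */
	csum = (uint32_t)crc32(0L, (const Bytef *)&tmp, sizeof(tmp));
	return csum == hdr->mh_csum;
}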




VOS store function operations table
struct umem_store_ops vos_store_ops = {
	.so_load	= vos_meta_load,
	.so_read	= vos_meta_readv,
	.so_write	= vos_meta_writev,
	.so_flush_prep	= vos_meta_flush_prep,
	.so_flush_copy	= vos_meta_flush_copy,
	.so_flush_post	= vos_meta_flush_post,
	.so_wal_reserv	= vos_wal_reserve,
	.so_wal_submit	= vos_wal_commit,
	.so_wal_replay	= vos_wal_replay, -> replay the write-ahead log (WAL)
	.so_wal_id_cmp	= vos_wal_id_cmp,
};


Slab (cache) sizes in the memory pool:
#define UMM_SLABS_CNT 16
/** Define common slabs.  We can refine this for 2.4 pools but that is for next patch */
static const int        slab_map[] = {
    0,          /* 32 bytes */
    1,          /* 64 bytes */
    2,          /* 96 bytes */
    3,          /* 128 bytes */
    4,          /* 160 bytes */
    5,          /* 192 bytes */
    6,          /* 224 bytes */
    7,          /* 256 bytes */
    8,          /* 288 bytes */
    -1, 9,      /* 352 bytes */
    10,         /* 384 bytes */
    11,         /* 416 bytes */
    -1, -1, 12, /* 512 bytes */
    -1, 13,     /* 576 bytes (2.2 compatibility only) */
    -1, -1, 14, /* 672 bytes (2.2 compatibility only) */
    -1, -1, 15, /* 768 bytes (2.2 compatibility only) */
};
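
Each non-negative entry at index i in slab_map describes a slab whose unit size is 32 * (i + 1) bytes; register_slabs()/set_slab_desc() in the trace above turn these into PMDK allocation classes through the heap.alloc_class.new.desc ctl. A hedged sketch of registering one such class with libpmemobj; units_per_block follows the value shown in the trace, while the header type is an illustrative choice:

/* Hedged sketch: register one slab-sized allocation class with PMDK,
 * the mechanism behind set_slab_desc() in the trace. */
#include <libpmemobj.h>

static int
register_one_slab(PMEMobjpool *pop, int slab_idx)
{
	struct pobj_alloc_class_desc desc = {
		.unit_size	 = 32 * (slab_idx + 1),	/* 32, 64, 96, ... bytes */
		.units_per_block = 1000,
		.header_type	 = POBJ_HEADER_NONE,
		.alignment	 = 0,
	};

	/* ask libpmemobj to create a new allocation class of this size */
	return pmemobj_ctl_set(pop, "heap.alloc_class.new.desc", &desc);
}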


Durable format for a VOS pool
struct vos_pool_df


Header of the SPDK blob associated with each VOS pool
/**
 * Header for SPDK blob per VOS pool
 */
struct bio_blob_hdr {
	uint32_t	bbh_magic;
	uint32_t	bbh_blk_sz;
	uint32_t	bbh_hdr_sz; /* blocks reserved for blob header */
	uint32_t	bbh_vos_id; /* service xstream id */
	uint64_t	bbh_blob_id;
	uuid_t		bbh_blobstore;
	uuid_t		bbh_pool;
};


/**
 * VOS pool (DRAM): the in-memory VOS pool structure
 */
struct vos_pool

VEA metadata blocks when creating a 4GB pool

Updating btree K/V pairs

VOS topology

Xiaobing (ssbandjl)

Blog: https://logread.cn | https://blog.csdn.net/ssbandjl | https://cloud.tencent.com/developer/user/5060293/articles

DAOS article collection: https://cloud.tencent.com/developer/article/2344030

Xiaobing's tech talk (series: DAOS/RDMA/UCX/Mercury/Libfabric/distributed storage, etc.)

Videos: https://cloud.tencent.com/developer/user/5060293/video

Column: https://cloud.tencent.com/developer/column/99669

Friends interested in high-performance technologies such as DAOS, SPDK, RDMA, and coroutines are welcome to join the DAOS technical discussion group.
