zpool creation
// Create a zpool
$ modprobe zfs
$ zpool create -f -o ashift=12 -m /sample sample /dev/sdc
$ zfs create -o mountpoint=/sample/fs1 \
    -o atime=off \
    -o canmount=on \
    -o compression=lz4 \
    -o quota=100G \
    -o recordsize=8k \
    -o logbias=throughput \
    sample/fs1
// Manually unmount and remount
$ umount /sample/fs1
$ zfs mount sample/fs1
Getting zpool and zfs properties
// Show the default pool-level properties
$ zpool get all
// Show the properties of the datasets (mounted filesystems) in the pool
$ zfs get all
The bridge between ZFS and the kernel
The ZPL (ZFS POSIX Layer) exposes ZFS to the Linux VFS through a set of operation tables; the superblock operations and the file_system_type registration structure are shown below.
const struct super_operations zpl_super_operations = {
.alloc_inode = zpl_inode_alloc,
.destroy_inode = zpl_inode_destroy,
.dirty_inode = zpl_dirty_inode,
.write_inode = NULL,
.evict_inode = zpl_evict_inode,
.put_super = zpl_put_super,
.sync_fs = zpl_sync_fs,
.statfs = zpl_statfs,
.remount_fs = zpl_remount_fs,
.show_devname = zpl_show_devname,
.show_options = zpl_show_options,
.show_stats = NULL,
};
struct file_system_type zpl_fs_type = {
.owner = THIS_MODULE,
.name = ZFS_DRIVER,
.mount = zpl_mount,
.kill_sb = zpl_kill_sb,
};
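zpl_fs_type only takes effect once it is registered with the Linux VFS. The following is a minimal sketch of what that registration looks like (the init/exit function names here are hypothetical; in OpenZFS the register_filesystem() call happens inside the module's own initialization path):

#include <linux/fs.h>
#include <linux/module.h>

extern struct file_system_type zpl_fs_type;

static int __init zpl_example_init(void)
{
	/* After this call, mount(2) requests for the "zfs" filesystem type
	 * are routed to zpl_mount(), and unmounts end up in zpl_kill_sb(). */
	return register_filesystem(&zpl_fs_type);
}

static void __exit zpl_example_exit(void)
{
	unregister_filesystem(&zpl_fs_type);
}

module_init(zpl_example_init);
module_exit(zpl_example_exit);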
extern const struct inode_operations zpl_inode_operations;
extern const struct inode_operations zpl_dir_inode_operations;
extern const struct inode_operations zpl_symlink_inode_operations;
extern const struct inode_operations zpl_special_inode_operations;
extern dentry_operations_t zpl_dentry_operations;
extern const struct address_space_operations zpl_address_space_operations;
extern const struct file_operations zpl_file_operations;
extern const struct file_operations zpl_dir_file_operations;
/* zpl_super.c */
extern void zpl_prune_sb(int64_t nr_to_scan, void *arg);
extern const struct super_operations zpl_super_operations;
extern const struct export_operations zpl_export_operations;
extern struct file_system_type zpl_fs_type;
const struct inode_operations zpl_inode_operations = {
.setattr = zpl_setattr,
.getattr = zpl_getattr,
#ifdef HAVE_GENERIC_SETXATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.removexattr = generic_removexattr,
#endif
.listxattr = zpl_xattr_list,
#if defined(CONFIG_FS_POSIX_ACL)
#if defined(HAVE_SET_ACL)
.set_acl = zpl_set_acl,
#endif /* HAVE_SET_ACL */
.get_acl = zpl_get_acl,
#endif /* CONFIG_FS_POSIX_ACL */
};
const struct inode_operations zpl_dir_inode_operations = {
.create = zpl_create,
.lookup = zpl_lookup,
.link = zpl_link,
.unlink = zpl_unlink,
.symlink = zpl_symlink,
.mkdir = zpl_mkdir,
.rmdir = zpl_rmdir,
.mknod = zpl_mknod,
#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS)
.rename = zpl_rename2,
#else
.rename = zpl_rename,
#endif
#ifdef HAVE_TMPFILE
.tmpfile = zpl_tmpfile,
#endif
.setattr = zpl_setattr,
.getattr = zpl_getattr,
#ifdef HAVE_GENERIC_SETXATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.removexattr = generic_removexattr,
#endif
.listxattr = zpl_xattr_list,
#if defined(CONFIG_FS_POSIX_ACL)
#if defined(HAVE_SET_ACL)
.set_acl = zpl_set_acl,
#endif /* HAVE_SET_ACL */
.get_acl = zpl_get_acl,
#endif /* CONFIG_FS_POSIX_ACL */
};
const struct inode_operations zpl_symlink_inode_operations = {
#ifdef HAVE_GENERIC_READLINK
.readlink = generic_readlink,
#endif
#if defined(HAVE_GET_LINK_DELAYED) || defined(HAVE_GET_LINK_COOKIE)
.get_link = zpl_get_link,
#elif defined(HAVE_FOLLOW_LINK_COOKIE) || defined(HAVE_FOLLOW_LINK_NAMEIDATA)
.follow_link = zpl_follow_link,
#endif
#if defined(HAVE_PUT_LINK_COOKIE) || defined(HAVE_PUT_LINK_NAMEIDATA)
.put_link = zpl_put_link,
#endif
.setattr = zpl_setattr,
.getattr = zpl_getattr,
#ifdef HAVE_GENERIC_SETXATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.removexattr = generic_removexattr,
#endif
.listxattr = zpl_xattr_list,
};
const struct inode_operations zpl_special_inode_operations = {
.setattr = zpl_setattr,
.getattr = zpl_getattr,
#ifdef HAVE_GENERIC_SETXATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.removexattr = generic_removexattr,
#endif
.listxattr = zpl_xattr_list,
#if defined(CONFIG_FS_POSIX_ACL)
#if defined(HAVE_SET_ACL)
.set_acl = zpl_set_acl,
#endif /* HAVE_SET_ACL */
.get_acl = zpl_get_acl,
#endif /* CONFIG_FS_POSIX_ACL */
};
dentry_operations_t zpl_dentry_operations = {
.d_revalidate = zpl_revalidate,
};
const struct address_space_operations zpl_address_space_operations = {
.readpages = zpl_readpages,
.readpage = zpl_readpage,
.writepage = zpl_writepage,
.writepages = zpl_writepages,
.direct_IO = zpl_direct_IO,
};
const struct file_operations zpl_file_operations = {
.open = zpl_open,
.release = zpl_release,
.llseek = zpl_llseek,
#ifdef HAVE_VFS_RW_ITERATE
#ifdef HAVE_NEW_SYNC_READ
.read = new_sync_read,
.write = new_sync_write,
#endif
.read_iter = zpl_iter_read,
.write_iter = zpl_iter_write,
#ifdef HAVE_VFS_IOV_ITER
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
#endif
#else
.read = do_sync_read,
.write = do_sync_write,
.aio_read = zpl_aio_read,
.aio_write = zpl_aio_write,
#endif
.mmap = zpl_mmap,
.fsync = zpl_fsync,
#ifdef HAVE_FILE_AIO_FSYNC
.aio_fsync = zpl_aio_fsync,
#endif
.fallocate = zpl_fallocate,
.unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = zpl_compat_ioctl,
#endif
};
const struct file_operations zpl_dir_file_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
#if defined(HAVE_VFS_ITERATE_SHARED)
.iterate_shared = zpl_iterate,
#elif defined(HAVE_VFS_ITERATE)
.iterate = zpl_iterate,
#else
.readdir = zpl_readdir,
#endif
.fsync = zpl_fsync,
.unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = zpl_compat_ioctl,
#endif
};
typedef const struct dentry_operations dentry_operations_t;
dentry_operations_t zpl_dops_snapdirs = {
/*
* Auto mounting of snapshots is only supported for 2.6.37 and
* newer kernels. Prior to this kernel the ops->follow_link()
* callback was used as a hack to trigger the mount. The
* resulting vfsmount was then explicitly grafted in to the
* name space. While it might be possible to add compatibility
* code to accomplish this it would require considerable care.
*/
.d_automount = zpl_snapdir_automount,
.d_revalidate = zpl_snapdir_revalidate,
};
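These operation tables only matter once they are attached to an inode. The sketch below is assumed for illustration (the function name is hypothetical, and the real selection logic lives in zfs_znode.c and covers more cases); it shows how the file type of a znode decides which tables the VFS will later call through:

static void zpl_set_inode_ops_sketch(struct inode *ip)
{
	switch (ip->i_mode & S_IFMT) {
	case S_IFREG:	/* regular file */
		ip->i_op = &zpl_inode_operations;
		ip->i_fop = &zpl_file_operations;
		ip->i_mapping->a_ops = &zpl_address_space_operations;
		break;
	case S_IFDIR:	/* directory */
		ip->i_op = &zpl_dir_inode_operations;
		ip->i_fop = &zpl_dir_file_operations;
		break;
	case S_IFLNK:	/* symbolic link */
		ip->i_op = &zpl_symlink_inode_operations;
		break;
	default:	/* block/char devices, fifos, sockets */
		ip->i_op = &zpl_special_inode_operations;
		break;
	}
}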
ZFS I/O (zio) pipeline overview

Basic functions:
- zio services all I/O operations in ZFS
- translates DVAs (data virtual addresses) into the physical locations of blocks on the vdevs
- implements user-visible policies such as dynamic compression, deduplication, encryption and checksums
- implements mirroring and RAID-Z

ZFS file write process analysis

ZFS system architecture
-------------------system calls---------------------------
| | kernel
-------------------
-----------| VFS |------------
| File ------------------- |
| Systems | | |
| ----------------------- |
| | ZFS | | | |
| | --------- | |
| | | ZIO | | |
| | --------- | |
| | | vdevs | | |
| ------ --------- ------ |
| | | |
--------------------------------------------
| |
----------------
| block device |
----------------
| |
------------
| SCSI |
------------
| |
------------
| SAS |
------------
      |
      V
.-----------.
|   disk    |
`-----------'
linux kernel
sys_write: when an application calls write(), the sys_write system call is triggered; the full system call table can be found at https://filippo.io/linux-syscall-table/
vfs_write: the VFS layer's unified write entry point, the common write interface shared by all filesystems
do_sync_write: the synchronous write helper. sys_write, vfs_write and do_sync_write are the kernel's generic write interfaces; do_sync_write exists in older kernels, while 5.x kernels use new_sync_write instead, so some of these helper functions differ between kernel versions. The analysis below follows the Linux kernel 5.x code.
// Definition of the write(2) system call, reached through the libc write() wrapper
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
return ksys_write(fd, buf, count);
}
// Kernel-side handler for the write syscall (abridged)
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
	struct fd f = fdget_pos(fd);		// resolve fd to struct file
	loff_t *ppos = file_ppos(f.file);	// current file position
	ssize_t ret;

	ret = vfs_write(f.file, buf, count, ppos);
	// (error handling and fdput_pos() omitted)
	return ret;
}
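// Between ksys_write() and the filesystem sits vfs_write(), which on 5.x
// kernels reaches the ->write_iter hook through new_sync_write(). The
// version below is an assumed, abridged rendering of fs/read_write.c:
// it wraps the user buffer in an iov_iter and dispatches via call_write_iter().
static ssize_t new_sync_write(struct file *filp, const char __user *buf,
		size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = (ppos ? *ppos : 0);
	iov_iter_init(&iter, WRITE, &iov, 1, len);

	ret = call_write_iter(filp, &kiocb, &iter);
	if (ret > 0 && ppos)
		*ppos = kiocb.ki_pos;
	return ret;
}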
// Call the write_iter hook of the actual filesystem; this is where the generic kernel path ends and the filesystem-specific (ZFS) write code takes over
static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
struct iov_iter *iter)
{
// file->f_op points at the filesystem's struct file_operations; for a file on ZFS this is zpl_file_operations (shown above), so write_iter resolves to zpl_iter_write
return file->f_op->write_iter(kio, iter);
}
zfs kernel
The ZFS write path has two phases: open context and sync context. In the open context phase the system call copies data from user space into ZFS buffers, and ZFS keeps this dirty data cached in the DMU; in the sync context phase ZFS checks whether the dirty data has exceeded the threshold (4G) and, if so, flushes it to disk in batches through zio. The DMU writes data via ZIO, caches the relevant data in the ARC, and notifies the DSL layer so it can track space usage.
The first phase, open context, starts in zfs_write. zfs_write distinguishes between writing a whole block and writing only part of one: for a whole-block write it first takes a range lock on the block, reads it, and associates the modified data with a new buffer; for a partial write it likewise reads first, modifies the affected part of the block, and marks the buffer dirty.
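Before zfs_write is reached, the VFS write_iter hook lands in zpl_iter_write. The sketch below is heavily simplified and assumed for illustration (the real zpl_iter_write in zpl_file.c also performs generic_write_checks, credential hold/release and FSTRANS protection, and its details vary between OpenZFS versions); it shows how the iov_iter is wrapped into the zfs_uio_t that the abridged zfs_write below consumes:

static ssize_t zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
{
	struct inode *ip = kiocb->ki_filp->f_mapping->host;
	ssize_t count = iov_iter_count(from);
	zfs_uio_t uio;
	cred_t *cr = CRED();
	int error;

	// Wrap the iov_iter in the uio abstraction expected by zfs_write()
	zfs_uio_iov_iter_init(&uio, from, kiocb->ki_pos, count, 0);

	error = -zfs_write(ITOZ(ip), &uio, kiocb->ki_filp->f_flags, cr);
	if (error < 0)
		return (error);

	// Advance the file position by however much zfs_write() consumed
	ssize_t wrote = count - zfs_uio_resid(&uio);
	kiocb->ki_pos += wrote;
	return (wrote);
}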
// znode_t is the ZFS counterpart of an inode; zfs_uio_t carries the offset and length of the write
// The function below is abridged.
int zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
{
int error = 0;
ssize_t start_resid = zfs_uio_resid(uio);
ssize_t n = start_resid;
if (n == 0)
return (0);
zfsvfs_t *zfsvfs = ZTOZSB(zp);
const uint64_t max_blksz = zfsvfs->z_max_blksz;
if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EFAULT));
}
const rlim64_t limit = MAXOFFSET_T;
if (woff >= limit) {
zfs_rangelock_exit(lr);
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EFBIG));
}
if (n > limit - woff)
n = limit - woff;
uint64_t end_size = MAX(zp->z_size, woff + n);
zilog_t *zilog = zfsvfs->z_log;
// Split the write into chunks of at most one block
while (n > 0) {
woff = zfs_uio_offset(uio);
arc_buf_t *abuf = NULL;
if (n >= max_blksz && woff >= zp->z_size &&
P2PHASE(woff, max_blksz) == 0 &&
zp->z_blksz == max_blksz) {
size_t cbytes;
abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
max_blksz);
if ((error = zfs_uiocopy(abuf->b_data, max_blksz,
UIO_WRITE, uio, &cbytes))) {
dmu_return_arcbuf(abuf);
break;
}
ASSERT3S(cbytes, ==, max_blksz);
// Create a transaction describing the intended changes so that space can be reserved
dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
DB_DNODE_ENTER(db);
dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
MIN(n, max_blksz));
// Wait for the next available transaction group and check that there is
// enough free space and memory; internally dmu_tx_assign() may throttle
// the writer via dmu_tx_wait() -> dmu_tx_delay() when too much data is dirty
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
if (abuf != NULL)
dmu_return_arcbuf(abuf);
break;
}
if (lr->lr_length == UINT64_MAX) {
uint64_t new_blksz;
if (zp->z_blksz > max_blksz) {
new_blksz = MIN(end_size,
1 << highbit64(zp->z_blksz));
} else {
new_blksz = MIN(end_size, max_blksz);
}
zfs_grow_blocksize(zp, new_blksz, tx);
zfs_rangelock_reduce(lr, woff, n);
}
const ssize_t nbytes =
MIN(n, max_blksz - P2PHASE(woff, max_blksz));
ssize_t tx_bytes;
if (abuf == NULL) {
tx_bytes = zfs_uio_resid(uio);
zfs_uio_fault_disable(uio, B_TRUE);
// Copy the data from the uio into the DMU
error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, nbytes, tx);
zfs_uio_fault_disable(uio, B_FALSE);
#ifdef __linux__
if (error == EFAULT) {
dmu_tx_commit(tx);
if (tx_bytes != zfs_uio_resid(uio))
n -= tx_bytes - zfs_uio_resid(uio);
if (zfs_uio_prefaultpages(MIN(n, max_blksz),
uio)) {
break;
}
continue;
}
#endif
if (error != 0) {
dmu_tx_commit(tx);
break;
}
tx_bytes -= zfs_uio_resid(uio);
} else {
// Hand the pre-filled ARC buffer to the DMU; internally this goes
// dmu_assign_arcbuf_by_dnode() -> dbuf_assign_arcbuf() -> dbuf_dirty(),
// marking the dbuf dirty for the open transaction group
error = dmu_assign_arcbuf_by_dbuf(
    sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
if (error != 0) {
dmu_return_arcbuf(abuf);
dmu_tx_commit(tx);
break;
}
}
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
// Record this write in the ZIL (ZFS intent log)
zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
NULL, NULL);
dmu_tx_commit(tx);
}
if (ioflag & (O_SYNC | O_DSYNC) ||
zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
// Commit the ZIL: the log record must be persisted before the modified data reaches disk
zil_commit(zilog, zp->z_id);
ZFS_EXIT(zfsvfs);
return (0);
}
The second phase is the sync context phase: ZFS starts a kernel thread to synchronize transaction groups; the sync then descends into the DSL layer, then into the DMU layer, and finally ends in the ZIO pipeline.
void txg_sync_start(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;
	// Spawn the per-pool sync thread (remaining arguments omitted)
	tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread, dp, ...);
}

// The sync thread wakes up roughly every zfs_txg_timeout seconds (or when
// a quiesced txg is ready) and pushes one transaction group through sync
static void txg_sync_thread(void *arg)
{
	dsl_pool_t *dp = arg;
	spa_t *spa = dp->dp_spa;

	for (;;) {
		clock_t timeout = zfs_txg_timeout * hz;
		// (waiting for a quiesced transaction group omitted)
		uint64_t txg = dp->dp_tx.tx_quiesced_txg;
		// Sync the dirty datasets of this txg:
		// spa_sync() -> dsl_pool_sync() -> dsl_dataset_sync()
		//   -> dmu_objset_sync() -> zio pipeline
		spa_sync(spa, txg);
		// Run callbacks registered against this txg
		txg_dispatch_callbacks(dp, txg);
	}
}
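As a rough illustration of where the dirty datasets actually enter the zio pipeline, here is an assumed, heavily abridged sketch in the spirit of dsl_pool_sync() (the real function in dsl_pool.c also syncs dirty dirs, the MOS and free lists):

void dsl_pool_sync_sketch(dsl_pool_t *dp, uint64_t txg)
{
	dsl_dataset_t *ds;
	// A transaction handle bound to this txg for the sync work itself
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
	// Root zio that all dataset writes for this txg hang off
	zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);

	// Every dataset dirtied in this txg is synced; dsl_dataset_sync()
	// -> dmu_objset_sync() turns its dirty dbufs into child zios
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL)
		dsl_dataset_sync(ds, zio, tx);

	// Wait for the zio pipeline to finish writing this txg's data
	(void) zio_wait(zio);

	dmu_tx_commit(tx);
}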