Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prime arc to reduce zil_replay & import times #17044

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 28 additions & 4 deletions cmd/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -1261,6 +1261,29 @@ ztest_record_enospc(const char *s)
ztest_shared->zs_enospc_count++;
}

/*
 * ARC-priming callbacks for ztest, indexed by ZIL transaction type and
 * handed to zil_replay() alongside ztest_replay_vector.
 *
 * Every slot is NULL: ztest registers no priming callback for any record
 * type.  The labels mirror the slot order used by zfs_replay_prime_vector
 * (including TX_ACL_V0 between TX_SETATTR and TX_ACL) so that a callback
 * added later lands on the intended transaction type.  Slots past
 * TX_WRITE2 are not listed and are therefore zero-initialized (NULL).
 */
static zfs_replay_prime_arc_func_t *ztest_replay_prime_vector[TX_MAX_TYPE] = {
	NULL,			/* 0 no such transaction type */
	NULL,			/* TX_CREATE */
	NULL,			/* TX_MKDIR */
	NULL,			/* TX_MKXATTR */
	NULL,			/* TX_SYMLINK */
	NULL,			/* TX_REMOVE */
	NULL,			/* TX_RMDIR */
	NULL,			/* TX_LINK */
	NULL,			/* TX_RENAME */
	NULL,			/* TX_WRITE */
	NULL,			/* TX_TRUNCATE */
	NULL,			/* TX_SETATTR */
	NULL,			/* TX_ACL_V0 */
	NULL,			/* TX_ACL */
	NULL,			/* TX_CREATE_ACL */
	NULL,			/* TX_CREATE_ATTR */
	NULL,			/* TX_CREATE_ACL_ATTR */
	NULL,			/* TX_MKDIR_ACL */
	NULL,			/* TX_MKDIR_ATTR */
	NULL,			/* TX_MKDIR_ACL_ATTR */
	NULL,			/* TX_WRITE2 */
};

static uint64_t
ztest_get_ashift(void)
{
Expand Down Expand Up @@ -3010,7 +3033,7 @@ ztest_zil_remount(ztest_ds_t *zd, uint64_t id)

/* zfsvfs_setup() */
VERIFY3P(zil_open(os, ztest_get_data, NULL), ==, zd->zd_zilog);
zil_replay(os, zd, ztest_replay_vector);
zil_replay(os, zd, ztest_replay_vector, ztest_replay_prime_vector);

(void) pthread_rwlock_unlock(&zd->zd_zilog_lock);
mutex_exit(&zd->zd_dirobj_lock);
Expand Down Expand Up @@ -4715,7 +4738,8 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE,
B_TRUE, FTAG, &os) == 0) {
ztest_zd_init(zdtmp, NULL, os);
zil_replay(os, zdtmp, ztest_replay_vector);
zil_replay(os, zdtmp, ztest_replay_vector,
ztest_replay_prime_vector);
ztest_zd_fini(zdtmp);
dmu_objset_disown(os, B_TRUE, FTAG);
}
Expand Down Expand Up @@ -7836,7 +7860,7 @@ ztest_dataset_open(int d)

ztest_dataset_dirobj_verify(zd);

zil_replay(os, zd, ztest_replay_vector);
zil_replay(os, zd, ztest_replay_vector, ztest_replay_prime_vector);

ztest_dataset_dirobj_verify(zd);

Expand Down Expand Up @@ -7883,7 +7907,7 @@ ztest_replay_zil_cb(const char *name, void *arg)
zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL);

ztest_zd_init(zdtmp, NULL, os);
zil_replay(os, zdtmp, ztest_replay_vector);
zil_replay(os, zdtmp, ztest_replay_vector, ztest_replay_prime_vector);
ztest_zd_fini(zdtmp);

if (dmu_objset_zil(os)->zl_parse_lr_count != 0 &&
Expand Down
2 changes: 0 additions & 2 deletions include/os/freebsd/zfs/sys/zfs_znode_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,6 @@ extern void zfs_tstamp_update_setup_ext(struct znode *,
uint_t, uint64_t [2], uint64_t [2], boolean_t have_tx);
extern void zfs_znode_free(struct znode *);

extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE];

extern int zfs_znode_parent_and_name(struct znode *zp, struct znode **dzpp,
char *buf, uint64_t buflen);

Expand Down
1 change: 0 additions & 1 deletion include/os/linux/zfs/sys/zfs_znode_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,6 @@ extern int zfs_inode_alloc(struct super_block *, struct inode **ip);
extern void zfs_inode_destroy(struct inode *);
extern void zfs_mark_inode_dirty(struct inode *);
extern boolean_t zfs_relatime_need_update(const struct inode *);
extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE];

#ifdef __cplusplus
}
Expand Down
2 changes: 2 additions & 0 deletions include/sys/dsl_pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ extern uint_t zfs_dirty_data_max_percent;
extern uint_t zfs_dirty_data_max_max_percent;
extern uint_t zfs_delay_min_dirty_percent;
extern uint64_t zfs_delay_scale;
extern int zfs_zil_replay_prime_arc;

/* These macros are for indexing into the zfs_all_blkstats_t. */
#define DMU_OT_DEFERRED DMU_OT_NONE
Expand Down Expand Up @@ -137,6 +138,7 @@ typedef struct dsl_pool {
txg_list_t dp_early_sync_tasks;
taskq_t *dp_sync_taskq;
taskq_t *dp_zil_clean_taskq;
taskq_t *dp_zil_prime_taskq;

/*
* Protects administrative changes (properties, namespace)
Expand Down
14 changes: 13 additions & 1 deletion include/sys/zil.h
Original file line number Diff line number Diff line change
Expand Up @@ -598,9 +598,21 @@ extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data,
zil_sums_t *zil_sums);
extern void zil_close(zilog_t *zilog);

extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE];
typedef void zfs_replay_prime_arc_func_t(void *args);
extern zfs_replay_prime_arc_func_t *const zfs_replay_prime_vector[TX_MAX_TYPE];
/*
 * Argument bundle carried through ZIL replay and handed (as a void *) to
 * the ARC-priming callbacks of type zfs_replay_prime_arc_func_t.
 */
typedef struct zil_replay_arg {
/* Per-transaction-type replay callbacks. */
zil_replay_func_t *const *zr_replay;
/* Per-transaction-type ARC-priming callbacks. */
zfs_replay_prime_arc_func_t *const *zr_replay_prime;
/* Opaque caller context; the zfs priming callbacks cast it to zfsvfs_t *. */
void *zr_arg;
/* When true, the priming callbacks byteswap the log record before use. */
boolean_t zr_byteswap;
/*
 * Log record the callback operates on; callbacks cast it to the
 * record type they handle (e.g. lr_link_t *, lr_write_t *).
 */
char *zr_lr;
} zil_replay_arg_t;
extern boolean_t zil_replay(objset_t *os, void *arg,
zil_replay_func_t *const replay_func[TX_MAX_TYPE]);
zil_replay_func_t *const replay_func[TX_MAX_TYPE],
zfs_replay_prime_arc_func_t *const replay_prime_func[TX_MAX_TYPE]);
extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);

extern boolean_t zil_destroy(zilog_t *zilog, boolean_t keep_first);
extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx);

Expand Down
1 change: 1 addition & 0 deletions include/sys/zvol_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ extern krwlock_t zvol_state_lock;
extern struct hlist_head *zvol_htable;
#define ZVOL_HT_HEAD(hash) (&zvol_htable[(hash) & (ZVOL_HT_SIZE-1)])
extern zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE];
extern zfs_replay_prime_arc_func_t *const zvol_replay_prime_vector[TX_MAX_TYPE];

extern unsigned int zvol_volmode;
extern unsigned int zvol_inhibit_dev;
Expand Down
9 changes: 9 additions & 0 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -2397,6 +2397,15 @@ The default value of
.Sy 100%
will create a maximum of one thread per cpu.
.
.It Sy zfs_zil_replay_prime_arc Ns = Ns Sy 0 Ns | Ns 1 Pq int
Controls whether zil_replay first reads the ZIL and issues zfs_read calls
in parallel to prime the ARC before performing the real ZIL replay,
which is serial.
Priming before replay can reduce zpool import latency
by reducing zil_replay time on high-latency pools.
It does this by eliminating serial read-modify-write cycles in zil_replay.
A value of 1 primes the ARC prior to zil_replay.
The default value of 0 does not prime the ARC prior to zil_replay.
.
.It Sy zil_maxblocksize Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint
This sets the maximum block size used by the ZIL.
On very fragmented pools, lowering this
Expand Down
Binary file added module/.tmp_23387/tmp
Binary file not shown.
2 changes: 1 addition & 1 deletion module/os/freebsd/zfs/zfs_vfsops.c
Original file line number Diff line number Diff line change
Expand Up @@ -1123,7 +1123,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
zfsvfs->z_use_namecache = B_FALSE;
zfsvfs->z_replay = B_TRUE;
zil_replay(zfsvfs->z_os, zfsvfs,
zfs_replay_vector);
zfs_replay_vector, zfs_replay_prime_vector);
zfsvfs->z_replay = B_FALSE;
zfsvfs->z_use_namecache = use_nc;
}
Expand Down
3 changes: 2 additions & 1 deletion module/os/freebsd/zfs/zvol_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -1488,7 +1488,8 @@ zvol_os_create_minor(const char *name)
if (zil_replay_disable)
replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
else
replayed_zil = zil_replay(os, zv, zvol_replay_vector);
replayed_zil = zil_replay(os, zv, zvol_replay_vector,
zvol_replay_prime_vector);
}
if (replayed_zil)
zil_close(zv->zv_zilog);
Expand Down
2 changes: 1 addition & 1 deletion module/os/linux/zfs/zfs_vfsops.c
Original file line number Diff line number Diff line change
Expand Up @@ -928,7 +928,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
} else {
zfsvfs->z_replay = B_TRUE;
zil_replay(zfsvfs->z_os, zfsvfs,
zfs_replay_vector);
zfs_replay_vector, zfs_replay_prime_vector);
zfsvfs->z_replay = B_FALSE;
}
}
Expand Down
3 changes: 2 additions & 1 deletion module/os/linux/zfs/zvol_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -1700,7 +1700,8 @@ zvol_os_create_minor(const char *name)
if (zil_replay_disable)
replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
else
replayed_zil = zil_replay(os, zv, zvol_replay_vector);
replayed_zil = zil_replay(os, zv, zvol_replay_vector,
zvol_replay_prime_vector);
}
if (replayed_zil)
zil_close(zv->zv_zilog);
Expand Down
5 changes: 5 additions & 0 deletions module/zfs/dsl_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
static int zfs_zil_clean_taskq_nthr_pct = 100;
static int zfs_zil_clean_taskq_minalloc = 1024;
static int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
int zfs_zil_replay_prime_arc = 0;

int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
Expand Down Expand Up @@ -217,6 +218,10 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
zfs_zil_clean_taskq_maxalloc,
TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);

dp->dp_zil_prime_taskq = taskq_create("dp_zil_prime_taskq",
100, minclsyspri, boot_ncpus, boot_ncpus * 2,
TASKQ_THREADS_CPU_PCT);

mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);

Expand Down
106 changes: 106 additions & 0 deletions module/zfs/zfs_replay.c
Original file line number Diff line number Diff line change
Expand Up @@ -676,6 +676,39 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
return (error);
}

static void
zfs_replay_prime_link(void *args)
{
zil_replay_arg_t *zr = args;
lr_link_t *lr;
zfsvfs_t *zfsvfs;
znode_t *dzp, *zp;
boolean_t byteswap;
int error;

zfsvfs = (zfsvfs_t *)zr->zr_arg;
lr = (lr_link_t *)zr->zr_lr;
byteswap = (boolean_t)zr->zr_byteswap;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));

if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) {
cmn_err(CE_WARN, "Failed to get znode for link dir "
"during replay prime: %d", error);
return;
}

if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
cmn_err(CE_WARN, "Failed to get znode for link "
"during replay prime: %d", error);
zrele(dzp);
return;
}

zrele(zp);
zrele(dzp);
}

static int
do_zfs_replay_rename(zfsvfs_t *zfsvfs, _lr_rename_t *lr, char *sname,
char *tname, uint64_t rflags, vattr_t *wo_vap)
Expand Down Expand Up @@ -869,6 +902,52 @@ zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
return (error);
}

static void
zfs_replay_prime_write(void *args)
{
fstrans_cookie_t cookie;
zil_replay_arg_t *zr = args;
zfsvfs_t *zfsvfs;
lr_write_t *lr;
znode_t *zp;
uint64_t length;
uint64_t offset;
char *data;
struct iovec iov;
zfs_uio_t uio;
boolean_t byteswap;

zfsvfs = (zfsvfs_t *)zr->zr_arg;
lr = (lr_write_t *)zr->zr_lr;
byteswap = (boolean_t)zr->zr_byteswap;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));

length = lr->lr_length % zfsvfs->z_max_blksz;
if (length == 0)
goto read_task_done;

offset = lr->lr_offset + (lr->lr_length - length);
data = (char *)(lr + 1);
iov.iov_base = (void *)data;
iov.iov_len = length;
zfs_uio_iovec_init(&uio, &iov, 1, offset, UIO_SYSSPACE, length, 0);

if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
goto read_task_done;

cookie = spl_fstrans_mark();
// Call zfs_read with the provided arguments
zfs_read(zp, &uio, /* ioflags */ 0, kcred);
spl_fstrans_unmark(cookie);

// Free the allocated memory
zrele(zp);
read_task_done:
vmem_free(zr->zr_lr, sizeof (lr_write_t) + lr->lr_length);
kmem_free(zr, sizeof (zil_replay_arg_t));
}

/*
* TX_WRITE2 are only generated when dmu_sync() returns EALREADY
* meaning the pool block is already being synced. So now that we always write
Expand Down Expand Up @@ -1262,3 +1341,30 @@ zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = {
zfs_replay_rename_whiteout, /* TX_RENAME_WHITEOUT */
zfs_replay_clone_range, /* TX_CLONE_RANGE */
};

/*
 * Callback vector for priming the ARC ahead of ZIL replay, indexed by
 * ZIL transaction type.
 *
 * A NULL slot means no priming callback is registered for that record
 * type; only TX_LINK and TX_WRITE have one.  Slots after TX_WRITE2 are
 * not listed and are therefore zero-initialized (NULL).
 */
zfs_replay_prime_arc_func_t *const zfs_replay_prime_vector[TX_MAX_TYPE] = {
NULL, /* no such type */
NULL, /* TX_CREATE */
NULL, /* TX_MKDIR */
NULL, /* TX_MKXATTR */
NULL, /* TX_SYMLINK */
NULL, /* TX_REMOVE */
NULL, /* TX_RMDIR */
zfs_replay_prime_link, /* TX_LINK */
NULL, /* TX_RENAME */
zfs_replay_prime_write, /* TX_WRITE */
NULL, /* TX_TRUNCATE */
NULL, /* TX_SETATTR */
NULL, /* TX_ACL_V0 */
NULL, /* TX_ACL */
NULL, /* TX_CREATE_ACL */
NULL, /* TX_CREATE_ATTR */
NULL, /* TX_CREATE_ACL_ATTR */
NULL, /* TX_MKDIR_ACL */
NULL, /* TX_MKDIR_ATTR */
NULL, /* TX_MKDIR_ACL_ATTR */
NULL, /* TX_WRITE2 */
};
Loading
Loading