Prime ARC to reduce zil_replay & import times
The time it takes to import a zpool is dominated
by the time it takes to replay the ZFS intent log (ZIL)
when the ZIL is large. The ZIL is replayed serially,
and some operations require a read-modify-write to occur,
for example TX_WRITE and TX_LINK entries. This commit
reduces zil_replay times by reading the ZIL and issuing
arc_read requests in parallel using a taskq prior to
performing the serial zil_replay. Doing so can reduce pool
import times from hours to minutes in cases where the ZIL
has many TX_WRITE and TX_LINK entries. The benefit is
particularly acute when the pool is stored on
high-latency devices, which raises the cost of each
read-modify-write performed during serial zil_replay.

Signed-off-by: Mark Roper <[email protected]>
markroper committed Feb 11, 2025
1 parent b8c73ab commit 776e3b3
Showing 15 changed files with 245 additions and 20 deletions.
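
The zil.c hunks that wire the priming pass into zil_replay() are not among the files rendered below. As a rough sketch of the flow the commit message describes — zil_replay_primed, zil_prime_blk_cb, and zil_prime_rec_cb are assumed names for illustration, not taken from the diff:

/*
 * Hypothetical sketch of the two-pass replay flow. Pass 1 walks the
 * log and dispatches per-record prime callbacks onto the pool's
 * dp_zil_prime_taskq; pass 2 is the unchanged serial replay.
 */
static void
zil_replay_primed(zilog_t *zilog, zil_replay_arg_t *zr, uint64_t claim_txg)
{
	dsl_pool_t *dp = zilog->zl_dmu_pool;

	if (zfs_zil_replay_prime_arc) {
		/* Pass 1: fan reads out in parallel to warm the ARC. */
		(void) zil_parse(zilog, zil_prime_blk_cb, zil_prime_rec_cb,
		    zr, claim_txg, B_TRUE);
		/* Barrier: let every primed read finish before replaying. */
		taskq_wait(dp->dp_zil_prime_taskq);
	}

	/* Pass 2: the existing serial replay. */
	(void) zil_parse(zilog, zil_incr_blks_cb, zil_replay_log_record,
	    zr, claim_txg, B_TRUE);
}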
31 changes: 27 additions & 4 deletions cmd/ztest.c
@@ -1261,6 +1261,29 @@ ztest_record_enospc(const char *s)
ztest_shared->zs_enospc_count++;
}

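/*
 * ztest exercises the new zil_replay() signature but registers no
 * prime callbacks; every entry is NULL.
 */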
static zfs_replay_prime_arc_func_t *ztest_replay_prime_vector[TX_MAX_TYPE] = {
NULL, /* 0 no such transaction type */
NULL, /* TX_CREATE */
NULL, /* TX_MKDIR */
NULL, /* TX_MKXATTR */
NULL, /* TX_SYMLINK */
NULL, /* TX_REMOVE */
NULL, /* TX_RMDIR */
NULL, /* TX_LINK */
NULL, /* TX_RENAME */
NULL, /* TX_WRITE */
NULL, /* TX_TRUNCATE */
NULL, /* TX_SETATTR */
NULL, /* TX_ACL_V0 */
NULL, /* TX_ACL */
NULL, /* TX_CREATE_ACL */
NULL, /* TX_CREATE_ATTR */
NULL, /* TX_CREATE_ACL_ATTR */
NULL, /* TX_MKDIR_ACL */
NULL, /* TX_MKDIR_ATTR */
NULL, /* TX_MKDIR_ACL_ATTR */
NULL, /* TX_WRITE2 */
};

static uint64_t
ztest_get_ashift(void)
{
@@ -3010,7 +3033,7 @@ ztest_zil_remount(ztest_ds_t *zd, uint64_t id)

/* zfsvfs_setup() */
VERIFY3P(zil_open(os, ztest_get_data, NULL), ==, zd->zd_zilog);
zil_replay(os, zd, ztest_replay_vector);
zil_replay(os, zd, ztest_replay_vector, ztest_replay_prime_vector);

(void) pthread_rwlock_unlock(&zd->zd_zilog_lock);
mutex_exit(&zd->zd_dirobj_lock);
@@ -4715,7 +4738,7 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE,
B_TRUE, FTAG, &os) == 0) {
ztest_zd_init(zdtmp, NULL, os);
zil_replay(os, zdtmp, ztest_replay_vector);
zil_replay(os, zdtmp, ztest_replay_vector,
    ztest_replay_prime_vector);

ztest_zd_fini(zdtmp);
dmu_objset_disown(os, B_TRUE, FTAG);
}
@@ -7836,7 +7859,7 @@ ztest_dataset_open(int d)

ztest_dataset_dirobj_verify(zd);

zil_replay(os, zd, ztest_replay_vector);
zil_replay(os, zd, ztest_replay_vector, ztest_replay_prime_vector);

ztest_dataset_dirobj_verify(zd);

@@ -7883,7 +7906,7 @@ ztest_replay_zil_cb(const char *name, void *arg)
zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL);

ztest_zd_init(zdtmp, NULL, os);
zil_replay(os, zdtmp, ztest_replay_vector);
zil_replay(os, zdtmp, ztest_replay_vector, ztest_replay_prime_vector);
ztest_zd_fini(zdtmp);

if (dmu_objset_zil(os)->zl_parse_lr_count != 0 &&
2 changes: 0 additions & 2 deletions include/os/freebsd/zfs/sys/zfs_znode_impl.h
@@ -176,8 +176,6 @@ extern void zfs_tstamp_update_setup_ext(struct znode *,
uint_t, uint64_t [2], uint64_t [2], boolean_t have_tx);
extern void zfs_znode_free(struct znode *);

extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE];

extern int zfs_znode_parent_and_name(struct znode *zp, struct znode **dzpp,
char *buf, uint64_t buflen);

1 change: 0 additions & 1 deletion include/os/linux/zfs/sys/zfs_znode_impl.h
@@ -159,7 +159,6 @@ extern int zfs_inode_alloc(struct super_block *, struct inode **ip);
extern void zfs_inode_destroy(struct inode *);
extern void zfs_mark_inode_dirty(struct inode *);
extern boolean_t zfs_relatime_need_update(const struct inode *);
extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE];

#ifdef __cplusplus
}
2 changes: 2 additions & 0 deletions include/sys/dsl_pool.h
@@ -64,6 +64,7 @@ extern uint_t zfs_dirty_data_max_percent;
extern uint_t zfs_dirty_data_max_max_percent;
extern uint_t zfs_delay_min_dirty_percent;
extern uint64_t zfs_delay_scale;
extern int zfs_zil_replay_prime_arc;

/* These macros are for indexing into the zfs_all_blkstats_t. */
#define DMU_OT_DEFERRED DMU_OT_NONE
@@ -137,6 +138,7 @@ typedef struct dsl_pool {
txg_list_t dp_early_sync_tasks;
taskq_t *dp_sync_taskq;
taskq_t *dp_zil_clean_taskq;
taskq_t *dp_zil_prime_taskq;

/*
* Protects administrative changes (properties, namespace)
14 changes: 13 additions & 1 deletion include/sys/zil.h
@@ -598,9 +598,21 @@ extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data,
zil_sums_t *zil_sums);
extern void zil_close(zilog_t *zilog);

extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE];
typedef void zfs_replay_prime_arc_func_t(void *args);
extern zfs_replay_prime_arc_func_t *const zfs_replay_prime_vector[TX_MAX_TYPE];
typedef struct zil_replay_arg {
zil_replay_func_t *const *zr_replay;
zfs_replay_prime_arc_func_t *const *zr_replay_prime;
void *zr_arg;
boolean_t zr_byteswap;
char *zr_lr;
} zil_replay_arg_t;
extern boolean_t zil_replay(objset_t *os, void *arg,
zil_replay_func_t *const replay_func[TX_MAX_TYPE]);
zil_replay_func_t *const replay_func[TX_MAX_TYPE],
zfs_replay_prime_arc_func_t *const replay_prime_func[TX_MAX_TYPE]);
extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);

extern boolean_t zil_destroy(zilog_t *zilog, boolean_t keep_first);
extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx);

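A hypothetical sketch of how a record could be packaged for the prime taskq with the new zil_replay_arg_t (the real dispatch code is in zil.c, which is not among the hunks shown; allocation sizing must match what each callback frees — zfs_replay_prime_write() below frees sizeof (lr_write_t) + lr_length bytes):

/*
 * Assumed per-record dispatch: each task owns private copies of the
 * replay arg and the log record, and the callback frees them.
 */
static void
zil_dispatch_prime(zilog_t *zilog, const zil_replay_arg_t *zra,
    const lr_t *lr)
{
	uint64_t txtype = lr->lrc_txtype & ~TX_CI;
	zfs_replay_prime_arc_func_t *fn = zra->zr_replay_prime[txtype];
	zil_replay_arg_t *zr;

	if (fn == NULL)
		return;

	zr = kmem_alloc(sizeof (zil_replay_arg_t), KM_SLEEP);
	*zr = *zra;
	zr->zr_lr = vmem_alloc(lr->lrc_reclen, KM_SLEEP);
	memcpy(zr->zr_lr, lr, lr->lrc_reclen);

	(void) taskq_dispatch(zilog->zl_dmu_pool->dp_zil_prime_taskq,
	    fn, zr, TQ_SLEEP);
}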
1 change: 1 addition & 0 deletions include/sys/zvol_impl.h
@@ -65,6 +65,7 @@ extern krwlock_t zvol_state_lock;
extern struct hlist_head *zvol_htable;
#define ZVOL_HT_HEAD(hash) (&zvol_htable[(hash) & (ZVOL_HT_SIZE-1)])
extern zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE];
extern zfs_replay_prime_arc_func_t *const zvol_replay_prime_vector[TX_MAX_TYPE];

extern unsigned int zvol_volmode;
extern unsigned int zvol_inhibit_dev;
9 changes: 9 additions & 0 deletions man/man4/zfs.4
@@ -2397,6 +2397,15 @@ The default value of
.Sy 100%
will create a maximum of one thread per cpu.
.
.It Sy zfs_zil_replay_prime_arc Ns = Ns Sy 0 Ns | Ns 1 Pq int
Controls whether zil_replay first reads the ZIL and issues zfs_read
calls in parallel to prime the ARC before performing the actual replay,
which remains serial.
Priming can reduce zpool import latency on high-latency pools by
eliminating serial read-modify-write cycles during zil_replay.
A value of
.Sy 1
primes the ARC prior to zil_replay; the default value of
.Sy 0
disables priming.
.
.It Sy zil_maxblocksize Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint
This sets the maximum block size used by the ZIL.
On very fragmented pools, lowering this
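The tunable itself is defined in dsl_pool.c (below); its module-parameter registration is not visible in the hunks shown, but would presumably use the standard macro, along these lines:

/* Assumed registration; not part of the excerpted diff. */
ZFS_MODULE_PARAM(zfs, zfs_, zil_replay_prime_arc, INT, ZMOD_RW,
	"Prime the ARC in parallel before serial ZIL replay");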
2 changes: 1 addition & 1 deletion module/os/freebsd/zfs/zfs_vfsops.c
@@ -1123,7 +1123,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
zfsvfs->z_use_namecache = B_FALSE;
zfsvfs->z_replay = B_TRUE;
zil_replay(zfsvfs->z_os, zfsvfs,
zfs_replay_vector);
zfs_replay_vector, zfs_replay_prime_vector);
zfsvfs->z_replay = B_FALSE;
zfsvfs->z_use_namecache = use_nc;
}
3 changes: 2 additions & 1 deletion module/os/freebsd/zfs/zvol_os.c
@@ -1488,7 +1488,8 @@ zvol_os_create_minor(const char *name)
if (zil_replay_disable)
replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
else
replayed_zil = zil_replay(os, zv, zvol_replay_vector);
replayed_zil = zil_replay(os, zv, zvol_replay_vector,
zvol_replay_prime_vector);
}
if (replayed_zil)
zil_close(zv->zv_zilog);
2 changes: 1 addition & 1 deletion module/os/linux/zfs/zfs_vfsops.c
@@ -928,7 +928,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
} else {
zfsvfs->z_replay = B_TRUE;
zil_replay(zfsvfs->z_os, zfsvfs,
zfs_replay_vector);
zfs_replay_vector, zfs_replay_prime_vector);
zfsvfs->z_replay = B_FALSE;
}
}
3 changes: 2 additions & 1 deletion module/os/linux/zfs/zvol_os.c
@@ -1700,7 +1700,8 @@ zvol_os_create_minor(const char *name)
if (zil_replay_disable)
replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
else
replayed_zil = zil_replay(os, zv, zvol_replay_vector);
replayed_zil = zil_replay(os, zv, zvol_replay_vector,
zvol_replay_prime_vector);
}
if (replayed_zil)
zil_close(zv->zv_zilog);
5 changes: 5 additions & 0 deletions module/zfs/dsl_pool.c
@@ -169,6 +169,7 @@ uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
static int zfs_zil_clean_taskq_nthr_pct = 100;
static int zfs_zil_clean_taskq_minalloc = 1024;
static int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
int zfs_zil_replay_prime_arc = 0;

int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
@@ -217,6 +218,10 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
zfs_zil_clean_taskq_maxalloc,
TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);

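/*
 * Taskq used to fan out ARC-priming reads ahead of serial ZIL replay.
 * With TASKQ_THREADS_CPU_PCT, the value 100 means one thread per CPU;
 * the minalloc/maxalloc values bound preallocated task entries.
 */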
dp->dp_zil_prime_taskq = taskq_create("dp_zil_prime_taskq",
100, minclsyspri, boot_ncpus, boot_ncpus * 2,
TASKQ_THREADS_CPU_PCT);

mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);

106 changes: 106 additions & 0 deletions module/zfs/zfs_replay.c
@@ -676,6 +676,39 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
return (error);
}

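/*
 * TX_LINK prime callback: fetch the znodes for the link's directory and
 * target so the serial zfs_replay_link() finds them cached instead of
 * issuing synchronous reads.
 */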
static void
zfs_replay_prime_link(void *args)
{
zil_replay_arg_t *zr = args;
lr_link_t *lr;
zfsvfs_t *zfsvfs;
znode_t *dzp, *zp;
boolean_t byteswap;
int error;

zfsvfs = (zfsvfs_t *)zr->zr_arg;
lr = (lr_link_t *)zr->zr_lr;
byteswap = (boolean_t)zr->zr_byteswap;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));

if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) {
cmn_err(CE_WARN, "Failed to get znode for link dir "
"during replay prime: %d", error);
return;
}

if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
cmn_err(CE_WARN, "Failed to get znode for link "
"during replay prime: %d", error);
zrele(dzp);
return;
}

zrele(zp);
zrele(dzp);
}

static int
do_zfs_replay_rename(zfsvfs_t *zfsvfs, _lr_rename_t *lr, char *sname,
char *tname, uint64_t rflags, vattr_t *wo_vap)
@@ -869,6 +902,52 @@ zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
return (error);
}

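/*
 * TX_WRITE prime callback: issue a throwaway zfs_read() over the write
 * record's tail so the affected block is already in the ARC when the
 * serial zfs_replay_write() performs its read-modify-write.
 */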
static void
zfs_replay_prime_write(void *args)
{
fstrans_cookie_t cookie;
zil_replay_arg_t *zr = args;
zfsvfs_t *zfsvfs;
lr_write_t *lr;
znode_t *zp;
uint64_t length;
uint64_t offset;
char *data;
struct iovec iov;
zfs_uio_t uio;
boolean_t byteswap;

zfsvfs = (zfsvfs_t *)zr->zr_arg;
lr = (lr_write_t *)zr->zr_lr;
byteswap = (boolean_t)zr->zr_byteswap;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));

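/*
 * Only the trailing partial block of the write can force a
 * read-modify-write during replay; whole blocks are simply
 * overwritten, so only that tail region is primed.
 */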
length = lr->lr_length % zfsvfs->z_max_blksz;
if (length == 0)
goto read_task_done;

offset = lr->lr_offset + (lr->lr_length - length);
data = (char *)(lr + 1);
iov.iov_base = (void *)data;
iov.iov_len = length;
zfs_uio_iovec_init(&uio, &iov, 1, offset, UIO_SYSSPACE, length, 0);

if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
goto read_task_done;

cookie = spl_fstrans_mark();
/* Read the trailing block so it lands in the ARC; the data is discarded. */
(void) zfs_read(zp, &uio, /* ioflags */ 0, kcred);
spl_fstrans_unmark(cookie);

zrele(zp);
read_task_done:
/* The task owns its copies of the log record and replay arg; free both. */
vmem_free(zr->zr_lr, sizeof (lr_write_t) + lr->lr_length);
kmem_free(zr, sizeof (zil_replay_arg_t));
}

/*
* TX_WRITE2 are only generated when dmu_sync() returns EALREADY
* meaning the pool block is already being synced. So now that we always write
@@ -1262,3 +1341,30 @@ zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = {
zfs_replay_rename_whiteout, /* TX_RENAME_WHITEOUT */
zfs_replay_clone_range, /* TX_CLONE_RANGE */
};

/*
* Callback vectors for priming the arc for zil records
*/
zfs_replay_prime_arc_func_t *const zfs_replay_prime_vector[TX_MAX_TYPE] = {
NULL, /* no such type */
NULL, /* TX_CREATE */
NULL, /* TX_MKDIR */
NULL, /* TX_MKXATTR */
NULL, /* TX_SYMLINK */
NULL, /* TX_REMOVE */
NULL, /* TX_RMDIR */
zfs_replay_prime_link, /* TX_LINK */
NULL, /* TX_RENAME */
zfs_replay_prime_write, /* TX_WRITE */
NULL, /* TX_TRUNCATE */
NULL, /* TX_SETATTR */
NULL, /* TX_ACL_V0 */
NULL, /* TX_ACL */
NULL, /* TX_CREATE_ACL */
NULL, /* TX_CREATE_ATTR */
NULL, /* TX_CREATE_ACL_ATTR */
NULL, /* TX_MKDIR_ACL */
NULL, /* TX_MKDIR_ATTR */
NULL, /* TX_MKDIR_ACL_ATTR */
NULL, /* TX_WRITE2 */
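/*
 * Initializers for later record types (TX_SETSAXATTR through
 * TX_CLONE_RANGE) are omitted and therefore implicitly NULL.
 */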
};