Skip to content

Commit

Permalink
Make ganging redundancy respect redundant_metadata property
Browse files Browse the repository at this point in the history
The redundant_metadata setting in ZFS allows users to trade resilience for
performance and space savings. This applies to all data and metadata blocks in
ZFS, with one exception: gang blocks. Gang blocks currently just take the
copies property of the IO being ganged and, if it's 1, set it to 2. This
means that we always make at least two copies of a gang header, which is good
for resilience. However, if the users care more about performance than
resilience, their gang blocks will be even more of a penalty than usual.

We add logic to calculate the number of gang header copies directly, and
store it as a separate IO property. This is stored in the IO properties and
not calculated when we decide to gang because by that point we may not have
easy access to the relevant information about what kind of block is being
stored. We also check the redundant_metadata property when doing so, and use
that to decide whether to store an extra copy of the gang headers, compared to
the underlying blocks.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Paul Dagnelie <[email protected]>
  • Loading branch information
Paul Dagnelie committed Feb 25, 2025
1 parent 523e3ad commit e2e7766
Show file tree
Hide file tree
Showing 15 changed files with 329 additions and 20 deletions.
12 changes: 7 additions & 5 deletions cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -2542,12 +2542,14 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,

blkbuf[0] = '\0';

for (i = 0; i < ndvas; i++)
for (i = 0; i < ndvas; i++) {
(void) snprintf(blkbuf + strlen(blkbuf),
buflen - strlen(blkbuf), "%llu:%llx:%llx ",
buflen - strlen(blkbuf), "%llu:%llx:%llx%s ",
(u_longlong_t)DVA_GET_VDEV(&dva[i]),
(u_longlong_t)DVA_GET_OFFSET(&dva[i]),
(u_longlong_t)DVA_GET_ASIZE(&dva[i]));
(u_longlong_t)DVA_GET_ASIZE(&dva[i]),
(DVA_GET_GANG(&dva[i]) ? "G" : ""));
}

if (BP_IS_HOLE(bp)) {
(void) snprintf(blkbuf + strlen(blkbuf),
Expand Down Expand Up @@ -8978,7 +8980,7 @@ zdb_read_block(char *thing, spa_t *spa)

DVA_SET_VDEV(&dva[0], vd->vdev_id);
DVA_SET_OFFSET(&dva[0], offset);
DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
DVA_SET_GANG(&dva[0], 0);
DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));

BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
Expand All @@ -8993,7 +8995,7 @@ zdb_read_block(char *thing, spa_t *spa)
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
zio = zio_root(spa, NULL, NULL, 0);
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

if (vd == vd->vdev_top) {
/*
Expand Down
1 change: 1 addition & 0 deletions include/sys/dbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ typedef struct dbuf_dirty_record {
arc_buf_t *dr_data;
override_states_t dr_override_state;
uint8_t dr_copies;
uint8_t dr_gang_copies;
boolean_t dr_nopwrite;
boolean_t dr_brtwrite;
boolean_t dr_diowrite;
Expand Down
3 changes: 2 additions & 1 deletion include/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,7 @@ typedef struct zio_prop {
uint8_t zp_complevel;
uint8_t zp_level;
uint8_t zp_copies;
uint8_t zp_gang_copies;
dmu_object_type_t zp_type;
boolean_t zp_dedup;
boolean_t zp_dedup_verify;
Expand Down Expand Up @@ -573,7 +574,7 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb);

extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
boolean_t nopwrite, boolean_t brtwrite);
int gang_copies, boolean_t nopwrite, boolean_t brtwrite);

extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);

Expand Down
2 changes: 2 additions & 0 deletions module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -6889,6 +6889,8 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
localprop.zp_nopwrite = B_FALSE;
localprop.zp_copies =
MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
localprop.zp_gang_copies =
MIN(localprop.zp_gang_copies, SPA_DVAS_PER_BP - 1);
}
zio_flags |= ZIO_FLAG_RAW;
} else if (ARC_BUF_COMPRESSED(buf)) {
Expand Down
4 changes: 2 additions & 2 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -5351,8 +5351,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
dr->dt.dl.dr_brtwrite);
dr->dt.dl.dr_copies, dr->dt.dl.dr_gang_copies,
dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite);
mutex_exit(&db->db_mtx);
} else if (data == NULL) {
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
Expand Down
21 changes: 20 additions & 1 deletion module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -1915,6 +1915,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
dr->dt.dl.dr_overridden_by = *zio->io_bp;
dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
dr->dt.dl.dr_gang_copies = zio->io_prop.zp_gang_copies;

/*
* Old style holes are filled with all zeros, whereas
Expand Down Expand Up @@ -2321,6 +2322,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
boolean_t dedup_verify = os->os_dedup_verify;
boolean_t encrypt = B_FALSE;
int copies = os->os_copies;
int gang_copies = os->os_copies;

/*
* We maintain different write policies for each of the following
Expand Down Expand Up @@ -2353,15 +2355,24 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
switch (os->os_redundant_metadata) {
case ZFS_REDUNDANT_METADATA_ALL:
copies++;
gang_copies++;
break;
case ZFS_REDUNDANT_METADATA_MOST:
if (level >= zfs_redundant_metadata_most_ditto_level ||
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
copies++;
if (level + 1 >=
zfs_redundant_metadata_most_ditto_level ||
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
gang_copies++;
break;
case ZFS_REDUNDANT_METADATA_SOME:
if (DMU_OT_IS_CRITICAL(type))
if (DMU_OT_IS_CRITICAL(type)) {
copies++;
gang_copies++;
} else if (DMU_OT_IS_METADATA(type)) {
gang_copies++;
}
break;
case ZFS_REDUNDANT_METADATA_NONE:
break;
Expand Down Expand Up @@ -2435,6 +2446,12 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
ZCHECKSUM_FLAG_NOPWRITE) &&
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);

if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
(os->os_redundant_metadata ==
ZFS_REDUNDANT_METADATA_MOST &&
zfs_redundant_metadata_most_ditto_level <= 1))
gang_copies++;
}

/*
Expand All @@ -2451,6 +2468,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)

if (DMU_OT_IS_ENCRYPTED(type)) {
copies = MIN(copies, SPA_DVAS_PER_BP - 1);
gang_copies = MIN(gang_copies, SPA_DVAS_PER_BP - 1);
nopwrite = B_FALSE;
} else {
dedup = B_FALSE;
Expand All @@ -2468,6 +2486,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
zp->zp_level = level;
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
zp->zp_gang_copies = MIN(gang_copies, spa_max_replication(os->os_spa));
zp->zp_dedup = dedup;
zp->zp_dedup_verify = dedup && dedup_verify;
zp->zp_nopwrite = nopwrite;
Expand Down
3 changes: 3 additions & 0 deletions module/zfs/dmu_recv.c
Original file line number Diff line number Diff line change
Expand Up @@ -2299,6 +2299,9 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
zp.zp_nopwrite = B_FALSE;
zp.zp_copies = MIN(zp.zp_copies,
SPA_DVAS_PER_BP - 1);
zp.zp_gang_copies =
MIN(zp.zp_gang_copies,
SPA_DVAS_PER_BP - 1);
}
zio_flags |= ZIO_FLAG_RAW;
} else if (DRR_WRITE_COMPRESSED(drrw)) {
Expand Down
25 changes: 14 additions & 11 deletions module/zfs/zio.c
Original file line number Diff line number Diff line change
Expand Up @@ -1399,8 +1399,8 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
}

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
boolean_t brtwrite)
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, int gang_copies,
boolean_t nopwrite, boolean_t brtwrite)
{
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
Expand All @@ -1417,6 +1417,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
zio->io_prop.zp_nopwrite = nopwrite;
zio->io_prop.zp_brtwrite = brtwrite;
zio->io_prop.zp_copies = copies;
zio->io_prop.zp_gang_copies = gang_copies;
zio->io_bp_override = bp;
}

Expand Down Expand Up @@ -3125,15 +3126,15 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);

/*
* If one copy was requested, store 2 copies of the GBH, so that we
* can still traverse all the data (e.g. to free or scrub) even if a
* block is damaged. Note that we can't store 3 copies of the GBH in
* all cases, e.g. with encryption, which uses DVA[2] for the IV+salt.
* Store multiple copies of the GBH, so that we can still traverse
* all the data (e.g. to free or scrub) even if a block is damaged.
* This value respects the redundant_metadata property. Note that
* we can't store 3 copies of the GBH in all cases, e.g. with
* encryption, which uses DVA[2] for the IV+salt.
*/
int gbh_copies = copies;
if (gbh_copies == 1) {
gbh_copies = MIN(2, spa_max_replication(spa));
}
int gbh_copies = gio->io_prop.zp_gang_copies;
ASSERT3S(gbh_copies, >, 0);
ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP);

ASSERT(ZIO_HAS_ALLOCATOR(pio));
int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
Expand All @@ -3153,6 +3154,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
* since metaslab_class_throttle_reserve() always allows
* additional reservations for gang blocks.
*/
ASSERT3U(gbh_copies, >=, copies);
VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
pio->io_allocator, pio, flags));
}
Expand Down Expand Up @@ -3215,6 +3217,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
zp.zp_type = zp.zp_storage_type = DMU_OT_NONE;
zp.zp_level = 0;
zp.zp_copies = gio->io_prop.zp_copies;
zp.zp_gang_copies = gio->io_prop.zp_gang_copies;
zp.zp_dedup = B_FALSE;
zp.zp_dedup_verify = B_FALSE;
zp.zp_nopwrite = B_FALSE;
Expand Down Expand Up @@ -3929,7 +3932,7 @@ zio_ddt_write(zio_t *zio)
* grow the DDT entry by to satisfy the request.
*/
zio_prop_t czp = *zp;
czp.zp_copies = need_dvas;
czp.zp_copies = czp.zp_gang_copies = need_dvas;
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, &czp,
zio_ddt_child_write_ready, NULL,
Expand Down
4 changes: 4 additions & 0 deletions tests/runfiles/common.run
Original file line number Diff line number Diff line change
Expand Up @@ -718,6 +718,10 @@ tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg',
'large_dnode_005_pos', 'large_dnode_007_neg', 'large_dnode_009_pos']
tags = ['functional', 'features', 'large_dnode']

[tests/functional/gang_blocks]
tests = ['gang_blocks_redundant']
tags = ['functional', 'gang_blocks']

[tests/functional/grow]
pre =
post =
Expand Down
1 change: 1 addition & 0 deletions tests/zfs-tests/include/tunables.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ MAX_DATASET_NESTING max_dataset_nesting zfs_max_dataset_nesting
MAX_MISSING_TVDS max_missing_tvds zfs_max_missing_tvds
METASLAB_DEBUG_LOAD metaslab.debug_load metaslab_debug_load
METASLAB_FORCE_GANGING metaslab.force_ganging metaslab_force_ganging
METASLAB_FORCE_GANGING_PCT metaslab.force_ganging_pct metaslab_force_ganging_pct
MULTIHOST_FAIL_INTERVALS multihost.fail_intervals zfs_multihost_fail_intervals
MULTIHOST_HISTORY multihost.history zfs_multihost_history
MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_intervals
Expand Down
4 changes: 4 additions & 0 deletions tests/zfs-tests/tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \
functional/events/events.cfg \
functional/events/events_common.kshlib \
functional/fault/fault.cfg \
functional/gang_blocks/gang_blocks.kshlib \
functional/grow/grow.cfg \
functional/history/history.cfg \
functional/history/history_common.kshlib \
Expand Down Expand Up @@ -1552,6 +1553,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/features/large_dnode/large_dnode_008_pos.ksh \
functional/features/large_dnode/large_dnode_009_pos.ksh \
functional/features/large_dnode/setup.ksh \
functional/gang_blocks/cleanup.ksh \
functional/gang_blocks/gang_blocks_redundant.ksh \
functional/gang_blocks/setup.ksh \
functional/grow/grow_pool_001_pos.ksh \
functional/grow/grow_replicas_001_pos.ksh \
functional/history/cleanup.ksh \
Expand Down
31 changes: 31 additions & 0 deletions tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2025 by Klara Inc.
#

. $STF_SUITE/include/libtest.shlib

#
# Cleanup for the gang_blocks test group: restore the forced-ganging
# tunables (presumably saved by this group's setup script -- confirm
# against setup.ksh), then run the suite's default cleanup.
#
for tunable in METASLAB_FORCE_GANGING METASLAB_FORCE_GANGING_PCT; do
	restore_tunable $tunable
done
default_cleanup
Loading

0 comments on commit e2e7766

Please sign in to comment.