Skip to content

Commit

Permalink
Make ganging redundancy respect redundant_metadata property
Browse files Browse the repository at this point in the history
  • Loading branch information
Paul Dagnelie committed Feb 20, 2025
1 parent d7c89cf commit f51bd9b
Show file tree
Hide file tree
Showing 18 changed files with 185 additions and 28 deletions.
1 change: 1 addition & 0 deletions include/sys/dbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ typedef struct dbuf_dirty_record {
arc_buf_t *dr_data;
override_states_t dr_override_state;
uint8_t dr_copies;
uint8_t dr_gang_copies;
boolean_t dr_nopwrite;
boolean_t dr_brtwrite;
boolean_t dr_diowrite;
Expand Down
3 changes: 2 additions & 1 deletion include/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,7 @@ typedef struct zio_prop {
uint8_t zp_complevel;
uint8_t zp_level;
uint8_t zp_copies;
uint8_t zp_gang_copies;
dmu_object_type_t zp_type;
boolean_t zp_dedup;
boolean_t zp_dedup_verify;
Expand Down Expand Up @@ -596,7 +597,7 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb);

extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
boolean_t nopwrite, boolean_t brtwrite);
int gang_copies, boolean_t nopwrite, boolean_t brtwrite);

extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);

Expand Down
2 changes: 2 additions & 0 deletions module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -6889,6 +6889,8 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
localprop.zp_nopwrite = B_FALSE;
localprop.zp_copies =
MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
localprop.zp_gang_copies =
MIN(localprop.zp_gang_copies, SPA_DVAS_PER_BP - 1);
}
zio_flags |= ZIO_FLAG_RAW;
} else if (ARC_BUF_COMPRESSED(buf)) {
Expand Down
4 changes: 2 additions & 2 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -5351,8 +5351,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
dr->dt.dl.dr_brtwrite);
dr->dt.dl.dr_copies, dr->dt.dl.dr_gang_copies,
dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite);
mutex_exit(&db->db_mtx);
} else if (data == NULL) {
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
Expand Down
17 changes: 16 additions & 1 deletion module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -1915,6 +1915,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
dr->dt.dl.dr_overridden_by = *zio->io_bp;
dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
dr->dt.dl.dr_gang_copies = zio->io_prop.zp_gang_copies;

/*
* Old style holes are filled with all zeros, whereas
Expand Down Expand Up @@ -2321,6 +2322,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
boolean_t dedup_verify = os->os_dedup_verify;
boolean_t encrypt = B_FALSE;
int copies = os->os_copies;
int gang_copies = os->os_copies;

/*
* We maintain different write policies for each of the following
Expand Down Expand Up @@ -2353,15 +2355,24 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
switch (os->os_redundant_metadata) {
case ZFS_REDUNDANT_METADATA_ALL:
copies++;
gang_copies++;
break;
case ZFS_REDUNDANT_METADATA_MOST:
if (level >= zfs_redundant_metadata_most_ditto_level ||
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
copies++;
if (level + 1 >=
zfs_redundant_metadata_most_ditto_level ||
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
gang_copies++;
break;
case ZFS_REDUNDANT_METADATA_SOME:
if (DMU_OT_IS_CRITICAL(type))
if (DMU_OT_IS_CRITICAL(type)) {
copies++;
gang_copies++;
} else if (DMU_OT_IS_METADATA(type)) {
gang_copies++;
}
break;
case ZFS_REDUNDANT_METADATA_NONE:
break;
Expand Down Expand Up @@ -2435,6 +2446,9 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
ZCHECKSUM_FLAG_NOPWRITE) &&
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);

if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL)
gang_copies++;
}

/*
Expand Down Expand Up @@ -2468,6 +2482,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
zp->zp_level = level;
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
zp->zp_gang_copies = MIN(gang_copies, spa_max_replication(os->os_spa));
zp->zp_dedup = dedup;
zp->zp_dedup_verify = dedup && dedup_verify;
zp->zp_nopwrite = nopwrite;
Expand Down
3 changes: 3 additions & 0 deletions module/zfs/dmu_recv.c
Original file line number Diff line number Diff line change
Expand Up @@ -2299,6 +2299,9 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
zp.zp_nopwrite = B_FALSE;
zp.zp_copies = MIN(zp.zp_copies,
SPA_DVAS_PER_BP - 1);
zp.zp_gang_copies =
MIN(zp.zp_gang_copies,
SPA_DVAS_PER_BP - 1);
}
zio_flags |= ZIO_FLAG_RAW;
} else if (DRR_WRITE_COMPRESSED(drrw)) {
Expand Down
26 changes: 15 additions & 11 deletions module/zfs/zio.c
Original file line number Diff line number Diff line change
Expand Up @@ -1333,8 +1333,8 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
}

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
boolean_t brtwrite)
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, int gang_copies,
boolean_t nopwrite, boolean_t brtwrite)
{
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
Expand All @@ -1351,6 +1351,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
zio->io_prop.zp_nopwrite = nopwrite;
zio->io_prop.zp_brtwrite = brtwrite;
zio->io_prop.zp_copies = copies;
zio->io_prop.zp_gang_copies = gang_copies;
zio->io_bp_override = bp;
}

Expand Down Expand Up @@ -3083,15 +3084,16 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);

/*
* If one copy was requested, store 2 copies of the GBH, so that we
* can still traverse all the data (e.g. to free or scrub) even if a
* block is damaged. Note that we can't store 3 copies of the GBH in
* all cases, e.g. with encryption, which uses DVA[2] for the IV+salt.
* Store multiple copies of the GBH, so that we can still traverse
* all the data (e.g. to free or scrub) even if a block is damaged.
* This value respects the redundant_metadata property. Note that
* we can't store 3 copies of the GBH in all cases, e.g. with
* encryption, which uses DVA[2] for the IV+salt.
*/
int gbh_copies = copies;
if (gbh_copies == 1) {
gbh_copies = MIN(2, spa_max_replication(spa));
}
int gbh_copies = MIN(gio->io_prop.zp_gang_copies,
MAX(spa_max_replication(spa) - 1, copies));
ASSERT3S(gbh_copies, >, 0);
ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP);

ASSERT(ZIO_HAS_ALLOCATOR(pio));
int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
Expand All @@ -3111,6 +3113,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
* since metaslab_class_throttle_reserve() always allows
* additional reservations for gang blocks.
*/
ASSERT3U(gbh_copies, >=, copies);
VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
pio->io_allocator, pio, flags));
}
Expand Down Expand Up @@ -3191,6 +3194,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
zp.zp_type = zp.zp_storage_type = DMU_OT_NONE;
zp.zp_level = 0;
zp.zp_copies = gio->io_prop.zp_copies;
zp.zp_gang_copies = gio->io_prop.zp_gang_copies;
zp.zp_dedup = B_FALSE;
zp.zp_dedup_verify = B_FALSE;
zp.zp_nopwrite = B_FALSE;
Expand Down Expand Up @@ -3997,7 +4001,7 @@ zio_ddt_write(zio_t *zio)
* grow the DDT entry by to satisfy the request.
*/
zio_prop_t czp = *zp;
czp.zp_copies = need_dvas;
czp.zp_copies = czp.zp_gang_copies = need_dvas;
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, &czp,
zio_ddt_child_write_ready, NULL,
Expand Down
2 changes: 1 addition & 1 deletion tests/runfiles/common.run
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,7 @@ tags = ['functional', 'features', 'large_dnode']
[tests/functional/gang_blocks]
tests = ['gang_blocks_001_pos', 'gang_blocks_dyn_header_pos',
'gang_blocks_dyn_header_neg', 'gang_blocks_dyn_degang',
'gang_blocks_dyn_multi']
'gang_blocks_dyn_multi', 'gang_blocks_redundant']
tags = ['functional', 'gang_blocks']

[tests/functional/grow]
Expand Down
1 change: 1 addition & 0 deletions tests/zfs-tests/tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -1559,6 +1559,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/gang_blocks/gang_blocks_dyn_header_neg.ksh \
functional/gang_blocks/gang_blocks_dyn_header_pos.ksh \
functional/gang_blocks/gang_blocks_dyn_multi.ksh \
functional/gang_blocks/gang_blocks_redundant.ksh \
functional/gang_blocks/setup.ksh \
functional/grow/grow_pool_001_pos.ksh \
functional/grow/grow_replicas_001_pos.ksh \
Expand Down
3 changes: 3 additions & 0 deletions tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,7 @@

. $STF_SUITE/include/libtest.shlib

restore_tunable METASLAB_FORCE_GANGING
restore_tunable METASLAB_FORCE_GANGING_PCT
restore_tunable METASLAB_ALLOW_DEGANGING_PCT
default_cleanup
44 changes: 37 additions & 7 deletions tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib
Original file line number Diff line number Diff line change
Expand Up @@ -27,22 +27,48 @@

#
# Print zdb's -dddddd dump of a single object in a filesystem.
#
# $1 filesystem
# $2 object number
#
function get_object_info
{
	typeset dataset=$1
	typeset objnum=$2

	zdb -dddddd $dataset $objnum
}

#
# $1 filesystem
# $2 path to file
# $3 block filter
#
function get_first_block
function get_blocks_filter
{
typeset fs=$1
typeset path=$2

typeset full_path="$(get_prop mountpoint $fs)/$path"
typeset obj="$(ls -i $full_path | awk '{print $1}')"

typeset l0_line="$(zdb -ddddd $fs $obj | grep L0 | grep -v Dataset | head -n 1)"
echo $l0_line | sed 's/.*L0 \([^ ]*\).*/\1/'
get_object_info $fs $obj | grep $3 | grep -v Dataset
}

return 0
#
# Print only the first line that get_blocks_filter emits for the "L0"
# filter on the given file.
#
# $1 filesystem
# $2 path to file
#
function get_first_block
{
	typeset fs=$1
	typeset file=$2

	get_blocks_filter $fs $file L0 | head -n 1
}

#
# Print the first space-delimited token that follows "L0 " on the line
# returned by get_first_block. Callers treat this token as a DVA.
#
# $1 filesystem
# $2 path to file
#
function get_first_block_dva
{
	typeset fs=$1
	typeset file=$2

	get_first_block $fs $file | sed 's/.*L0 \([^ ]*\).*/\1/'
}

#
# Reads a zdb compressed blkptr line on stdin and prints how many
# whitespace-separated fields sit between the "L<digit> " level marker
# and the "<hex>L" size token, i.e. the number of DVAs listed.
#
function get_num_dvas
{
	# Keep only the text between the level marker and the size token.
	typeset strip_to_dvas='s/.*L[0-9] \(.*\) [a-f0-9]*L.*/\1/'

	sed "$strip_to_dvas" | awk '{print NF}'
}

function check_gang_dva
Expand Down Expand Up @@ -80,9 +106,13 @@ function read_gang_header
zdb -R $pool "${dva%:*}:$size:g" 2>&1 | grep -v "Found vdev:"
}

save_tunable METASLAB_FORCE_GANGING
save_tunable METASLAB_FORCE_GANGING_PCT
save_tunable METASLAB_ALLOW_DEGANGING_PCT
#
# Save the current value of each ganging tunable via save_tunable.
# Each test script calls this before registering its cleanup handler.
#
function preamble
{
	typeset tunable

	for tunable in METASLAB_FORCE_GANGING METASLAB_FORCE_GANGING_PCT \
	    METASLAB_ALLOW_DEGANGING_PCT; do
		save_tunable $tunable
	done
}

function cleanup
{
destroy_pool $TESTPOOL
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

log_assert "Gang blocks behave correctly."

preamble
log_onexit cleanup

log_must zpool create -f -o feature@dynamic_gang_header=disabled $TESTPOOL $DISKS
Expand All @@ -40,7 +41,7 @@ set_tunable32 METASLAB_FORCE_GANGING_PCT 100
path="${mountpoint}/file"
log_must dd if=/dev/urandom of=$path bs=128k count=1
log_must zpool sync $TESTPOOL
first_block=$(get_first_block $TESTPOOL/$TESTFS file)
first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
leaves=$(read_gang_header $TESTPOOL $first_block 200 | grep -v hole | wc -l)
[[ "$leaves" -gt 1 ]] || log_fail "Only one leaf in gang block, should not be possible"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

log_assert "Verify that deganging works correctly"

preamble
log_onexit cleanup

log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS
Expand All @@ -42,7 +43,7 @@ set_tunable32 METASLAB_ALLOW_DEGANGING_PCT 100
path="${mountpoint}/file"
log_must dd if=/dev/urandom of=$path bs=1M count=1
log_must zpool sync $TESTPOOL
first_block=$(get_first_block $TESTPOOL/$TESTFS file)
first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
check_not_gang_dva $first_block

log_must verify_pool $TESTPOOL
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

log_assert "Verify that we don't use large gang headers on small-ashift pools".

preamble
log_onexit cleanup

log_must zpool create -f -o ashift=9 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS
Expand All @@ -40,7 +41,7 @@ set_tunable32 METASLAB_FORCE_GANGING_PCT 100
path="${mountpoint}/file"
log_must dd if=/dev/urandom of=$path bs=1M count=1
log_must zpool sync $TESTPOOL
first_block=$(get_first_block $TESTPOOL/$TESTFS file)
first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
leaves=$(read_gang_header $TESTPOOL $first_block 200)
gangs=$(echo "$leaves" | grep -c gang)
[[ "$gangs" -gt 0 ]] || log_fail "We didn't use a deep gang tree when needed"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

log_assert "Verify that we don't use large gang headers on small-ashift pools".

preamble
log_onexit cleanup

log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS
Expand All @@ -40,7 +41,7 @@ set_tunable32 METASLAB_FORCE_GANGING_PCT 100
path="${mountpoint}/file"
log_must dd if=/dev/urandom of=$path bs=1M count=1
log_must zpool sync $TESTPOOL
first_block=$(get_first_block $TESTPOOL/$TESTFS file)
first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
leaves=$(read_gang_header $TESTPOOL $first_block 1000 | grep -v HOLE)
first_dva=$(echo "$leaves" | head -n 1 | awk '{print $1}' | sed 's/.*<//' | sed 's/>.*//')
check_not_gang_dva $first_dva
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

log_assert "Verify that we can still multi-level gang with large headers."

preamble
log_onexit cleanup

log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS
Expand All @@ -40,7 +41,7 @@ set_tunable32 METASLAB_FORCE_GANGING_PCT 100
path="${mountpoint}/file"
log_must dd if=/dev/urandom of=$path bs=16M count=1
log_must zpool sync $TESTPOOL
first_block=$(get_first_block $TESTPOOL/$TESTFS file)
first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
leaves=$(read_gang_header $TESTPOOL $first_block 200)
gangs=$(echo "$leaves" | grep -c gang)
[[ "$gangs" -gt 0 ]] || log_fail "We didn't use a deep gang tree when needed"
Expand Down
Loading

0 comments on commit f51bd9b

Please sign in to comment.