From 9b6f015267cb181d3457c85731126e32ae632b2d Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Wed, 26 Feb 2025 00:32:12 +0500 Subject: [PATCH] arc: avoid possible deadlock in arc_read In l2arc_evict(), the config lock may be acquired in reverse order (e.g., first the config lock (writer), then a hash lock) unlike in arc_read() during scenarios like L2ARC device removal. To avoid deadlocks, if the attempt to acquire the config lock (reader) fails in arc_read(), release the hash lock, wait for the config lock, and retry from the beginning. Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #17071 --- cmd/zdb/zdb.c | 4 ++-- include/sys/zio.h | 3 ++- module/zfs/arc.c | 26 ++++++++++++++++++++++---- module/zfs/dsl_scan.c | 2 +- module/zfs/spa.c | 2 +- module/zfs/zio.c | 19 ++++++++++++------- 6 files changed, 40 insertions(+), 16 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 8e3b6972ae04..009072952435 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -8949,7 +8949,7 @@ zdb_read_block(char *thing, spa_t *spa) const blkptr_t *b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); if (zfs_blkptr_verify(spa, b, - BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) { + BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY)) { abd_return_buf_copy(pabd, buf, lsize); borrowed = B_FALSE; buf = lbuf; @@ -8958,7 +8958,7 @@ zdb_read_block(char *thing, spa_t *spa) b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); if (lsize == -1 || zfs_blkptr_verify(spa, b, - BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) { + BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { printf("invalid block pointer at this DVA\n"); goto out; } diff --git a/include/sys/zio.h b/include/sys/zio.h index 5f27ae7541dd..b6f388c7e062 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -544,6 +544,7 @@ enum blk_verify_flag { enum blk_config_flag { BLK_CONFIG_HELD, // SCL_VDEV held for writer BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader + BLK_CONFIG_NEEDED_TRY, // Try with SCL_VDEV for reader BLK_CONFIG_SKIP, // skip checks which require SCL_VDEV }; @@ -660,7 +661,7 @@ extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t); extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); -extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, +extern int zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, enum blk_config_flag blk_config, enum blk_verify_flag blk_verify); /* diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 625406af31b6..66a0bbde5ec3 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5572,6 +5572,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF; arc_buf_t *buf = NULL; int rc = 0; + boolean_t bp_validation = B_FALSE; ASSERT(!embedded_bp || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); @@ -5614,7 +5615,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, * should always be the case since the blkptr is protected by * a checksum. */ - if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP, + if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP, BLK_VERIFY_LOG)) { mutex_exit(hash_lock); rc = SET_ERROR(ECKSUM); @@ -5766,6 +5767,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); + int config_lock; + int error; if (*arc_flags & ARC_FLAG_CACHED_ONLY) { if (hash_lock != NULL) @@ -5774,16 +5777,31 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, goto done; } + if (zio_flags & ZIO_FLAG_CONFIG_WRITER) { + config_lock = BLK_CONFIG_HELD; + } else if (hash_lock != NULL) { + /* + * Prevent lock order reversal + */ + config_lock = BLK_CONFIG_NEEDED_TRY; + } else { + config_lock = BLK_CONFIG_NEEDED; + } + /* * Verify the block pointer contents are reasonable. This * should always be the case since the blkptr is protected by * a checksum. */ - if (!zfs_blkptr_verify(spa, bp, - (zio_flags & ZIO_FLAG_CONFIG_WRITER) ? - BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { + if (!bp_validation && (error = zfs_blkptr_verify(spa, bp, + config_lock, BLK_VERIFY_LOG))) { if (hash_lock != NULL) mutex_exit(hash_lock); + if (error == EBUSY && !zfs_blkptr_verify(spa, bp, + BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { + bp_validation = B_TRUE; + goto top; + } rc = SET_ERROR(ECKSUM); goto done; } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 9d040e146308..2abd11f58f85 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2273,7 +2273,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, DMU_USERUSED_OBJECT, tx); } arc_buf_destroy(buf, &buf); - } else if (!zfs_blkptr_verify(spa, bp, + } else if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { /* * Sanity check the block pointer contents, this is handled diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 1a68a0953565..136142d14225 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -2774,7 +2774,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, * When damaged consider it to be a metadata error since we cannot * trust the BP_GET_TYPE and BP_GET_LEVEL values. */ - if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { + if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { atomic_inc_64(&sle->sle_meta_count); return (0); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 57b2b817d212..4b18f6e1c82e 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1092,7 +1092,7 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, * it only contains known object types, checksum/compression identifiers, * block sizes within the maximum allowed limits, valid DVAs, etc. * - * If everything checks out B_TRUE is returned. The zfs_blkptr_verify + * If everything checks out 0 is returned. The zfs_blkptr_verify * argument controls the behavior when an invalid field is detected. * * Values for blk_verify_flag: @@ -1107,7 +1107,7 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, * BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better * performance */ -boolean_t +int zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, enum blk_config_flag blk_config, enum blk_verify_flag blk_verify) { @@ -1139,7 +1139,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, "blkptr at %px has invalid PSIZE %llu", bp, (longlong_t)BPE_GET_PSIZE(bp)); } - return (errors == 0); + return (errors ? ECKSUM : 0); } if (unlikely(BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, @@ -1157,7 +1157,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, * will be done once the zio is executed in vdev_mirror_map_alloc. */ if (unlikely(!spa->spa_trust_config)) - return (errors == 0); + return (errors ? ECKSUM : 0); switch (blk_config) { case BLK_CONFIG_HELD: @@ -1166,8 +1166,12 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, case BLK_CONFIG_NEEDED: spa_config_enter(spa, SCL_VDEV, bp, RW_READER); break; + case BLK_CONFIG_NEEDED_TRY: + if (!spa_config_tryenter(spa, SCL_VDEV, bp, RW_READER)) + return (EBUSY); + break; case BLK_CONFIG_SKIP: - return (errors == 0); + return (errors ? ECKSUM : 0); default: panic("invalid blk_config %u", blk_config); } @@ -1222,10 +1226,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, bp, i, (longlong_t)offset); } } - if (blk_config == BLK_CONFIG_NEEDED) + if (blk_config == BLK_CONFIG_NEEDED || blk_config == + BLK_CONFIG_NEEDED_TRY) spa_config_exit(spa, SCL_VDEV, bp); - return (errors == 0); + return (errors ? ECKSUM : 0); } boolean_t