From 28f9c2acdcd0582eb2282b45c713ff0c974f3d08 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Mon, 17 Feb 2025 01:32:49 +0500 Subject: [PATCH] arc: avoid possible deadlock in arc_read In l2arc_evict(), the config lock may be acquired in reverse order (e.g., first the config lock (writer), then a hash lock) unlike in arc_read() during scenarios like L2ARC device removal. To avoid deadlocks, if the attempt to acquire the config lock (reader) fails in arc_read(), release the hash lock, wait for the config lock, and retry from the beginning. Signed-off-by: Ameer Hamza --- cmd/zdb/zdb.c | 4 ++-- include/sys/zio.h | 3 ++- module/zfs/arc.c | 26 ++++++++++++++++++++++---- module/zfs/dsl_scan.c | 2 +- module/zfs/spa.c | 2 +- module/zfs/zio.c | 19 ++++++++++++------- 6 files changed, 40 insertions(+), 16 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 5e8f282e96c3..fbc92833ebce 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -9043,7 +9043,7 @@ zdb_read_block(char *thing, spa_t *spa) const blkptr_t *b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); if (zfs_blkptr_verify(spa, b, - BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) { + BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY)) { abd_return_buf_copy(pabd, buf, lsize); borrowed = B_FALSE; buf = lbuf; @@ -9052,7 +9052,7 @@ zdb_read_block(char *thing, spa_t *spa) b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); if (lsize == -1 || zfs_blkptr_verify(spa, b, - BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) { + BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { printf("invalid block pointer at this DVA\n"); goto out; } diff --git a/include/sys/zio.h b/include/sys/zio.h index 46f5d68aed4a..68d4e323c474 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -546,6 +546,7 @@ enum blk_verify_flag { enum blk_config_flag { BLK_CONFIG_HELD, // SCL_VDEV held for writer BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader + BLK_CONFIG_NEEDED_TRY, // Try with SCL_VDEV for reader BLK_CONFIG_SKIP, // skip checks which require SCL_VDEV }; @@ -663,7 +664,7 @@ extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t); extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); -extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, +extern int zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, enum blk_config_flag blk_config, enum blk_verify_flag blk_verify); /* diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 1f653d953113..e1251c030569 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5683,6 +5683,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF; arc_buf_t *buf = NULL; int rc = 0; + boolean_t bp_validation = B_FALSE; ASSERT(!embedded_bp || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); @@ -5725,7 +5726,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, * should always be the case since the blkptr is protected by * a checksum. */ - if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP, + if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP, BLK_VERIFY_LOG)) { mutex_exit(hash_lock); rc = SET_ERROR(ECKSUM); @@ -5877,6 +5878,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); + int config_lock; + int error; if (*arc_flags & ARC_FLAG_CACHED_ONLY) { if (hash_lock != NULL) @@ -5885,16 +5888,31 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, goto done; } + if (zio_flags & ZIO_FLAG_CONFIG_WRITER) { + config_lock = BLK_CONFIG_HELD; + } else if (hash_lock != NULL) { + /* + * Prevent lock order reversal + */ + config_lock = BLK_CONFIG_NEEDED_TRY; + } else { + config_lock = BLK_CONFIG_NEEDED; + } + /* * Verify the block pointer contents are reasonable. This * should always be the case since the blkptr is protected by * a checksum. */ - if (!zfs_blkptr_verify(spa, bp, - (zio_flags & ZIO_FLAG_CONFIG_WRITER) ? - BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { + if (!bp_validation && (error = zfs_blkptr_verify(spa, bp, + config_lock, BLK_VERIFY_LOG))) { if (hash_lock != NULL) mutex_exit(hash_lock); + if (error == EBUSY && !zfs_blkptr_verify(spa, bp, + BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { + bp_validation = B_TRUE; + goto top; + } rc = SET_ERROR(ECKSUM); goto done; } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 5977f8c82b45..35b56420511a 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2305,7 +2305,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, DMU_USERUSED_OBJECT, tx); } arc_buf_destroy(buf, &buf); - } else if (!zfs_blkptr_verify(spa, bp, + } else if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { /* * Sanity check the block pointer contents, this is handled diff --git a/module/zfs/spa.c b/module/zfs/spa.c index bdeef0959da7..b4d1a00d05fb 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -2778,7 +2778,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, * When damaged consider it to be a metadata error since we cannot * trust the BP_GET_TYPE and BP_GET_LEVEL values. */ - if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { + if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { atomic_inc_64(&sle->sle_meta_count); return (0); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index b071ac17ed1f..36e2f5e4bba8 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1164,7 +1164,7 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, * it only contains known object types, checksum/compression identifiers, * block sizes within the maximum allowed limits, valid DVAs, etc. * - * If everything checks out B_TRUE is returned. The zfs_blkptr_verify + * If everything checks out 0 is returned. The zfs_blkptr_verify * argument controls the behavior when an invalid field is detected. * * Values for blk_verify_flag: @@ -1179,7 +1179,7 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, * BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better * performance */ -boolean_t +int zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, enum blk_config_flag blk_config, enum blk_verify_flag blk_verify) { @@ -1211,7 +1211,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, "blkptr at %px has invalid PSIZE %llu", bp, (longlong_t)BPE_GET_PSIZE(bp)); } - return (errors == 0); + return (errors ? ECKSUM : 0); } if (unlikely(BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, @@ -1229,7 +1229,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, * will be done once the zio is executed in vdev_mirror_map_alloc. */ if (unlikely(!spa->spa_trust_config)) - return (errors == 0); + return (errors ? ECKSUM : 0); switch (blk_config) { case BLK_CONFIG_HELD: @@ -1238,8 +1238,12 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, case BLK_CONFIG_NEEDED: spa_config_enter(spa, SCL_VDEV, bp, RW_READER); break; + case BLK_CONFIG_NEEDED_TRY: + if (!spa_config_tryenter(spa, SCL_VDEV, bp, RW_READER)) + return (EBUSY); + break; case BLK_CONFIG_SKIP: - return (errors == 0); + return (errors ? ECKSUM : 0); default: panic("invalid blk_config %u", blk_config); } @@ -1294,10 +1298,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, bp, i, (longlong_t)offset); } } - if (blk_config == BLK_CONFIG_NEEDED) + if (blk_config == BLK_CONFIG_NEEDED || blk_config == + BLK_CONFIG_NEEDED_TRY) spa_config_exit(spa, SCL_VDEV, bp); - return (errors == 0); + return (errors ? ECKSUM : 0); } boolean_t