From eb6a7d11a003d275498ac0043577a0c324fd37d3 Mon Sep 17 00:00:00 2001 From: Jinshan Xiong Date: Thu, 9 Jan 2025 19:42:39 +0000 Subject: [PATCH] DAOS-16982 csum: recalculate checksum on retrying This PR fixes retry logic by actually recalculating the checksum; also it removes the code that incorrectly records nvme error. Run-GHA: true Change-Id: Ib0287851fea4d125eecda48c5ccb3c73ed85b8f8 Signed-off-by: Jinshan Xiong --- src/object/cli_csum.h | 3 +- src/object/cli_obj.c | 66 +++++++++++++++++++-------------- src/object/srv_obj.c | 14 ++----- src/tests/suite/daos_checksum.c | 28 +++++++++++--- 4 files changed, 68 insertions(+), 43 deletions(-) diff --git a/src/object/cli_csum.h b/src/object/cli_csum.h index 566c707054d..108a538f212 100644 --- a/src/object/cli_csum.h +++ b/src/object/cli_csum.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2023 Intel Corporation. + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -11,7 +12,7 @@ #include #include "obj_internal.h" -/** How many times to retry UPDATE RPCs on checksum error */ +/** How many times to retry FETCH and UPDATE RPCs on checksum error */ #define MAX_CSUM_RETRY 10 int dc_obj_csum_update(struct daos_csummer *csummer, struct cont_props props, daos_obj_id_t param, diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 75d661d0665..96be636ddc9 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -4758,38 +4759,37 @@ obj_comp_cb(tse_task_t *task, void *data) if (task->dt_result == -DER_CSUM || task->dt_result == -DER_TX_UNCERTAIN || task->dt_result == -DER_NVME_IO) { - if (!obj_auxi->spec_shard && !obj_auxi->spec_group && - !obj_auxi->no_retry && !obj_auxi->ec_wait_recov) { + if (!obj_auxi->spec_shard && !obj_auxi->spec_group && !obj_auxi->no_retry && + !obj_auxi->ec_wait_recov) { /* Retry fetch on alternative shard */ - if (obj_auxi->opc == DAOS_OBJ_RPC_FETCH) { - if (task->dt_result == -DER_CSUM) + if ((obj_auxi->opc == DAOS_OBJ_RPC_FETCH || + obj_auxi->opc == DAOS_OBJ_RPC_UPDATE) && + task->dt_result == -DER_CSUM) { + struct shard_rw_args *rw_arg = &obj_auxi->rw_args; + + /* Retry a few times on checksum error; something must + * go terribly wrong if checksum happened for 10 times in a + * row therefore we stop trying anyways even though there + * may exist more replicas. */ + if (rw_arg->csum_retry_cnt < MAX_CSUM_RETRY) { obj_auxi->csum_retry = 1; - else if (task->dt_result == -DER_TX_UNCERTAIN) + rw_arg->csum_retry_cnt++; + D_DEBUG(DB_IO, DF_OID " checksum error, retrying\n", + DP_OID(obj->cob_md.omd_id)); + } else { + D_ERROR(DF_OID + " too many retries on checksum error. " + "Failing I/O\n", + DP_OID(obj->cob_md.omd_id)); + obj_auxi->io_retry = 0; + } + } + + if (obj_auxi->opc == DAOS_OBJ_RPC_FETCH) { + if (task->dt_result == -DER_TX_UNCERTAIN) obj_auxi->tx_uncertain = 1; else obj_auxi->nvme_io_err = 1; - } else { - if (obj_auxi->opc == DAOS_OBJ_RPC_UPDATE && - task->dt_result == -DER_CSUM) { - struct shard_rw_args *rw_arg = &obj_auxi->rw_args; - - /** Retry a few times on checksum error on update */ - if (rw_arg->csum_retry_cnt < MAX_CSUM_RETRY) { - obj_auxi->csum_retry = 1; - rw_arg->csum_retry_cnt++; - D_DEBUG(DB_IO, DF_OID" checksum error on " - "update, retrying\n", - DP_OID(obj->cob_md.omd_id)); - } else { - D_ERROR(DF_OID" checksum error on update, " - "too many retries. Failing I/O\n", - DP_OID(obj->cob_md.omd_id)); - obj_auxi->io_retry = 0; - } - } else if (task->dt_result != -DER_NVME_IO) { - /* Don't retry update for UNCERTAIN errors */ - obj_auxi->io_retry = 0; - } } } else { obj_auxi->io_retry = 0; @@ -5140,6 +5140,12 @@ obj_csum_update(struct dc_object *obj, daos_obj_update_t *args, struct obj_auxi_ if (!obj_csum_dedup_candidate(&obj->cob_co->dc_props, args->iods, args->nr)) return 0; + if (obj_auxi->csum_retry) { + /* Release old checksum result and prepare for new calculation */ + daos_csummer_free_ci(obj->cob_co->dc_csummer, &obj_auxi->rw_args.dkey_csum); + daos_csummer_free_ic(obj->cob_co->dc_csummer, &obj_auxi->rw_args.iod_csums); + } + return dc_obj_csum_update(obj->cob_co->dc_csummer, obj->cob_co->dc_props, obj->cob_md.omd_id, args->dkey, args->iods, args->sgls, args->nr, obj_auxi->reasb_req.orr_singv_los, &obj_auxi->rw_args.dkey_csum, @@ -5150,6 +5156,12 @@ static int obj_csum_fetch(const struct dc_object *obj, daos_obj_fetch_t *args, struct obj_auxi_args *obj_auxi) { + if (obj_auxi->csum_retry) { + /* Release old checksum result and prepare for new calculation */ + daos_csummer_free_ci(obj->cob_co->dc_csummer, &obj_auxi->rw_args.dkey_csum); + daos_csummer_free_ic(obj->cob_co->dc_csummer, &obj_auxi->rw_args.iod_csums); + } + return dc_obj_csum_fetch(obj->cob_co->dc_csummer, args->dkey, args->iods, args->sgls, args->nr, obj_auxi->reasb_req.orr_singv_los, &obj_auxi->rw_args.dkey_csum, &obj_auxi->rw_args.iod_csums); diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 6b05b95d329..39b120ffb05 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -1,6 +1,7 @@ /** * (C) Copyright 2016-2024 Intel Corporation. * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -1371,11 +1372,8 @@ obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *io orw->orw_dkey_csum, &orw->orw_iod_array, &orw->orw_oid); if (rc != 0) { - D_ERROR(DF_C_UOID_DKEY"verify_keys error: "DF_RC"\n", - DP_C_UOID_DKEY(orw->orw_oid, &orw->orw_dkey), - DP_RC(rc)); - if (rc == -DER_CSUM) - obj_log_csum_err(); + D_ERROR(DF_C_UOID_DKEY "verify_keys error: " DF_RC "\n", + DP_C_UOID_DKEY(orw->orw_oid, &orw->orw_dkey), DP_RC(rc)); return rc; } @@ -4598,12 +4596,8 @@ ds_cpd_handle_one(crt_rpc_t *rpc, struct daos_cpd_sub_head *dcsh, struct daos_cp rc = csum_verify_keys(ioc->ioc_coc->sc_csummer, &dcsr->dcsr_dkey, dcu->dcu_dkey_csum, &dcu->dcu_iod_array, &dcsr->dcsr_oid); - if (rc != 0) { - if (rc == -DER_CSUM) - obj_log_csum_err(); - + if (rc != 0) goto out; - } if (iohs == NULL) { D_ALLOC_ARRAY(iohs, dcde->dcde_write_cnt); diff --git a/src/tests/suite/daos_checksum.c b/src/tests/suite/daos_checksum.c index dfbacad5d0b..85b9e0dd551 100644 --- a/src/tests/suite/daos_checksum.c +++ b/src/tests/suite/daos_checksum.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -17,6 +18,7 @@ #include #define EC_CSUM_OC OC_EC_2P2G1 +#define RP_CSUM_OC OC_RP_2G1 static void iov_update_fill(d_iov_t *iov, char *data, uint64_t len_to_fill); @@ -45,7 +47,14 @@ csum_ec_enable(void **state) } static inline int -csum_replia_enable(void **state) +csum_replica_enable(void **state) +{ + dts_csum_oc = RP_CSUM_OC; + return 0; +} + +static inline int +csum_shard_enable(void **state) { dts_csum_oc = OC_SX; return 0; @@ -2781,10 +2790,18 @@ setup(void **state) #define STR_HELPER(x) #x #define STR(x) STR_HELPER(x) -#define CSUM_TEST(dsc, test) { STR(__COUNTER__)". " dsc, test, \ - csum_replia_enable, test_case_teardown } -#define EC_CSUM_TEST(dsc, test) { STR(__COUNTER__)". " dsc, test, \ - csum_ec_enable, test_case_teardown } +#define CSUM_TEST(dsc, test) \ + { \ + STR(__COUNTER__) ". " dsc, test, csum_shard_enable, test_case_teardown \ + } +#define EC_CSUM_TEST(dsc, test) \ + { \ + STR(__COUNTER__) ". " dsc, test, csum_ec_enable, test_case_teardown \ + } +#define RP_CSUM_TEST(dsc, test) \ + { \ + STR(__COUNTER__) ". " dsc, test, csum_replica_enable, test_case_teardown \ + } static const struct CMUnitTest csum_tests[] = { CSUM_TEST("DAOS_CSUM00: csum disabled", checksum_disabled), @@ -2815,6 +2832,7 @@ static const struct CMUnitTest csum_tests[] = { CSUM_TEST("DAOS_CSUM07: Mix of Single Value and Array values iods", mix_test), CSUM_TEST("DAOS_CSUM08: Update/Fetch A Key", test_update_fetch_a_key), CSUM_TEST("DAOS_CSUM09: Update/Fetch D Key", test_update_fetch_d_key), + RP_CSUM_TEST("DAOS_CSUM09.2: Update/Fetch D Key", test_update_fetch_d_key), CSUM_TEST("DAOS_CSUM10: Enumerate A Keys", test_enumerate_a_key), CSUM_TEST("DAOS_CSUM11: Enumerate D Keys", test_enumerate_d_key), CSUM_TEST("DAOS_CSUM12: Enumerate objects", test_enumerate_object),