Skip to content

Commit

Permalink
DAOS-16982 csum: recalculate checksum on retrying
Browse files Browse the repository at this point in the history
This PR fixes retry logic by actually recalculating the checksum; also
it removes the code that incorrectly records nvme error.

Run-GHA: true

Change-Id: Ib0287851fea4d125eecda48c5ccb3c73ed85b8f8
Signed-off-by: Jinshan Xiong <[email protected]>
  • Loading branch information
jxiong committed Feb 5, 2025
1 parent 8fe77b9 commit eb6a7d1
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 43 deletions.
3 changes: 2 additions & 1 deletion src/object/cli_csum.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2023 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand All @@ -11,7 +12,7 @@
#include <daos/cont_props.h>
#include "obj_internal.h"

/** How many times to retry UPDATE RPCs on checksum error */
/** How many times to retry FETCH and UPDATE RPCs on checksum error */
#define MAX_CSUM_RETRY 10

int dc_obj_csum_update(struct daos_csummer *csummer, struct cont_props props, daos_obj_id_t param,
Expand Down
66 changes: 39 additions & 27 deletions src/object/cli_obj.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -4758,38 +4759,37 @@ obj_comp_cb(tse_task_t *task, void *data)

if (task->dt_result == -DER_CSUM || task->dt_result == -DER_TX_UNCERTAIN ||
task->dt_result == -DER_NVME_IO) {
if (!obj_auxi->spec_shard && !obj_auxi->spec_group &&
!obj_auxi->no_retry && !obj_auxi->ec_wait_recov) {
if (!obj_auxi->spec_shard && !obj_auxi->spec_group && !obj_auxi->no_retry &&
!obj_auxi->ec_wait_recov) {
/* Retry fetch on alternative shard */
if (obj_auxi->opc == DAOS_OBJ_RPC_FETCH) {
if (task->dt_result == -DER_CSUM)
if ((obj_auxi->opc == DAOS_OBJ_RPC_FETCH ||
obj_auxi->opc == DAOS_OBJ_RPC_UPDATE) &&
task->dt_result == -DER_CSUM) {
struct shard_rw_args *rw_arg = &obj_auxi->rw_args;

/* Retry a few times on checksum error; something must
* go terribly wrong if checksum happened for 10 times in a
* row therefore we stop trying anyways even though there
* may exist more replicas. */
if (rw_arg->csum_retry_cnt < MAX_CSUM_RETRY) {
obj_auxi->csum_retry = 1;
else if (task->dt_result == -DER_TX_UNCERTAIN)
rw_arg->csum_retry_cnt++;
D_DEBUG(DB_IO, DF_OID " checksum error, retrying\n",
DP_OID(obj->cob_md.omd_id));
} else {
D_ERROR(DF_OID
" too many retries on checksum error. "
"Failing I/O\n",
DP_OID(obj->cob_md.omd_id));
obj_auxi->io_retry = 0;
}
}

if (obj_auxi->opc == DAOS_OBJ_RPC_FETCH) {
if (task->dt_result == -DER_TX_UNCERTAIN)
obj_auxi->tx_uncertain = 1;
else
obj_auxi->nvme_io_err = 1;
} else {
if (obj_auxi->opc == DAOS_OBJ_RPC_UPDATE &&
task->dt_result == -DER_CSUM) {
struct shard_rw_args *rw_arg = &obj_auxi->rw_args;

/** Retry a few times on checksum error on update */
if (rw_arg->csum_retry_cnt < MAX_CSUM_RETRY) {
obj_auxi->csum_retry = 1;
rw_arg->csum_retry_cnt++;
D_DEBUG(DB_IO, DF_OID" checksum error on "
"update, retrying\n",
DP_OID(obj->cob_md.omd_id));
} else {
D_ERROR(DF_OID" checksum error on update, "
"too many retries. Failing I/O\n",
DP_OID(obj->cob_md.omd_id));
obj_auxi->io_retry = 0;
}
} else if (task->dt_result != -DER_NVME_IO) {
/* Don't retry update for UNCERTAIN errors */
obj_auxi->io_retry = 0;
}
}
} else {
obj_auxi->io_retry = 0;
Expand Down Expand Up @@ -5140,6 +5140,12 @@ obj_csum_update(struct dc_object *obj, daos_obj_update_t *args, struct obj_auxi_
if (!obj_csum_dedup_candidate(&obj->cob_co->dc_props, args->iods, args->nr))
return 0;

if (obj_auxi->csum_retry) {
/* Release old checksum result and prepare for new calculation */
daos_csummer_free_ci(obj->cob_co->dc_csummer, &obj_auxi->rw_args.dkey_csum);
daos_csummer_free_ic(obj->cob_co->dc_csummer, &obj_auxi->rw_args.iod_csums);
}

return dc_obj_csum_update(obj->cob_co->dc_csummer, obj->cob_co->dc_props,
obj->cob_md.omd_id, args->dkey, args->iods, args->sgls, args->nr,
obj_auxi->reasb_req.orr_singv_los, &obj_auxi->rw_args.dkey_csum,
Expand All @@ -5150,6 +5156,12 @@ static int
obj_csum_fetch(const struct dc_object *obj, daos_obj_fetch_t *args,
struct obj_auxi_args *obj_auxi)
{
if (obj_auxi->csum_retry) {
/* Release old checksum result and prepare for new calculation */
daos_csummer_free_ci(obj->cob_co->dc_csummer, &obj_auxi->rw_args.dkey_csum);
daos_csummer_free_ic(obj->cob_co->dc_csummer, &obj_auxi->rw_args.iod_csums);
}

return dc_obj_csum_fetch(obj->cob_co->dc_csummer, args->dkey, args->iods, args->sgls,
args->nr, obj_auxi->reasb_req.orr_singv_los,
&obj_auxi->rw_args.dkey_csum, &obj_auxi->rw_args.iod_csums);
Expand Down
14 changes: 4 additions & 10 deletions src/object/srv_obj.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1371,11 +1372,8 @@ obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *io
orw->orw_dkey_csum, &orw->orw_iod_array,
&orw->orw_oid);
if (rc != 0) {
D_ERROR(DF_C_UOID_DKEY"verify_keys error: "DF_RC"\n",
DP_C_UOID_DKEY(orw->orw_oid, &orw->orw_dkey),
DP_RC(rc));
if (rc == -DER_CSUM)
obj_log_csum_err();
D_ERROR(DF_C_UOID_DKEY "verify_keys error: " DF_RC "\n",
DP_C_UOID_DKEY(orw->orw_oid, &orw->orw_dkey), DP_RC(rc));
return rc;
}

Expand Down Expand Up @@ -4598,12 +4596,8 @@ ds_cpd_handle_one(crt_rpc_t *rpc, struct daos_cpd_sub_head *dcsh, struct daos_cp
rc = csum_verify_keys(ioc->ioc_coc->sc_csummer,
&dcsr->dcsr_dkey, dcu->dcu_dkey_csum,
&dcu->dcu_iod_array, &dcsr->dcsr_oid);
if (rc != 0) {
if (rc == -DER_CSUM)
obj_log_csum_err();

if (rc != 0)
goto out;
}

if (iohs == NULL) {
D_ALLOC_ARRAY(iohs, dcde->dcde_write_cnt);
Expand Down
28 changes: 23 additions & 5 deletions src/tests/suite/daos_checksum.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2019-2023 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand All @@ -17,6 +18,7 @@
#include <daos/object.h>

#define EC_CSUM_OC OC_EC_2P2G1
#define RP_CSUM_OC OC_RP_2G1

static void
iov_update_fill(d_iov_t *iov, char *data, uint64_t len_to_fill);
Expand Down Expand Up @@ -45,7 +47,14 @@ csum_ec_enable(void **state)
}

static inline int
csum_replia_enable(void **state)
csum_replica_enable(void **state)
{
dts_csum_oc = RP_CSUM_OC;
return 0;
}

static inline int
csum_shard_enable(void **state)
{
dts_csum_oc = OC_SX;
return 0;
Expand Down Expand Up @@ -2781,10 +2790,18 @@ setup(void **state)

#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)
#define CSUM_TEST(dsc, test) { STR(__COUNTER__)". " dsc, test, \
csum_replia_enable, test_case_teardown }
#define EC_CSUM_TEST(dsc, test) { STR(__COUNTER__)". " dsc, test, \
csum_ec_enable, test_case_teardown }
#define CSUM_TEST(dsc, test) \
{ \
STR(__COUNTER__) ". " dsc, test, csum_shard_enable, test_case_teardown \
}
#define EC_CSUM_TEST(dsc, test) \
{ \
STR(__COUNTER__) ". " dsc, test, csum_ec_enable, test_case_teardown \
}
#define RP_CSUM_TEST(dsc, test) \
{ \
STR(__COUNTER__) ". " dsc, test, csum_replica_enable, test_case_teardown \
}

static const struct CMUnitTest csum_tests[] = {
CSUM_TEST("DAOS_CSUM00: csum disabled", checksum_disabled),
Expand Down Expand Up @@ -2815,6 +2832,7 @@ static const struct CMUnitTest csum_tests[] = {
CSUM_TEST("DAOS_CSUM07: Mix of Single Value and Array values iods", mix_test),
CSUM_TEST("DAOS_CSUM08: Update/Fetch A Key", test_update_fetch_a_key),
CSUM_TEST("DAOS_CSUM09: Update/Fetch D Key", test_update_fetch_d_key),
RP_CSUM_TEST("DAOS_CSUM09.2: Update/Fetch D Key", test_update_fetch_d_key),
CSUM_TEST("DAOS_CSUM10: Enumerate A Keys", test_enumerate_a_key),
CSUM_TEST("DAOS_CSUM11: Enumerate D Keys", test_enumerate_d_key),
CSUM_TEST("DAOS_CSUM12: Enumerate objects", test_enumerate_object),
Expand Down

0 comments on commit eb6a7d1

Please sign in to comment.