Skip to content

Commit

Permalink
DAOS-16971 cart: add RPC origin address
Browse files Browse the repository at this point in the history
Add cart RPC origin address to error handling.

Run-GHA: true
Allow-unstable-test: true
Signed-off-by: Di Wang <[email protected]>
  • Loading branch information
wangdi1 committed Jan 30, 2025
1 parent 63b0489 commit 24f7c0c
Show file tree
Hide file tree
Showing 6 changed files with 148 additions and 29 deletions.
19 changes: 19 additions & 0 deletions src/cart/crt_hg.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1867,3 +1868,21 @@ crt_hg_bulk_transfer(struct crt_bulk_desc *bulk_desc, crt_bulk_cb_t complete_cb,
out:
return rc;
}

int
crt_rpc_get_origin_addr(crt_rpc_t *rpc_pub, char *addr, uint32_t *addr_size)
{
const struct hg_info *hg_info;
struct crt_rpc_priv *rpc_priv;

rpc_priv = container_of(rpc_pub, struct crt_rpc_priv, crp_pub);
hg_info = HG_Get_info(rpc_priv->crp_hg_hdl);
if (hg_info == NULL) {
snprintf(addr, *addr_size, "NULL");
*addr_size = strlen("NULL");
return 0;
}

HG_Addr_to_string(hg_info->hg_class, addr, (hg_size_t *)addr_size, hg_info->addr);
return 0;
}
12 changes: 9 additions & 3 deletions src/cart/crt_rpc.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1595,9 +1596,14 @@ crt_reply_send(crt_rpc_t *req)
} else {
RPC_TRACE(DB_ALL, rpc_priv, "reply_send\n");
rc = crt_hg_reply_send(rpc_priv);
if (rc != 0)
D_ERROR("crt_hg_reply_send failed, rc: %d,opc: %#x.\n",
rc, rpc_priv->crp_pub.cr_opc);
if (rc != 0) {
char addr[48];
uint32_t addr_size = 48;

crt_rpc_get_origin_addr(req, addr, &addr_size);
D_ERROR("crt_hg_reply_send failed, rc: %d,opc: %#x.: %s\n", rc,
rpc_priv->crp_pub.cr_opc, addr);
}
}

rpc_priv->crp_reply_pending = 0;
Expand Down
13 changes: 13 additions & 0 deletions src/include/cart/api.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -2375,6 +2376,18 @@ int crt_context_quota_limit_get(crt_context_t crt_ctx, crt_quota_type_t quota, i
int
crt_req_get_proto_ver(crt_rpc_t *req);

/**
* Get the rpc original address.
*
* \param[in] rpc pointer to RPC request
* \param[out] addr pointer to the converted buffer.
* \param[in/out] addr_size the size of the converted buffer.
* \return DER_SUCCESS on success, negative value on failure
*
*/
int
crt_rpc_get_origin_addr(crt_rpc_t *rpc, char *addr, uint32_t *addr_size);

/** @}
*/

Expand Down
108 changes: 83 additions & 25 deletions src/object/srv_obj.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -182,9 +183,17 @@ obj_rw_reply(crt_rpc_t *rpc, int status, uint64_t epoch, bool release_input,
orwo->orw_epoch = max(epoch, orwo->orw_epoch);
}

D_DEBUG(DB_IO, "rpc %p opc %d send reply, pmv %d, epoch "DF_X64
", status %d\n", rpc, opc_get(rpc->cr_opc),
ioc->ioc_map_ver, orwo->orw_epoch, status);
if (status != 0 && !obj_retry_error(status)) {
char addr[48];
uint32_t addr_size = 48;

crt_rpc_get_origin_addr(rpc, addr, &addr_size);
D_ERROR("rpc %p opc %d send reply, pmv %d, epoch " DF_X64 ", status %d from %s\n",
rpc, opc_get(rpc->cr_opc), ioc->ioc_map_ver, orwo->orw_epoch, status, addr);
} else {
D_DEBUG(DB_IO, "rpc %p opc %d send reply, pmv %d, epoch " DF_X64 ", status %d\n",
rpc, opc_get(rpc->cr_opc), ioc->ioc_map_ver, orwo->orw_epoch, status);
}

if (!ioc->ioc_lost_reply) {
if (release_input)
Expand Down Expand Up @@ -241,8 +250,13 @@ obj_bulk_comp_cb(const struct crt_bulk_cb_info *cb_info)
struct crt_bulk_desc *bulk_desc;
crt_rpc_t *rpc;

if (cb_info->bci_rc != 0)
D_ERROR("bulk transfer failed: %d\n", cb_info->bci_rc);
if (cb_info->bci_rc != 0) {
char addr[48];
uint32_t addr_size = 48;

crt_rpc_get_origin_addr(cb_info->bci_bulk_desc->bd_rpc, addr, &addr_size);
D_ERROR("bulk transfer failed: %d from %s\n", cb_info->bci_rc, addr);
}

bulk_desc = cb_info->bci_bulk_desc;
rpc = bulk_desc->bd_rpc;
Expand Down Expand Up @@ -5775,19 +5789,43 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc)
if (max_ver < version)
max_ver = version;

DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc,
"(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u in "DF_UUID"/"
DF_UUID"/"DF_UUID" with epc "DF_X64", pmv %u/%u, dti "DF_DTI", bulk_tgt_sz %u, "
"bulk_tgt_nr %u, tgt_nr %u, forward width %u, forward depth %u, flags %x",
(ocpi->ocpi_flags & ORF_LEADER) ? "leader" :
(ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc,
DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id,
DP_UUID(ocpi->ocpi_po_uuid), DP_UUID(ocpi->ocpi_co_hdl),
DP_UUID(ocpi->ocpi_co_uuid), ocpi->ocpi_epoch,
ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz,
ocpi->ocpi_bulk_tgt_nr, (unsigned int)ocpi->ocpi_tgts.ca_count,
ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags);

if (rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART) {
char addr[48];
uint32_t addr_size = 48;

crt_rpc_get_origin_addr(rpc, addr, &addr_size);
D_ERROR(
"(%s) handled collective punch RPC %p for obj " DF_UOID
" on XS %u/%u in " DF_UUID "/" DF_UUID "/" DF_UUID " with epc " DF_X64
", pmv %u/%u, dti " DF_DTI ", bulk_tgt_sz %u, "
"bulk_tgt_nr %u, tgt_nr %u, forward width %u, forward depth %u, flags %x:" DF_RC
" from %s",
(ocpi->ocpi_flags & ORF_LEADER)
? "leader"
: (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"),
rpc, DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id,
DP_UUID(ocpi->ocpi_po_uuid), DP_UUID(ocpi->ocpi_co_hdl),
DP_UUID(ocpi->ocpi_co_uuid), ocpi->ocpi_epoch, ocpi->ocpi_map_ver, max_ver,
DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz, ocpi->ocpi_bulk_tgt_nr,
(unsigned int)ocpi->ocpi_tgts.ca_count, ocpi->ocpi_disp_width,
ocpi->ocpi_disp_depth, ocpi->ocpi_flags, DP_RC(rc), addr);
} else {
DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR,
DB_IO, rc,
"(%s) handled collective punch RPC %p for obj " DF_UOID
" on XS %u/%u in " DF_UUID "/" DF_UUID "/" DF_UUID " with epc " DF_X64
", pmv %u/%u, dti " DF_DTI ", bulk_tgt_sz %u, "
"bulk_tgt_nr %u, tgt_nr %u, forward width %u, forward depth %u, flags %x",
(ocpi->ocpi_flags & ORF_LEADER)
? "leader"
: (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"),
rpc, DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id,
DP_UUID(ocpi->ocpi_po_uuid), DP_UUID(ocpi->ocpi_co_hdl),
DP_UUID(ocpi->ocpi_co_uuid), ocpi->ocpi_epoch, ocpi->ocpi_map_ver,
max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz,
ocpi->ocpi_bulk_tgt_nr, (unsigned int)ocpi->ocpi_tgts.ca_count,
ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags);
}
obj_punch_complete(rpc, rc, max_ver);

dtx_coll_entry_put(dce);
Expand Down Expand Up @@ -5918,13 +5956,33 @@ ds_obj_coll_query_handler(crt_rpc_t *rpc)
rc = dtx_leader_end(dlh, ioc.ioc_coh, rc);

out:
D_DEBUG(DB_IO, "Handled collective query RPC %p %s forwarding for obj "DF_UOID
" on rank %u XS %u/%u epc "DF_X64" pmv %u, with dti "DF_DTI", dct_nr %u, "
"forward width %u, forward depth %u\n: "DF_RC"\n", rpc,
ocqi->ocqi_tgts.ca_count <= 1 ? "without" : "with", DP_UOID(ocqi->ocqi_oid),
myrank, dmi->dmi_xs_id, tgt_id, ocqi->ocqi_epoch, ocqi->ocqi_map_ver,
DP_DTI(&ocqi->ocqi_xid), (unsigned int)ocqi->ocqi_tgts.ca_count,
ocqi->ocqi_disp_width, ocqi->ocqi_disp_depth, DP_RC(rc));

if (rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART) {
char addr[48];
uint32_t addr_size = 48;

crt_rpc_get_origin_addr(rpc, addr, &addr_size);
D_ERROR("Handled collective query RPC %p %s :%s forwarding for obj " DF_UOID
" on rank %u XS %u/%u epc " DF_X64 " pmv %u, with dti " DF_DTI
", dct_nr %u, "
"forward width %u, forward depth %u\n: " DF_RC "\n",
rpc, ocqi->ocqi_tgts.ca_count <= 1 ? "without" : "with", addr,
DP_UOID(ocqi->ocqi_oid), myrank, dmi->dmi_xs_id, tgt_id, ocqi->ocqi_epoch,
ocqi->ocqi_map_ver, DP_DTI(&ocqi->ocqi_xid),
(unsigned int)ocqi->ocqi_tgts.ca_count, ocqi->ocqi_disp_width,
ocqi->ocqi_disp_depth, DP_RC(rc));
} else {
D_DEBUG(DB_IO,
"Handled collective query RPC %p %s forwarding for obj " DF_UOID
" on rank %u XS %u/%u epc " DF_X64 " pmv %u, with dti " DF_DTI
", dct_nr %u, "
"forward width %u, forward depth %u\n: " DF_RC "\n",
rpc, ocqi->ocqi_tgts.ca_count <= 1 ? "without" : "with",
DP_UOID(ocqi->ocqi_oid), myrank, dmi->dmi_xs_id, tgt_id, ocqi->ocqi_epoch,
ocqi->ocqi_map_ver, DP_DTI(&ocqi->ocqi_xid),
(unsigned int)ocqi->ocqi_tgts.ca_count, ocqi->ocqi_disp_width,
ocqi->ocqi_disp_depth, DP_RC(rc));
}

obj_reply_set_status(rpc, rc);
obj_reply_map_version_set(rpc, version);
Expand Down
2 changes: 1 addition & 1 deletion utils/build.config
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@ ucx=https://github.com/openucx/ucx.git

[patch_versions]
spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc75664978.diff,https://github.com/spdk/spdk/commit/445a4c808badbad3942696ecf16fa60e8129a747.diff
mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch,utils/mrecv_err.patch
mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch,utils/mrecv_err.patch,utils/mercury_export.diff
pmdk=https://github.com/pmem/pmdk/commit/2abe15ac0b4eed894b6768cd82a3b0a7c4336284.diff
ofi=utils/ofi_tcp.patch
23 changes: 23 additions & 0 deletions utils/mercury_export.diff
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
diff --git a/src/mercury.h b/src/mercury.h
index 5f720aa..e58f43a 100644
--- a/src/mercury.h
+++ b/src/mercury.h
@@ -1134,6 +1134,18 @@ static HG_INLINE hg_return_t
HG_Event_trigger(hg_context_t *context, unsigned int max_count,
unsigned int *actual_count_p);

+/**
+ * Convert addr to string
+ *
+ * \param context [IN] pointer to HG class
+ * \param buf [OUT] buf to hold the converted address
+ * \param buf_size_p [OUT] size of the buf to hold the converted address
+ * \param addr [IN] address to be converted
+ */
+HG_PUBLIC hg_return_t
+HG_Addr_to_string(hg_class_t *hg_class, char *buf, hg_size_t *buf_size_p,
+ hg_addr_t addr);
+
/************************************/
/* Local Type and Struct Definition */
/************************************/

0 comments on commit 24f7c0c

Please sign in to comment.