Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-16971 cart: add RPC origin address #15820

Merged
merged 8 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions src/cart/crt_bulk.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2022 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -183,7 +184,8 @@

rc = crt_hg_bulk_transfer(bulk_desc, complete_cb, arg, opid, false);
if (rc != 0)
DL_ERROR(rc, "crt_hg_bulk_transfer() failed");
DL_ERROR(rc, "%p:%s crt_hg_bulk_transfer() failed", bulk_desc->bd_rpc,
crt_rpc_get_origin_addr(bulk_desc->bd_rpc));

out:
return rc;
Expand All @@ -203,7 +205,8 @@

rc = crt_hg_bulk_transfer(bulk_desc, complete_cb, arg, opid, true);
if (rc != 0)
D_ERROR("crt_hg_bulk_transfer() failed, rc: %d.\n", rc);
DL_ERROR(rc, "%p:%s crt_hg_bulk_transfer() failed.\n", bulk_desc->bd_rpc,

Check warning on line 208 in src/cart/crt_bulk.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

need to remove "\n" from the message as DL_ERROR will append one on its own

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, sure

crt_rpc_get_origin_addr(bulk_desc->bd_rpc));

out:
return rc;
Expand Down
10 changes: 10 additions & 0 deletions src/cart/crt_hg.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1867,3 +1868,12 @@ crt_hg_bulk_transfer(struct crt_bulk_desc *bulk_desc, crt_bulk_cb_t complete_cb,
out:
return rc;
}

char *
crt_rpc_get_origin_addr(crt_rpc_t *rpc_pub)
{
struct crt_rpc_priv *rpc_priv;

rpc_priv = container_of(rpc_pub, struct crt_rpc_priv, crp_pub);
return crt_rpc_priv_get_origin_addr(rpc_priv);
}
47 changes: 39 additions & 8 deletions src/cart/crt_internal.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand All @@ -26,6 +27,34 @@
#include "crt_self_test.h"
#include "crt_swim.h"

static inline char *
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should go into crt_internal_fns.h for declaration and crt_rpc.c for the implementation. I don't think there is much benefit from inlining this function anyway

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is embeded into the D_DEBUG/RPC_TRACE, inline function at least reduce the overhead?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, but organizationally it should still be in crt_interal_fns.h -- similar to how we have all other functions there. See crt_opc_decode() which is also used in RPC_* macros.

I don't think an overhead of a function call matters too much in debug case (compared to all the slowdown generated by emitting and logging messages)

crt_rpc_priv_get_origin_addr(struct crt_rpc_priv *rpc_priv)
{
const struct hg_info *hg_info;
char addr[48];
hg_size_t addr_size = 48;
int rc;

if (rpc_priv->crp_orig_uri != NULL)
return rpc_priv->crp_orig_uri;

hg_info = HG_Get_info(rpc_priv->crp_hg_hdl);
if (hg_info == NULL)
return "None";

rc = HG_Addr_to_string(hg_info->hg_class, addr, (hg_size_t *)&addr_size, hg_info->addr);
if (rc != 0)
return "None";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should either print an error for each case or somehow differentiate None's.
Imagine you see "None" in log, that would be unexpected, but wouldn't tell us which one of those 3 calls failed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

client side RPC trace should have no origin address, so I put None here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure, but the other 2 errors are return 'None', so if you see 'None' you dont know if this call failed or 2 others. Maybe return 'ERR' or something else in other error cases


D_ALLOC(rpc_priv->crp_orig_uri, addr_size);
if (rpc_priv->crp_orig_uri == NULL)
return "None";

memcpy(rpc_priv->crp_orig_uri, addr, addr_size);

return rpc_priv->crp_orig_uri;
}

/* A wrapper around D_TRACE_DEBUG that ensures the ptr option is a RPC */
#define RPC_TRACE(mask, rpc, fmt, ...) \
do { \
Expand All @@ -36,10 +65,11 @@
break; \
\
crt_opc_decode((rpc)->crp_pub.cr_opc, &_module, &_opc); \
D_TRACE_DEBUG(mask, (rpc), "[opc=%#x (%s:%s) rpcid=%#lx rank:tag=%d:%d] " fmt, \
D_TRACE_DEBUG(mask, (rpc), \
"[opc=%#x (%s:%s) rpcid=%#lx rank:tag=%d:%d orig=%s] " fmt, \
(rpc)->crp_pub.cr_opc, _module, _opc, (rpc)->crp_req_hdr.cch_rpcid, \
(rpc)->crp_pub.cr_ep.ep_rank, (rpc)->crp_pub.cr_ep.ep_tag, \
##__VA_ARGS__); \
crt_rpc_priv_get_origin_addr((rpc)), ##__VA_ARGS__); \
} while (0)

/* Log an error with an RPC descriptor */
Expand All @@ -49,10 +79,10 @@
char *_opc; \
\
crt_opc_decode((rpc)->crp_pub.cr_opc, &_module, &_opc); \
D_TRACE_ERROR((rpc), "[opc=%#x (%s:%s) rpcid=%#lx rank:tag=%d:%d] " fmt, \
D_TRACE_ERROR((rpc), "[opc=%#x (%s:%s) rpcid=%#lx rank:tag=%d:%d orig=%s] " fmt, \
(rpc)->crp_pub.cr_opc, _module, _opc, (rpc)->crp_req_hdr.cch_rpcid, \
(rpc)->crp_pub.cr_ep.ep_rank, (rpc)->crp_pub.cr_ep.ep_tag, \
##__VA_ARGS__); \
crt_rpc_priv_get_origin_addr((rpc)), ##__VA_ARGS__); \
} while (0)

/* Log a warning with an RPC descriptor */
Expand All @@ -62,10 +92,10 @@
char *_opc; \
\
crt_opc_decode((rpc)->crp_pub.cr_opc, &_module, &_opc); \
D_TRACE_WARN((rpc), "[opc=%#x (%s:%s) rpcid=%#lx rank:tag=%d:%d] " fmt, \
D_TRACE_WARN((rpc), "[opc=%#x (%s:%s) rpcid=%#lx rank:tag=%d:%d orig=%s] " fmt, \
(rpc)->crp_pub.cr_opc, _module, _opc, (rpc)->crp_req_hdr.cch_rpcid, \
(rpc)->crp_pub.cr_ep.ep_rank, (rpc)->crp_pub.cr_ep.ep_tag, \
##__VA_ARGS__); \
crt_rpc_priv_get_origin_addr((rpc)), ##__VA_ARGS__); \
} while (0)

/* Log an info message with an RPC descriptor */
Expand All @@ -75,11 +105,12 @@
char *_opc; \
\
crt_opc_decode((rpc)->crp_pub.cr_opc, &_module, &_opc); \
D_TRACE_INFO((rpc), "[opc=%#x (%s:%s) rpcid=%#lx rank:tag=%d:%d] " fmt, \
D_TRACE_INFO((rpc), "[opc=%#x (%s:%s) rpcid=%#lx rank:tag=%d:%d orig=%s] " fmt, \
(rpc)->crp_pub.cr_opc, _module, _opc, (rpc)->crp_req_hdr.cch_rpcid, \
(rpc)->crp_pub.cr_ep.ep_rank, (rpc)->crp_pub.cr_ep.ep_tag, \
##__VA_ARGS__); \
crt_rpc_priv_get_origin_addr((rpc)), ##__VA_ARGS__); \
} while (0)

/**
* If \a cond is false, this is equivalent to an RPC_ERROR (i.e., \a mask is
* ignored). If \a cond is true, this is equivalent to an RPC_TRACE.
Expand Down
8 changes: 6 additions & 2 deletions src/cart/crt_rpc.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -573,6 +574,9 @@ crt_rpc_priv_free(struct crt_rpc_priv *rpc_priv)
if (rpc_priv->crp_uri_free != 0)
D_FREE(rpc_priv->crp_tgt_uri);

if (rpc_priv->crp_orig_uri != NULL)
D_FREE(rpc_priv->crp_orig_uri);

D_MUTEX_DESTROY(&rpc_priv->crp_mutex);
D_SPIN_DESTROY(&rpc_priv->crp_lock);

Expand Down Expand Up @@ -1596,8 +1600,8 @@ crt_reply_send(crt_rpc_t *req)
RPC_TRACE(DB_ALL, rpc_priv, "reply_send\n");
rc = crt_hg_reply_send(rpc_priv);
if (rc != 0)
D_ERROR("crt_hg_reply_send failed, rc: %d,opc: %#x.\n",
rc, rpc_priv->crp_pub.cr_opc);
D_ERROR("crt_hg_reply_send failed, rc: %d,opc: %#x.: %s\n", rc,
rpc_priv->crp_pub.cr_opc, crt_rpc_get_origin_addr(req));
}

rpc_priv->crp_reply_pending = 0;
Expand Down
2 changes: 2 additions & 0 deletions src/cart/crt_rpc.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -154,6 +155,7 @@ struct crt_rpc_priv {
hg_addr_t crp_hg_addr; /* target na address */
struct crt_hg_hdl *crp_hdl_reuse; /* reused hg_hdl */
crt_phy_addr_t crp_tgt_uri; /* target uri address */
char *crp_orig_uri; /* where the RPC comes from */
crt_rpc_t *crp_ul_req; /* uri lookup request */

uint32_t crp_ul_retry; /* uri lookup retry counter */
Expand Down
10 changes: 10 additions & 0 deletions src/include/cart/api.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -2375,6 +2376,15 @@ int crt_context_quota_limit_get(crt_context_t crt_ctx, crt_quota_type_t quota, i
int
crt_req_get_proto_ver(crt_rpc_t *req);

/**
* Get the rpc original address.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

s/original/origin/ ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

still a typo:)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, ok

*
* \param[in] rpc pointer to RPC request
* \return the origin address of the RPC
*/
char *
crt_rpc_get_origin_addr(crt_rpc_t *rpc);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The api functions follow format of crt_[module][verb] (see crt_context_quota_limit_get, crt_context_create and others)
Also in public APIs we refer to 'rpc' as 'req' (e.g. crt_req_create(), crt_req_get_proto_ver() etc) in order to differentiate with crt_rpc
functions that are internal.
So based on those 2 things, this should be renamed to something like crt_req_origin_addr_get()

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, ok


/** @}
*/

Expand Down
50 changes: 29 additions & 21 deletions src/object/srv_obj.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -182,9 +183,10 @@
orwo->orw_epoch = max(epoch, orwo->orw_epoch);
}

D_DEBUG(DB_IO, "rpc %p opc %d send reply, pmv %d, epoch "DF_X64
", status %d\n", rpc, opc_get(rpc->cr_opc),
ioc->ioc_map_ver, orwo->orw_epoch, status);
DL_CDEBUG(status == 0 || obj_retry_error(status), DB_IO, DLOG_ERR, status,

Check warning on line 186 in src/object/srv_obj.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
"rpc %p:%s opc %d send reply, pmv %d, epoch " DF_X64 ", status %d\n", rpc,
crt_rpc_get_origin_addr(rpc), opc_get(rpc->cr_opc), ioc->ioc_map_ver,
orwo->orw_epoch, status);

if (!ioc->ioc_lost_reply) {
if (release_input)
Expand Down Expand Up @@ -241,11 +243,13 @@
struct crt_bulk_desc *bulk_desc;
crt_rpc_t *rpc;

if (cb_info->bci_rc != 0)
D_ERROR("bulk transfer failed: %d\n", cb_info->bci_rc);

bulk_desc = cb_info->bci_bulk_desc;
rpc = bulk_desc->bd_rpc;

if (cb_info->bci_rc != 0)
D_ERROR("rpc: %p:%s bulk transfer failed: %d\n", rpc, crt_rpc_get_origin_addr(rpc),
cb_info->bci_rc);

arg = (struct obj_bulk_args *)cb_info->bci_arg;
/**
* Note: only one thread will access arg.result, so
Expand Down Expand Up @@ -5776,17 +5780,19 @@
max_ver = version;

DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc,
"(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u in "DF_UUID"/"
DF_UUID"/"DF_UUID" with epc "DF_X64", pmv %u/%u, dti "DF_DTI", bulk_tgt_sz %u, "
"(%s) handled collective punch RPC %p:%s for obj " DF_UOID
" on XS %u/%u in " DF_UUID "/" DF_UUID "/" DF_UUID " with epc " DF_X64
", pmv %u/%u, dti " DF_DTI ", bulk_tgt_sz %u, "
"bulk_tgt_nr %u, tgt_nr %u, forward width %u, forward depth %u, flags %x",
(ocpi->ocpi_flags & ORF_LEADER) ? "leader" :
(ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc,
DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id,
DP_UUID(ocpi->ocpi_po_uuid), DP_UUID(ocpi->ocpi_co_hdl),
DP_UUID(ocpi->ocpi_co_uuid), ocpi->ocpi_epoch,
ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz,
ocpi->ocpi_bulk_tgt_nr, (unsigned int)ocpi->ocpi_tgts.ca_count,
ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth, ocpi->ocpi_flags);
(ocpi->ocpi_flags & ORF_LEADER)
? "leader"
: (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"),
rpc, crt_rpc_get_origin_addr(rpc), DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id,
dmi->dmi_tgt_id, DP_UUID(ocpi->ocpi_po_uuid), DP_UUID(ocpi->ocpi_co_hdl),
DP_UUID(ocpi->ocpi_co_uuid), ocpi->ocpi_epoch, ocpi->ocpi_map_ver, max_ver,
DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz, ocpi->ocpi_bulk_tgt_nr,
(unsigned int)ocpi->ocpi_tgts.ca_count, ocpi->ocpi_disp_width,
ocpi->ocpi_disp_depth, ocpi->ocpi_flags);

obj_punch_complete(rpc, rc, max_ver);

Expand Down Expand Up @@ -5918,11 +5924,13 @@
rc = dtx_leader_end(dlh, ioc.ioc_coh, rc);

out:
D_DEBUG(DB_IO, "Handled collective query RPC %p %s forwarding for obj "DF_UOID
" on rank %u XS %u/%u epc "DF_X64" pmv %u, with dti "DF_DTI", dct_nr %u, "
"forward width %u, forward depth %u\n: "DF_RC"\n", rpc,
ocqi->ocqi_tgts.ca_count <= 1 ? "without" : "with", DP_UOID(ocqi->ocqi_oid),
myrank, dmi->dmi_xs_id, tgt_id, ocqi->ocqi_epoch, ocqi->ocqi_map_ver,
D_DEBUG(DB_IO,
"Handled collective query RPC %p:%s %s forwarding for obj " DF_UOID
" on rank %u XS %u/%u epc " DF_X64 " pmv %u, with dti " DF_DTI ", dct_nr %u, "
"forward width %u, forward depth %u\n: " DF_RC "\n",
rpc, crt_rpc_get_origin_addr(rpc),
ocqi->ocqi_tgts.ca_count <= 1 ? "without" : "with", DP_UOID(ocqi->ocqi_oid), myrank,
dmi->dmi_xs_id, tgt_id, ocqi->ocqi_epoch, ocqi->ocqi_map_ver,
DP_DTI(&ocqi->ocqi_xid), (unsigned int)ocqi->ocqi_tgts.ca_count,
ocqi->ocqi_disp_width, ocqi->ocqi_disp_depth, DP_RC(rc));

Expand Down
Loading