Skip to content

Commit

Permalink
DAOS-14739 pool: Add service-level metrics (#14273)
Browse files Browse the repository at this point in the history
Adds a new /svc group under each pool which contains
the following set of metrics:
  * leader (gauge): Current pool service leader rank
  * map_version (counter): Current pool map version
  * open_pool_handles (gauge): Current count of open handles
  * total_ranks (gauge): Number of ranks in pool map
  * degraded_ranks (gauge): Number of ranks with disabled targets
  * total_targets (gauge): Number of targets in pool map
  * disabled_targets (gauge): Number of targets marked disabled
  * draining_targets (gauge): Number of targets in draining state

For non-leader ranks, the service metrics will have zero
values. Telemetry consumers may positively identify the
current leader by checking the value of map_version, which
will always be non-zero for the leader.

Required-githooks: true

Change-Id: I6e82db981247f3e4fe4e2b434a688d4083be158c
Signed-off-by: Michael MacDonald <mjmac@google.com>
  • Loading branch information
mjmac committed May 15, 2024
1 parent f16a7dd commit 116ec0e
Show file tree
Hide file tree
Showing 6 changed files with 377 additions and 6 deletions.
12 changes: 11 additions & 1 deletion src/pool/srv_internal.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2016-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -28,6 +28,16 @@ struct pool_metrics {
struct d_tm_node_t *query_total;
struct d_tm_node_t *query_space_total;
struct d_tm_node_t *evict_total;

/* service metrics */
struct d_tm_node_t *service_leader;
struct d_tm_node_t *map_version;
struct d_tm_node_t *open_handles;
struct d_tm_node_t *total_targets;
struct d_tm_node_t *disabled_targets;
struct d_tm_node_t *draining_targets;
struct d_tm_node_t *total_ranks;
struct d_tm_node_t *degraded_ranks;
};

/* Pool thread-local storage */
Expand Down
44 changes: 43 additions & 1 deletion src/pool/srv_metrics.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2021-2022 Intel Corporation.
* (C) Copyright 2021-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -65,6 +65,48 @@ ds_pool_metrics_alloc(const char *path, int tgt_id)
if (rc != 0)
D_WARN("Failed to create pool query space counter: "DF_RC"\n", DP_RC(rc));

rc = d_tm_add_metric(&metrics->service_leader, D_TM_GAUGE, "Pool service leader rank", NULL,
"%s/svc/leader", path);
if (rc != 0)
DL_WARN(rc, "Failed to create pool service leader metric");

rc = d_tm_add_metric(&metrics->map_version, D_TM_COUNTER, "Pool map version", NULL,
"%s/svc/map_version", path);
if (rc != 0)
DL_WARN(rc, "Failed to create pool map version metric");

rc = d_tm_add_metric(&metrics->open_handles, D_TM_GAUGE, "Pool handles held by clients",
NULL, "%s/svc/open_pool_handles", path);
if (rc != 0)
DL_WARN(rc, "Failed to create pool handle metric");

rc = d_tm_add_metric(&metrics->total_ranks, D_TM_GAUGE, "Pool storage ranks (total)", NULL,
"%s/svc/total_ranks", path);
if (rc != 0)
DL_WARN(rc, "Failed to create pool total_ranks metric");

rc = d_tm_add_metric(&metrics->degraded_ranks, D_TM_GAUGE, "Pool storage ranks (degraded)",
NULL, "%s/svc/degraded_ranks", path);
if (rc != 0)
DL_WARN(rc, "Failed to create pool degraded_ranks metric");

rc = d_tm_add_metric(&metrics->total_targets, D_TM_GAUGE, "Pool storage targets (total)",
NULL, "%s/svc/total_targets", path);
if (rc != 0)
DL_WARN(rc, "Failed to create pool total_targets metric");

rc = d_tm_add_metric(&metrics->draining_targets, D_TM_GAUGE,
"Pool storage targets (draining)", NULL, "%s/svc/draining_targets",
path);
if (rc != 0)
DL_WARN(rc, "Failed to create pool draining_targets metric");

rc = d_tm_add_metric(&metrics->disabled_targets, D_TM_GAUGE,
"Pool storage targets (disabled)", NULL, "%s/svc/disabled_targets",
path);
if (rc != 0)
DL_WARN(rc, "Failed to create pool disabled_targets metric");

return metrics;
}

Expand Down
155 changes: 154 additions & 1 deletion src/pool/srv_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -1620,6 +1620,136 @@ pool_svc_check_node_status(struct pool_svc *svc)
D_PRINT(fmt, ## __VA_ARGS__); \
} while (0)

/**
 * Refresh the map-derived service gauges (target and rank counts) from a
 * pool map.
 *
 * The gauges are written in this order: disabled_targets, draining_targets,
 * total_targets, degraded_ranks, total_ranks. The first failing pool map
 * query aborts the update, so gauges written before the failure keep their
 * new values and the remaining ones are left untouched.
 *
 * \param[in]	uuid	Pool UUID; used in log messages and passed to
 *			pool_map_get_ranks().
 * \param[in]	map	Pool map to take the counts from.
 * \param[in]	metrics	Pool metrics whose gauges are updated.
 *
 * \return	0 on success, -DER_INVAL if \a map or \a metrics is NULL,
 *		otherwise the error returned by the failing query.
 */
static int
pool_svc_update_map_metrics(uuid_t uuid, struct pool_map *map, struct pool_metrics *metrics)
{
	unsigned int   tgts_total    = 0;
	unsigned int   tgts_draining = 0;
	unsigned int   tgts_disabled = 0;
	unsigned int   ranks_degraded;
	unsigned int   ranks_enabled;
	d_rank_list_t *rank_list;
	int            rc;

	if (map == NULL || metrics == NULL)
		return -DER_INVAL;

	/* Target counts; only the counts are requested (NULL target array). */
	rc = pool_map_find_failed_tgts(map, NULL, &tgts_disabled);
	if (rc != 0) {
		DL_ERROR(rc, DF_UUID ": failed to get failed targets", DP_UUID(uuid));
		return rc;
	}
	d_tm_set_gauge(metrics->disabled_targets, tgts_disabled);

	rc = pool_map_find_tgts_by_state(map, PO_COMP_ST_DRAIN, NULL, &tgts_draining);
	if (rc != 0) {
		DL_ERROR(rc, DF_UUID ": failed to get draining targets", DP_UUID(uuid));
		return rc;
	}
	d_tm_set_gauge(metrics->draining_targets, tgts_draining);

	/* NOTE(review): a state of -1 presumably matches every target state. */
	rc = pool_map_find_tgts_by_state(map, -1, NULL, &tgts_total);
	if (rc != 0) {
		DL_ERROR(rc, DF_UUID ": failed to get total targets", DP_UUID(uuid));
		return rc;
	}
	d_tm_set_gauge(metrics->total_targets, tgts_total);

	/* Rank counts: fetch the non-enabled list first, then the enabled one. */
	rc = pool_map_get_ranks(uuid, map, false, &rank_list);
	if (rc != 0) {
		DL_ERROR(rc, DF_UUID ": failed to get degraded ranks", DP_UUID(uuid));
		return rc;
	}
	ranks_degraded = rank_list->rl_nr;
	d_rank_list_free(rank_list);
	d_tm_set_gauge(metrics->degraded_ranks, ranks_degraded);

	rc = pool_map_get_ranks(uuid, map, true, &rank_list);
	if (rc != 0) {
		DL_ERROR(rc, DF_UUID ": failed to get enabled ranks", DP_UUID(uuid));
		return rc;
	}
	ranks_enabled = rank_list->rl_nr;
	d_rank_list_free(rank_list);

	/* The total is the sum of the two disjoint-by-query rank lists. */
	d_tm_set_gauge(metrics->total_ranks, ranks_enabled + ranks_degraded);

	return 0;
}

/**
 * Iteration callback that counts how many times it is invoked.
 *
 * The handle, key and value arguments are ignored; each call simply adds
 * one to the caller-supplied counter.
 *
 * \param[in]		ih	Unused.
 * \param[in]		key	Unused.
 * \param[in]		val	Unused.
 * \param[in,out]	varg	Pointer to a uint64_t counter to increment.
 *
 * \return	0 on success, -DER_INVAL if \a varg is NULL.
 */
static int
count_iter_cb(daos_handle_t ih, d_iov_t *key, d_iov_t *val, void *varg)
{
	uint64_t *nr_seen = varg;

	if (nr_seen != NULL) {
		++(*nr_seen);
		return 0;
	}

	return -DER_INVAL;
}

/**
 * Populate the pool service metrics for a rank that is stepping up.
 *
 * Builds a temporary pool map from \a map_buf, publishes the leader rank and
 * map version, refreshes the map-derived gauges, and then counts the entries
 * under svc->ps_handles inside an rdb transaction to seed the open-handle
 * gauge.
 *
 * \param[in]	svc		Pool service whose metrics are updated.
 * \param[in]	leader		Rank to publish in the service_leader gauge.
 * \param[in]	map_version	Version to publish and to build the map with.
 * \param[in]	map_buf		Pool map buffer the temporary map is built from.
 *
 * \return	0 on success, or the error from pool_map_create(),
 *		rdb_tx_begin() or rdb_tx_iterate(). A failure to refresh the
 *		map-derived gauges is only logged and does not affect the
 *		return value.
 */
static int
pool_svc_step_up_metrics(struct pool_svc *svc, d_rank_t leader, uint32_t map_version,
			 struct pool_buf *map_buf)
{
	struct pool_metrics *metrics = svc->ps_pool->sp_metrics[DAOS_POOL_MODULE];
	struct pool_map     *map;
	struct rdb_tx        tx;
	uint64_t             nr_handles = 0;
	int                  map_rc;
	int                  rc;

	rc = pool_map_create(map_buf, map_version, &map);
	if (rc != 0) {
		DL_ERROR(rc, DF_UUID ": failed to create pool map", DP_UUID(svc->ps_uuid));
		return rc;
	}

	d_tm_set_gauge(metrics->service_leader, leader);
	d_tm_set_counter(metrics->map_version, map_version);

	/* Best effort: kept in its own variable so it never leaks into rc. */
	map_rc = pool_svc_update_map_metrics(svc->ps_uuid, map, metrics);
	if (map_rc != 0) {
		DL_WARN(map_rc, DF_UUID ": failed to update pool metrics", DP_UUID(svc->ps_uuid));
	}

	rc = rdb_tx_begin(svc->ps_rsvc.s_db, svc->ps_rsvc.s_term, &tx);
	if (rc != 0) {
		DL_ERROR(rc, DF_UUID ": failed to get rdb transaction", DP_UUID(svc->ps_uuid));
		goto release_map;
	}

	/* Walk the handle KVS only to count its entries. */
	rc = rdb_tx_iterate(&tx, &svc->ps_handles, false, count_iter_cb, &nr_handles);
	if (rc == 0) {
		d_tm_set_gauge(metrics->open_handles, nr_handles);
	} else {
		DL_ERROR(rc, DF_UUID ": failed to count open pool handles", DP_UUID(svc->ps_uuid));
	}

	rdb_tx_end(&tx);
release_map:
	pool_map_decref(map);
	return rc;
}

/**
 * Reset every pool service metric to zero when this rank steps down.
 *
 * Zeroed values signal to telemetry consumers that this rank is not the
 * pool service leader.
 *
 * \param[in]	svc	Pool service whose metrics are cleared.
 */
static void
pool_svc_step_down_metrics(struct pool_svc *svc)
{
	struct pool_metrics *metrics = svc->ps_pool->sp_metrics[DAOS_POOL_MODULE];
	/* Gauges cleared after the leader gauge and the map version counter. */
	struct d_tm_node_t  *gauges[] = {
		metrics->open_handles,
		metrics->draining_targets,
		metrics->disabled_targets,
		metrics->total_targets,
		metrics->degraded_ranks,
		metrics->total_ranks,
	};
	size_t               idx;

	d_tm_set_gauge(metrics->service_leader, 0);
	/* map_version is a counter, so it is reset through the counter API. */
	d_tm_set_counter(metrics->map_version, 0);

	for (idx = 0; idx < sizeof(gauges) / sizeof(gauges[0]); idx++)
		d_tm_set_gauge(gauges[idx], 0);
}

static void pool_svc_schedule(struct pool_svc *svc, struct pool_svc_sched *sched,
void (*func)(void *));
static void pool_svc_reconf_ult(void *arg);
Expand Down Expand Up @@ -1718,6 +1848,13 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)
if (rc != 0)
goto out;

rc = pool_svc_step_up_metrics(svc, rank, map_version, map_buf);
if (rc != 0) {
DL_ERROR(rc, DF_UUID ": failed to initialize pool service metrics",
DP_UUID(svc->ps_uuid));
D_GOTO(out, rc);
}

DS_POOL_NOTE_PRINT(DF_UUID": rank %u became pool service leader "DF_U64": srv_pool_hdl="
DF_UUID" srv_cont_hdl="DF_UUID"\n", DP_UUID(svc->ps_uuid), rank,
svc->ps_rsvc.s_term, DP_UUID(pool_hdl_uuid), DP_UUID(cont_hdl_uuid));
Expand Down Expand Up @@ -1752,6 +1889,7 @@ pool_svc_step_down_cb(struct ds_rsvc *rsvc)
struct pool_svc *svc = pool_svc_obj(rsvc);
d_rank_t rank = dss_self_rank();

pool_svc_step_down_metrics(svc);
fini_events(svc);
sched_cancel_and_wait(&svc->ps_reconf_sched);
sched_cancel_and_wait(&svc->ps_rfcheck_sched);
Expand All @@ -1770,7 +1908,8 @@ pool_svc_drain_cb(struct ds_rsvc *rsvc)
static int
pool_svc_map_dist_cb(struct ds_rsvc *rsvc)
{
struct pool_svc *svc = pool_svc_obj(rsvc);
struct pool_svc *svc = pool_svc_obj(rsvc);
struct pool_metrics *metrics;
struct rdb_tx tx;
struct pool_buf *map_buf = NULL;
uint32_t map_version;
Expand All @@ -1797,6 +1936,9 @@ pool_svc_map_dist_cb(struct ds_rsvc *rsvc)
D_GOTO(out, rc);
}
svc->ps_global_map_version = max(svc->ps_global_map_version, map_version);

metrics = svc->ps_pool->sp_metrics[DAOS_POOL_MODULE];
d_tm_set_counter(metrics->map_version, map_version);
out:
if (map_buf != NULL)
D_FREE(map_buf);
Expand Down Expand Up @@ -3025,6 +3167,7 @@ ds_pool_connect_handler(crt_rpc_t *rpc, int handler_version)
/** update metric */
metrics = svc->ps_pool->sp_metrics[DAOS_POOL_MODULE];
d_tm_inc_counter(metrics->connect_total, 1);
d_tm_inc_gauge(metrics->open_handles, 1);

if (in->pci_query_bits & DAOS_PO_QUERY_SPACE)
rc = pool_space_query_bcast(rpc->cr_ctx, svc, in->pci_op.pi_hdl,
Expand Down Expand Up @@ -3116,6 +3259,7 @@ pool_disconnect_hdls(struct rdb_tx *tx, struct pool_svc *svc, uuid_t *hdl_uuids,
{
d_iov_t value;
uint32_t nhandles;
struct pool_metrics *metrics;
int i;
int rc;

Expand Down Expand Up @@ -3159,6 +3303,8 @@ pool_disconnect_hdls(struct rdb_tx *tx, struct pool_svc *svc, uuid_t *hdl_uuids,
if (rc != 0)
D_GOTO(out, rc);

metrics = svc->ps_pool->sp_metrics[DAOS_POOL_MODULE];
d_tm_dec_gauge(metrics->open_handles, n_hdl_uuids);
out:
if (rc == 0)
D_INFO(DF_UUID": success\n", DP_UUID(svc->ps_uuid));
Expand Down Expand Up @@ -6103,6 +6249,13 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc,
if (opc == MAP_EXCLUDE)
pool_svc_schedule(svc, &svc->ps_rfcheck_sched, pool_svc_rfcheck_ult);

rc = pool_svc_update_map_metrics(svc->ps_uuid, map,
svc->ps_pool->sp_metrics[DAOS_POOL_MODULE]);
if (rc != 0) {
DL_WARN(rc, DF_UUID ": failed to update pool metrics", DP_UUID(svc->ps_uuid));
rc = 0; /* not fatal */
}

out_map_buf:
pool_buf_free(map_buf);
out_map:
Expand Down
Loading

0 comments on commit 116ec0e

Please sign in to comment.