Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-8331 client: Add client side metrics (#14030) #14204

Merged
merged 3 commits into from
Apr 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/cart/README.env
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,15 @@ This file lists the environment variables used in CaRT.
. CRT_TEST_CONT
When set to 1, orterun does not automatically shut down other servers when
one server is shutdown. Used in cart internal testing.

. D_CLIENT_METRICS_ENABLE
When set to 1, client side metrics will be collected on each daos client, which
can by retrieved by daos_metrics -j job_id on each client.

. D_CLIENT_METRICS_RETAIN
when set to 1, client side metrics will be retained even after the job exits, i.e.
those metrics can be retrieved by daos_metrics even after job exits.

. D_CLIENT_METRICS_DUMP_PATH
Set client side metrics dump path(file) for each client, so these metrics will be
dumped to the specified file when the job exits.
94 changes: 52 additions & 42 deletions src/cart/crt_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,50 @@ static volatile int gdata_init_flag;
struct crt_plugin_gdata crt_plugin_gdata;
static bool g_prov_settings_applied[CRT_PROV_COUNT];

/* List of the environment variables used in CaRT */
static const char *crt_env_names[] = {
"D_PROVIDER",
"D_INTERFACE",
"D_DOMAIN",
"D_PORT",
"CRT_PHY_ADDR_STR",
"D_LOG_STDERR_IN_LOG",
"D_LOG_SIZE",
"D_LOG_FILE",
"D_LOG_FILE_APPEND_PID",
"D_LOG_MASK",
"DD_MASK",
"DD_STDERR",
"DD_SUBSYS",
"CRT_TIMEOUT",
"CRT_ATTACH_INFO_PATH",
"OFI_PORT",
"OFI_INTERFACE",
"OFI_DOMAIN",
"CRT_CREDIT_EP_CTX",
"CRT_CTX_SHARE_ADDR",
"CRT_CTX_NUM",
"D_FI_CONFIG",
"FI_UNIVERSE_SIZE",
"CRT_ENABLE_MEM_PIN",
"FI_OFI_RXM_USE_SRX",
"D_LOG_FLUSH",
"CRT_MRC_ENABLE",
"CRT_SECONDARY_PROVIDER",
"D_PROVIDER_AUTH_KEY",
"D_PORT_AUTO_ADJUST",
"D_POLL_TIMEOUT",
"D_LOG_FILE_APPEND_RANK",
"D_QUOTA_RPCS",
"D_POST_INIT",
"D_POST_INCR",
"DAOS_SIGNAL_REGISTER",
"D_CLIENT_METRICS_ENABLE",
"D_CLIENT_METRICS_RETAIN",
"D_CLIENT_METRICS_DUMP_PATH",

};

static void
crt_lib_init(void) __attribute__((__constructor__));

Expand Down Expand Up @@ -62,53 +106,19 @@ crt_lib_fini(void)
static void
dump_envariables(void)
{
int i;
char *val;
static const char *var_names[] = {"D_PROVIDER",
"D_INTERFACE",
"D_DOMAIN",
"D_PORT",
"CRT_PHY_ADDR_STR",
"D_LOG_STDERR_IN_LOG",
"D_LOG_SIZE",
"D_LOG_FILE",
"D_LOG_FILE_APPEND_PID",
"D_LOG_MASK",
"DD_MASK",
"DD_STDERR",
"DD_SUBSYS",
"CRT_TIMEOUT",
"CRT_ATTACH_INFO_PATH",
"OFI_PORT",
"OFI_INTERFACE",
"OFI_DOMAIN",
"CRT_CREDIT_EP_CTX",
"CRT_CTX_SHARE_ADDR",
"CRT_CTX_NUM",
"D_FI_CONFIG",
"FI_UNIVERSE_SIZE",
"CRT_ENABLE_MEM_PIN",
"FI_OFI_RXM_USE_SRX",
"D_LOG_FLUSH",
"CRT_MRC_ENABLE",
"CRT_SECONDARY_PROVIDER",
"D_PROVIDER_AUTH_KEY",
"D_PORT_AUTO_ADJUST",
"D_POLL_TIMEOUT",
"D_LOG_FILE_APPEND_RANK",
"D_QUOTA_RPCS",
"D_POST_INIT",
"D_POST_INCR"};
int i;

D_INFO("-- ENVARS: --\n");
for (i = 0; i < ARRAY_SIZE(var_names); i++) {
d_agetenv_str(&val, var_names[i]);
for (i = 0; i < ARRAY_SIZE(crt_env_names); i++) {
char *val = NULL;

d_agetenv_str(&val, crt_env_names[i]);
if (val == NULL)
continue;
if (strcmp(var_names[i], "D_PROVIDER_AUTH_KEY") == 0)
D_INFO("%s = %s\n", var_names[i], "********");
if (strcmp(crt_env_names[i], "D_PROVIDER_AUTH_KEY") == 0)
D_INFO("%s = %s\n", crt_env_names[i], "********");
else
D_INFO("%s = %s\n", var_names[i], val);
D_INFO("%s = %s\n", crt_env_names[i], val);
d_freeenv_str(&val);
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/client/api/SConscript
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Build DAOS client"""

LIBDAOS_SRC = ['agent.c', 'array.c', 'container.c', 'event.c', 'init.c', 'job.c', 'kv.c', 'mgmt.c',
'object.c', 'pool.c', 'rpc.c', 'task.c', 'tx.c']
'object.c', 'pool.c', 'rpc.c', 'task.c', 'tx.c', 'metrics.c']


def scons():
Expand Down
14 changes: 13 additions & 1 deletion src/client/api/init.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2016-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand All @@ -23,6 +23,7 @@
#include <daos/btree_class.h>
#include <daos/placement.h>
#include <daos/job.h>
#include <daos/metrics.h>
#include "task_internal.h"
#include <pthread.h>

Expand Down Expand Up @@ -219,6 +220,13 @@ daos_init(void)
if (rc != 0)
D_GOTO(out_pl, rc);

/** set up client telemetry */
rc = dc_tm_init();
if (rc != 0) {
/* should not be fatal */
DL_WARN(rc, "failed to initialize client telemetry");
}

/** set up pool */
rc = dc_pool_init();
if (rc != 0)
Expand All @@ -242,6 +250,7 @@ daos_init(void)
out_pool:
dc_pool_fini();
out_mgmt:
dc_tm_fini();
dc_mgmt_fini();
out_pl:
pl_fini();
Expand Down Expand Up @@ -291,6 +300,8 @@ daos_fini(void)
D_GOTO(unlock, rc);
}

/** clean up all registered per-module metrics */
daos_metrics_fini();
dc_obj_fini();
dc_cont_fini();
dc_pool_fini();
Expand All @@ -301,6 +312,7 @@ daos_fini(void)
D_ERROR("failed to disconnect some resources may leak, "
DF_RC"\n", DP_RC(rc));

dc_tm_fini();
dc_agent_fini();
dc_job_fini();

Expand Down
Loading
Loading