Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-14261 engine: Add dss_chore for I/O forwarding (#13372) #14158

Merged
merged 14 commits into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 31 additions & 31 deletions .github/CODEOWNERS
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it intentional to comment these out?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah. Just realized it's for the google branch :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, just making it so we don't bug everyone.. Unfortunately, until CODEOWNERS lands, it will still do so.

Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -2,58 +2,58 @@
# a component sha1 to ensure that corresponding package build is done
#utils/build.config @daos-stack/release-engineering
# or updates packaging in any way
utils/rpms @daos-stack/build-and-release-watchers
#utils/rpms @daos-stack/build-and-release-watchers

src/gurt @daos-stack/common-watchers
src/common @daos-stack/common-watchers
#src/gurt @daos-stack/common-watchers
#src/common @daos-stack/common-watchers

# any PR that touches Go files should get a review from go-owners
*.go @daos-stack/go-owners @daos-stack/go-watchers
#*.go @daos-stack/go-owners @daos-stack/go-watchers

# Notify vos-watcher of files touched affecting VOS
src/vos/ @daos-stack/vos-owners @daos-stack/vos-watchers
src/common/btree*.* @daos-stack/vos-owners @daos-stack/vos-watchers
src/include/daos/btree*.* @daos-stack/vos-owners @daos-stack/vos-watchers
src/include/daos_srv/vos*.* @daos-stack/vos-owners @daos-stack/vos-watchers
src/include/daos_srv/evtree.h @daos-stack/vos-owners @daos-stack/vos-watchers
#src/vos/ @daos-stack/vos-owners @daos-stack/vos-watchers
#src/common/btree*.* @daos-stack/vos-owners @daos-stack/vos-watchers
#src/include/daos/btree*.* @daos-stack/vos-owners @daos-stack/vos-watchers
#src/include/daos_srv/vos*.* @daos-stack/vos-owners @daos-stack/vos-watchers
#src/include/daos_srv/evtree.h @daos-stack/vos-owners @daos-stack/vos-watchers

# Jenkinsfile changes should be reviewed by Release Engineering
Jenkinsfile @daos-stack/build-and-release-watchers
#Jenkinsfile @daos-stack/build-and-release-watchers

# any PR that touches client API or high level client code
src/client @daos-stack/client-api-owners @daos-stack/client-api-watchers
src/include/daos_*.* @daos-stack/client-api-owners @daos-stack/client-api-watchers
#src/client @daos-stack/client-api-owners @daos-stack/client-api-watchers
#src/include/daos_*.* @daos-stack/client-api-owners @daos-stack/client-api-watchers

# doc-watchers: files affecting documentation (docs, doxygen, etc.)
mkdocs.yml @daos-stack/doc-watchers
Doxyfile @daos-stack/doc-watchers
docs/ @daos-stack/doc-watchers
#mkdocs.yml @daos-stack/doc-watchers
#Doxyfile @daos-stack/doc-watchers
#docs/ @daos-stack/doc-watchers
#src/include/*.h @daos-stack/doc-watchers
*.md @daos-stack/doc-watchers
#*.md @daos-stack/doc-watchers

# dev-build-watchers: Files affecting local builds (e.g. SCons)
SConstruct @daos-stack/dev-build-owners @daos-stack/dev-build-watchers
SConscript @daos-stack/dev-build-owners @daos-stack/dev-build-watchers
site_scons/ @daos-stack/dev-build-owners @daos-stack/dev-build-watchers
utils/sl @daos-stack/dev-build-owners @daos-stack/dev-build-watchers
#SConstruct @daos-stack/dev-build-owners @daos-stack/dev-build-watchers
#SConscript @daos-stack/dev-build-owners @daos-stack/dev-build-watchers
#site_scons/ @daos-stack/dev-build-owners @daos-stack/dev-build-watchers
#utils/sl @daos-stack/dev-build-owners @daos-stack/dev-build-watchers

# ftest-watchers: files affecting functional tests
src/tests/ftest @daos-stack/ftest-owners @daos-stack/ftest-watchers
#src/tests/ftest @daos-stack/ftest-owners @daos-stack/ftest-watchers

# telem-watchers: Changes related to the telemetry library
src/utils/daos_metrics @daos-stack/telem-watchers
src/gurt/telemetry.c @daos-stack/telem-watchers
#src/utils/daos_metrics @daos-stack/telem-watchers
#src/gurt/telemetry.c @daos-stack/telem-watchers

# PRs that touch the object layer
src/object/ @daos-stack/object-owners @daos-stack/object-watchers
src/rebuild/ @daos-stack/object-owners @daos-stack/object-watchers
src/dtx/ @daos-stack/object-owners @daos-stack/object-watchers
#src/object/ @daos-stack/object-owners @daos-stack/object-watchers
#src/rebuild/ @daos-stack/object-owners @daos-stack/object-watchers
#src/dtx/ @daos-stack/object-owners @daos-stack/object-watchers

# PRs that touch the CaRT layer
src/cart/ @daos-stack/cart-owners @daos-stack/cart-watchers
#src/cart/ @daos-stack/cart-owners @daos-stack/cart-watchers

# PRs that touch the metadata
src/pool/ @daos-stack/metadata-owners @daos-stack/metadata-watchers
src/container/ @daos-stack/metadata-owners @daos-stack/metadata-watchers
src/rdb/ @daos-stack/metadata-owners @daos-stack/metadata-watchers
src/rsvc/ @daos-stack/metadata-owners @daos-stack/metadata-watchers
#src/pool/ @daos-stack/metadata-owners @daos-stack/metadata-watchers
#src/container/ @daos-stack/metadata-owners @daos-stack/metadata-watchers
#src/rdb/ @daos-stack/metadata-owners @daos-stack/metadata-watchers
#src/rsvc/ @daos-stack/metadata-owners @daos-stack/metadata-watchers
8 changes: 4 additions & 4 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -289,16 +289,16 @@ pipeline {
description: 'Run the Functional on Ubuntu 20.04 test stage' +
' Requires CI_MORE_FUNCTIONAL_PR_TESTS')
booleanParam(name: 'CI_medium_TEST',
defaultValue: true,
defaultValue: false,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

debugging?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I'm disabling hardware testing on the google branch so we don't use too much resources

description: 'Run the Functional Hardware Medium test stage')
booleanParam(name: 'CI_medium-verbs-provider_TEST',
defaultValue: true,
defaultValue: false,
description: 'Run the Functional Hardware Medium Verbs Provider test stage')
booleanParam(name: 'CI_medium-ucx-provider_TEST',
defaultValue: true,
defaultValue: false,
description: 'Run the Functional Hardware Medium UCX Provider test stage')
booleanParam(name: 'CI_large_TEST',
defaultValue: true,
defaultValue: false,
description: 'Run the Functional Hardware Large test stage')
string(name: 'CI_UNIT_VM1_LABEL',
defaultValue: 'ci_vm1',
Expand Down
4 changes: 2 additions & 2 deletions debian/rules
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ endif
SCONS = scons -j $(NUMJOBS)
DEB_SCONS_OPTIONS := --config=force --no-rpath USE_INSTALLED=all \
CONF_DIR=$(sysconfdir)/daos PREFIX=$(prefix) \
$(SCONS_ARGS)
$(SCONS_ARGS) STATIC_FUSE=0

export GOCACHE := $(CURDIR)/.gocache

Expand All @@ -34,7 +34,7 @@ override_dh_auto_build:

override_dh_auto_clean:
echo $(DEB_BUILD_OPTIONS)
$(SCONS) --clean
$(SCONS) --clean STATIC_FUSE=0
rm -rf build install
find . -name '*.pyc' -delete
rm -rf _build.external-Linux
Expand Down
2 changes: 1 addition & 1 deletion site_scons/prereq_tools/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,7 +520,7 @@ def run_build(self, opts):
# argobots is not really needed by client but it's difficult to separate
common_reqs = ['argobots', 'ucx', 'ofi', 'hwloc', 'mercury', 'boost', 'uuid',
'crypto', 'protobufc', 'lz4', 'isal', 'isal_crypto']
client_reqs = ['fuse', 'json-c', 'capstone', 'archive']
client_reqs = ['fuse', 'json-c', 'capstone']
server_reqs = ['pmdk', 'spdk', 'ipmctl']
test_reqs = ['cmocka']

Expand Down
18 changes: 18 additions & 0 deletions src/cart/utils/memcheck-cart.supp
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,24 @@
...
fun:indexbytebody
}
{
go-cond-racecall
Memcheck:Cond
...
fun:racecall
}
{
go-value8-write_racecall
Memcheck:Value8
fun:__tsan_write
fun:racecall
}
{
go-value8-racecall
Memcheck:Value8
fun:_ZN6__tsan9ShadowSetEPNS_9RawShadowES1_S0_
fun:racecall
}
{
FI leak 8
Memcheck:Leak
Expand Down
10 changes: 8 additions & 2 deletions src/control/lib/control/rpc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,11 @@ func TestControl_InvokeUnaryRPCAsync(t *testing.T) {
"request timeout": {
timeout: 1 * time.Nanosecond,
req: &testRequest{
rpcFn: func(_ context.Context, _ *grpc.ClientConn) (proto.Message, error) {
rpcFn: func(ctx context.Context, _ *grpc.ClientConn) (proto.Message, error) {
time.Sleep(1 * time.Microsecond)
if ctx.Err() != nil {
return nil, ctx.Err()
}
return defaultMessage, nil
},
},
Expand All @@ -120,7 +123,10 @@ func TestControl_InvokeUnaryRPCAsync(t *testing.T) {
}
}(),
req: &testRequest{
rpcFn: func(_ context.Context, _ *grpc.ClientConn) (proto.Message, error) {
rpcFn: func(ctx context.Context, _ *grpc.ClientConn) (proto.Message, error) {
if ctx.Err() != nil {
return nil, ctx.Err()
}
time.Sleep(10 * time.Second) // shouldn't be allowed to run this long
return defaultMessage, nil
},
Expand Down
4 changes: 4 additions & 0 deletions src/control/logging/syslog_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ func TestSyslogOutput(t *testing.T) {
t.Log("current user does not have permissions to view system log")
return
}
if _, err := syslog.New(syslog.LOG_ALERT, "test"); err != nil {
t.Logf("unable to connect to syslog: %s -- not running this test", err)
return
}

rand.Seed(time.Now().UnixNano())
runes := []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
Expand Down
4 changes: 2 additions & 2 deletions src/control/provider/system/system_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ func TestIsMounted(t *testing.T) {
expErr: errors.New("no such file or directory"),
},
"neither dir nor device": {
target: "/dev/log",
target: "/dev/stderr",
expErr: errors.New("not a valid mount target"),
},
} {
Expand Down Expand Up @@ -173,7 +173,7 @@ func TestSystemLinux_GetFsType(t *testing.T) {
expErr: syscall.ENOENT,
},
"temp dir": {
path: "/run",
path: "/dev",
expResult: &FsType{
Name: "tmpfs",
NoSUID: true,
Expand Down
88 changes: 53 additions & 35 deletions src/dtx/dtx_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -1914,10 +1914,16 @@ dtx_handle_resend(daos_handle_t coh, struct dtx_id *dti,
*/
#define DTX_EXEC_STEP_LENGTH DTX_THRESHOLD_COUNT

struct dtx_ult_arg {
struct dtx_chore {
struct dss_chore chore;
dtx_sub_func_t func;
void *func_arg;
struct dtx_leader_handle *dlh;

/* Chore-internal state variables */
uint32_t i;
uint32_t j;
uint32_t k;
};

static void
Expand Down Expand Up @@ -1970,28 +1976,42 @@ dtx_sub_comp_cb(struct dtx_leader_handle *dlh, int idx, int rc)
idx, tgt->st_rank, tgt->st_tgt_idx, tgt->st_flags, rc);
}

static void
dtx_leader_exec_ops_ult(void *arg)
static enum dss_chore_status
dtx_leader_exec_ops_chore(struct dss_chore *chore, bool is_reentrance)
{
struct dtx_ult_arg *ult_arg = arg;
struct dtx_leader_handle *dlh = ult_arg->dlh;
struct dtx_chore *dtx_chore = container_of(chore, struct dtx_chore, chore);
struct dtx_leader_handle *dlh = dtx_chore->dlh;
struct dtx_sub_status *sub;
struct daos_shard_tgt *tgt;
uint32_t i;
uint32_t j;
uint32_t k;
int rc = 0;

for (i = dlh->dlh_forward_idx, j = 0, k = 0; j < dlh->dlh_forward_cnt; i++, j++) {
sub = &dlh->dlh_subs[i];
/*
* If this is the first entrance, initialize the chore-internal state
* variables.
*/
if (is_reentrance) {
D_DEBUG(DB_TRACE, "%p: resume: i=%u j=%u k=%u forward_cnt=%u\n", chore,
dtx_chore->i, dtx_chore->j, dtx_chore->k, dlh->dlh_forward_cnt);
dtx_chore->i++;
dtx_chore->j++;
} else {
D_DEBUG(DB_TRACE, "%p: initialize: forward_idx=%u forward_cnt=%u\n", chore,
dlh->dlh_forward_idx, dlh->dlh_forward_cnt);
dtx_chore->i = dlh->dlh_forward_idx;
dtx_chore->j = 0;
dtx_chore->k = 0;
}

for (; dtx_chore->j < dlh->dlh_forward_cnt; dtx_chore->i++, dtx_chore->j++) {
sub = &dlh->dlh_subs[dtx_chore->i];
tgt = &sub->dss_tgt;

if (dlh->dlh_normal_sub_done == 0) {
sub->dss_result = 0;
sub->dss_comp = 0;

if (unlikely(tgt->st_flags & DTF_DELAY_FORWARD)) {
dtx_sub_comp_cb(dlh, i, 0);
dtx_sub_comp_cb(dlh, dtx_chore->i, 0);
continue;
}
} else {
Expand All @@ -2003,33 +2023,35 @@ dtx_leader_exec_ops_ult(void *arg)
}

if (tgt->st_rank == DAOS_TGT_IGNORE ||
(i == daos_fail_value_get() && DAOS_FAIL_CHECK(DAOS_DTX_SKIP_PREPARE))) {
(dtx_chore->i == daos_fail_value_get() &&
DAOS_FAIL_CHECK(DAOS_DTX_SKIP_PREPARE))) {
if (dlh->dlh_normal_sub_done == 0 || tgt->st_flags & DTF_DELAY_FORWARD)
dtx_sub_comp_cb(dlh, i, 0);
dtx_sub_comp_cb(dlh, dtx_chore->i, 0);
continue;
}

rc = ult_arg->func(dlh, ult_arg->func_arg, i, dtx_sub_comp_cb);
rc = dtx_chore->func(dlh, dtx_chore->func_arg, dtx_chore->i, dtx_sub_comp_cb);
if (rc != 0) {
if (sub->dss_comp == 0)
dtx_sub_comp_cb(dlh, i, rc);
dtx_sub_comp_cb(dlh, dtx_chore->i, rc);
break;
}

/* Yield to avoid holding CPU for too long time. */
if ((++k) % DTX_RPC_YIELD_THD == 0)
ABT_thread_yield();
if (++(dtx_chore->k) % DTX_RPC_YIELD_THD == 0)
return DSS_CHORE_YIELD;
}

if (rc != 0) {
for (i++, j++; j < dlh->dlh_forward_cnt; i++, j++) {
sub = &dlh->dlh_subs[i];
for (dtx_chore->i++, dtx_chore->j++; dtx_chore->j < dlh->dlh_forward_cnt;
dtx_chore->i++, dtx_chore->j++) {
sub = &dlh->dlh_subs[dtx_chore->i];
tgt = &sub->dss_tgt;

if (dlh->dlh_normal_sub_done == 0 || tgt->st_flags & DTF_DELAY_FORWARD) {
sub->dss_result = 0;
sub->dss_comp = 0;
dtx_sub_comp_cb(dlh, i, 0);
dtx_sub_comp_cb(dlh, dtx_chore->i, 0);
}
}
}
Expand All @@ -2039,6 +2061,8 @@ dtx_leader_exec_ops_ult(void *arg)
D_ASSERTF(rc == ABT_SUCCESS, "ABT_future_set failed [%u, %u), for delay %s: %d\n",
dlh->dlh_forward_idx, dlh->dlh_forward_idx + dlh->dlh_forward_cnt,
dlh->dlh_normal_sub_done == 1 ? "yes" : "no", rc);

return DSS_CHORE_DONE;
}

/**
Expand All @@ -2048,15 +2072,15 @@ int
dtx_leader_exec_ops(struct dtx_leader_handle *dlh, dtx_sub_func_t func,
dtx_agg_cb_t agg_cb, int allow_failure, void *func_arg)
{
struct dtx_ult_arg ult_arg;
struct dtx_chore dtx_chore;
int sub_cnt = dlh->dlh_normal_sub_cnt + dlh->dlh_delay_sub_cnt;
int rc = 0;
int local_rc = 0;
int remote_rc = 0;

ult_arg.func = func;
ult_arg.func_arg = func_arg;
ult_arg.dlh = dlh;
dtx_chore.func = func;
dtx_chore.func_arg = func_arg;
dtx_chore.dlh = dlh;

dlh->dlh_result = 0;
dlh->dlh_allow_failure = allow_failure;
Expand Down Expand Up @@ -2092,15 +2116,10 @@ dtx_leader_exec_ops(struct dtx_leader_handle *dlh, dtx_sub_func_t func,
D_GOTO(out, rc = dss_abterr2der(rc));
}

/*
* NOTE: Ideally, we probably should create ULT for each shard, but for performance
* reasons, let's only create one for all remote targets for now.
*/
rc = dss_ult_create(dtx_leader_exec_ops_ult, &ult_arg, DSS_XS_IOFW,
dss_get_module_info()->dmi_tgt_id, DSS_DEEP_STACK_SZ, NULL);
rc = dss_chore_delegate(&dtx_chore.chore, dtx_leader_exec_ops_chore);
if (rc != 0) {
D_ERROR("ult create failed [%u, %u] (2): "DF_RC"\n",
dlh->dlh_forward_idx, dlh->dlh_forward_cnt, DP_RC(rc));
DL_ERROR(rc, "chore create failed [%u, %u] (2)", dlh->dlh_forward_idx,
dlh->dlh_forward_cnt);
ABT_future_free(&dlh->dlh_future);
goto out;
}
Expand Down Expand Up @@ -2168,10 +2187,9 @@ dtx_leader_exec_ops(struct dtx_leader_handle *dlh, dtx_sub_func_t func,
/* The ones without DELAY flag will be skipped when scan the targets array. */
dlh->dlh_forward_cnt = dlh->dlh_normal_sub_cnt + dlh->dlh_delay_sub_cnt;

rc = dss_ult_create(dtx_leader_exec_ops_ult, &ult_arg, DSS_XS_IOFW,
dss_get_module_info()->dmi_tgt_id, DSS_DEEP_STACK_SZ, NULL);
rc = dss_chore_delegate(&dtx_chore.chore, dtx_leader_exec_ops_chore);
if (rc != 0) {
D_ERROR("ult create failed (4): "DF_RC"\n", DP_RC(rc));
DL_ERROR(rc, "chore create failed (4)");
ABT_future_free(&dlh->dlh_future);
goto out;
}
Expand Down
Loading
Loading