Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-13380 engine: test tgt_nr #12537

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions src/control/server/ctl_storage_rpc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1569,7 +1569,10 @@ func TestServer_CtlSvc_StorageScan_PostEngineStart(t *testing.T) {
var engineCfgs []*engine.Config
for i, sc := range tc.storageCfgs {
log.Debugf("storage cfg contains bdevs %v for engine %d", sc.Bdevs(), i)
engineCfgs = append(engineCfgs, engine.MockConfig().WithStorage(sc...))
engineCfgs = append(engineCfgs,
engine.MockConfig().
WithStorage(sc...).
WithTargetCount(tc.engineTargetCount[i]))
}
sCfg := config.DefaultServer().WithEngines(engineCfgs...)
cs := mockControlService(t, log, sCfg, csbmbc, tc.smbc, tc.smsc)
Expand Down Expand Up @@ -1625,7 +1628,6 @@ func TestServer_CtlSvc_StorageScan_PostEngineStart(t *testing.T) {
}
te.setDrpcClient(newMockDrpcClient(dcc))
te._superblock.Rank = ranklist.NewRankPtr(uint32(idx + 1))
te.setTargetCount(tc.engineTargetCount[idx])
for _, tc := range te.storage.GetBdevConfigs() {
tc.Bdev.DeviceRoles.OptionBits = storage.OptionBits(storage.BdevRoleAll)
}
Expand Down
8 changes: 0 additions & 8 deletions src/control/server/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -338,14 +338,6 @@ func (ei *EngineInstance) setHugepageSz(hpSizeMb int) {
ei.runner.GetConfig().HugepageSz = hpSizeMb
}

// setTargetCount stores numTargets as the TargetCount of the config
// returned by the instance's runner. The instance lock is held for the
// duration of the update.
func (ei *EngineInstance) setTargetCount(numTargets int) {
	ei.Lock()
	defer ei.Unlock()

	cfg := ei.runner.GetConfig()
	cfg.TargetCount = numTargets
}

// GetTargetCount returns the target count set for this instance.
func (ei *EngineInstance) GetTargetCount() int {
ei.RLock()
Expand Down
6 changes: 0 additions & 6 deletions src/control/server/instance_exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,6 @@ func (ei *EngineInstance) finishStartup(ctx context.Context, ready *srvpb.Notify
if err := ei.handleReady(ctx, ready); err != nil {
return err
}
// update engine target count to reflect allocated number of targets, not number requested
// when starting
// NOTE: Engine mem_size passed on engine invocation is based on the number of targets
// requested in config so if number of targets allocated doesn't match the number of
// targets requested the mem_size value may be inappropriate.
ei.setTargetCount(int(ready.GetNtgts()))

ei.ready.SetTrue()

Expand Down
99 changes: 43 additions & 56 deletions src/engine/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,7 @@
static char modules[MAX_MODULE_OPTIONS + 1];

/**
* Number of target threads the user would like to start
* 0 means default value, see dss_tgt_nr_get();
* Number of target threads the user would like to start.
*/
static unsigned int nr_threads;

Expand Down Expand Up @@ -250,56 +249,51 @@ modules_load(void)
return rc;
}

/**
 * Number of cores required to run \a tgt_nr targets and \a nr_helpers helper
 * xstreams, including the DAOS_TGT0_OFFSET cores reserved for system service.
 */
static unsigned int
ncores_needed(unsigned int tgt_nr, unsigned int nr_helpers)
{
	unsigned int total = DAOS_TGT0_OFFSET;

	total += tgt_nr;
	total += nr_helpers;

	return total;
}

/**
* Get the appropriate number of main XS based on the number of cores and
* passed in preferred number of threads.
* Check whether #targets and #nr_xs_helpers are valid for starting the server. #nr_xs_helpers
* may be reduced.
*/
static int
dss_tgt_nr_get(unsigned int ncores, unsigned int nr, bool oversubscribe)
dss_tgt_nr_check(unsigned int ncores, unsigned int tgt_nr, bool oversubscribe)
{
int tgt_nr;

D_ASSERT(ncores >= 1);

/* at most 2 helper XS per target */
if (dss_tgt_offload_xs_nr > 2 * nr)
dss_tgt_offload_xs_nr = 2 * nr;
else if (dss_tgt_offload_xs_nr == 0)
if (dss_tgt_offload_xs_nr > 2 * tgt_nr) {
D_PRINT("#nr_xs_helpers(%d) cannot exceed 2 times #targets (2 x %d = %d).\n",
dss_tgt_offload_xs_nr, tgt_nr, 2 * tgt_nr);
dss_tgt_offload_xs_nr = 2 * tgt_nr;
} else if (dss_tgt_offload_xs_nr == 0) {
D_WARN("Suggest to config at least 1 helper XS per DAOS engine\n");
}

/* Each system XS uses one core, and with dss_tgt_offload_xs_nr
* offload XS. Calculate the tgt_nr as the number of main XS based
* on number of cores.
*/
retry:
tgt_nr = ncores - DAOS_TGT0_OFFSET - dss_tgt_offload_xs_nr;
if (tgt_nr <= 0)
tgt_nr = 1;

/* If user requires less target threads then set it as dss_tgt_nr,
* if user oversubscribes, then:
* . if oversubscribe is enabled, use the required number
* . if oversubscribe is disabled(default),
* use the number calculated above
* Note: oversubscribing may hurt performance.
*/
if (nr >= 1 && ((nr < tgt_nr) || oversubscribe)) {
tgt_nr = nr;
if (dss_tgt_offload_xs_nr > 2 * tgt_nr)
dss_tgt_offload_xs_nr = 2 * tgt_nr;
} else if (dss_tgt_offload_xs_nr > 2 * tgt_nr) {
dss_tgt_offload_xs_nr--;
goto retry;
if (oversubscribe) {
if (ncores_needed(tgt_nr, dss_tgt_offload_xs_nr) > ncores)
D_PRINT("Force to start engine with %d targets %d xs_helpers on %d cores("
"%d cores reserved for system service).\n",
tgt_nr, dss_tgt_offload_xs_nr, ncores, DAOS_TGT0_OFFSET);
goto out;
}

if (tgt_nr != nr)
D_PRINT("%d target XS(xstream) requested (#cores %d); "
"use (%d) target XS\n", nr, ncores, tgt_nr);
if (ncores_needed(tgt_nr, dss_tgt_offload_xs_nr) > ncores) {
D_ERROR("cannot start engine with %d targets %d xs_helpers on %d cores, may try "
"with DAOS_TARGET_OVERSUBSCRIBE=1 or reduce #targets/#nr_xs_helpers("
"%d cores reserved for system service).\n",
tgt_nr, dss_tgt_offload_xs_nr, ncores, DAOS_TGT0_OFFSET);
return -DER_INVAL;
}

out:
if (dss_tgt_offload_xs_nr % tgt_nr != 0)
dss_helper_pool = true;

return tgt_nr;
return 0;
}

static int
Expand All @@ -321,35 +315,30 @@ dss_topo_init()
depth = hwloc_get_type_depth(dss_topo, HWLOC_OBJ_NUMANODE);
numa_node_nr = hwloc_get_nbobjs_by_depth(dss_topo, depth);
d_getenv_bool("DAOS_TARGET_OVERSUBSCRIBE", &tgt_oversub);
dss_tgt_nr = nr_threads;

/* if no NUMA node was specified, or NUMA data unavailable */
/* fall back to the legacy core allocation algorithm */
if (dss_numa_node == -1 || numa_node_nr <= 0) {
D_PRINT("Using legacy core allocation algorithm\n");
dss_tgt_nr = dss_tgt_nr_get(dss_core_nr, nr_threads,
tgt_oversub);

if (dss_core_offset >= dss_core_nr) {
D_ERROR("invalid dss_core_offset %u "
"(set by \"-f\" option),"
" should within range [0, %u]",
D_ERROR("invalid dss_core_offset %u (set by \"-f\" option), should within "
"range [0, %u]\n",
dss_core_offset, dss_core_nr - 1);
return -DER_INVAL;
}
return 0;

return dss_tgt_nr_check(dss_core_nr, dss_tgt_nr, tgt_oversub);
}

if (dss_numa_node > numa_node_nr) {
D_ERROR("Invalid NUMA node selected. "
"Must be no larger than %d\n",
numa_node_nr);
D_ERROR("Invalid NUMA node selected. Must be no larger than %d\n", numa_node_nr);
return -DER_INVAL;
}

numa_obj = hwloc_get_obj_by_depth(dss_topo, depth, dss_numa_node);
if (numa_obj == NULL) {
D_ERROR("NUMA node %d was not found in the topology",
dss_numa_node);
D_ERROR("NUMA node %d was not found in the topology\n", dss_numa_node);
return -DER_INVAL;
}

Expand Down Expand Up @@ -381,17 +370,15 @@ dss_topo_init()
hwloc_bitmap_asprintf(&cpuset, core_allocation_bitmap);
free(cpuset);

dss_tgt_nr = dss_tgt_nr_get(dss_num_cores_numa_node, nr_threads,
tgt_oversub);
if (dss_core_offset >= dss_num_cores_numa_node) {
D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), "
"should within range [0, %d]", dss_core_offset,
dss_num_cores_numa_node - 1);
D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), should within range "
"[0, %d]\n",
dss_core_offset, dss_num_cores_numa_node - 1);
return -DER_INVAL;
}

D_PRINT("Using NUMA core allocation algorithm\n");
return 0;

return dss_tgt_nr_check(dss_num_cores_numa_node, dss_tgt_nr, tgt_oversub);
}

static ABT_mutex server_init_state_mutex;
Expand Down
3 changes: 2 additions & 1 deletion src/tests/ftest/control/daos_agent_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
from apricot import TestWithServers
from apricot import TestWithServers, skipForTicket
from agent_utils import include_local_host
from exception_utils import CommandFailure

Expand All @@ -23,6 +23,7 @@ def __init__(self, *args, **kwargs):
self.setup_start_agents = False
self.setup_start_servers = False

@skipForTicket("DAOS-13380")
def test_daos_agent_config_basic(self):
"""
JIRA ID: DAOS-1508
Expand Down
3 changes: 2 additions & 1 deletion src/tests/ftest/control/daos_control_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""


from apricot import TestWithServers
from apricot import TestWithServers, skipForTicket
from exception_utils import CommandFailure


Expand All @@ -16,6 +16,7 @@ class DaosControlConfigTest(TestWithServers):
:avocado: recursive
"""

@skipForTicket("DAOS-13380")
def test_daos_control_config_basic(self):
"""
JIRA ID: DAOS-1508
Expand Down
1 change: 1 addition & 0 deletions src/tests/ftest/control/daos_system_query.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ server_config:
engines:
0:
pinned_numa_node: 0
targets: 4
nr_xs_helpers: 1
fabric_iface_port: 31416
log_file: daos_server0.log
Expand Down
6 changes: 3 additions & 3 deletions src/tests/ftest/control/dmg_pool_query_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ server_config:
engines_per_host: 1
engines:
0:
targets: 8
targets: 4
storage: auto
system_ram_reserved: 64
pool:
Expand All @@ -26,8 +26,8 @@ ior:

exp_vals:
pool_status: 0
total_targets: 8
active_targets: 8
total_targets: 4
active_targets: 4
total_engines: 1
disabled_targets: 0
version: 1
Expand Down
1 change: 1 addition & 0 deletions src/tests/ftest/control/dmg_server_set_logmasks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ server_config:
engines_per_host: 1
engines:
0:
targets: 4
storage:
0:
class: ram
Expand Down
4 changes: 3 additions & 1 deletion src/tests/ftest/control/ms_resilience.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from ClusterShell.NodeSet import NodeSet

from apricot import TestWithServers
from apricot import TestWithServers, skipForTicket
from run_utils import stop_processes


Expand Down Expand Up @@ -323,6 +323,7 @@ def test_ms_resilience_2(self):
# Run test cases
self.verify_retained_quorum(2)

@skipForTicket("DAOS-13380")
def test_ms_resilience_3(self):
"""
JIRA ID: DAOS-3798
Expand All @@ -340,6 +341,7 @@ def test_ms_resilience_3(self):
# Run test case
self.verify_regained_quorum(1)

@skipForTicket("DAOS-13380")
def test_ms_resilience_4(self):
"""
JIRA ID: DAOS-3798
Expand Down
3 changes: 2 additions & 1 deletion src/tests/ftest/control/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import re
import json

from apricot import TestWithServers
from apricot import TestWithServers, skipForTicket
from general_utils import run_pcmd, report_errors
from server_utils_base import DaosServerCommandRunner

Expand All @@ -26,6 +26,7 @@ def __init__(self, *args, **kwargs):
self.setup_start_servers = False
self.setup_start_agents = False

@skipForTicket("DAOS-13380")
def test_version(self):
"""Verify version number for dmg, daos, daos_server, and daos_agent against RPM.

Expand Down
2 changes: 2 additions & 0 deletions src/tests/ftest/dfuse/daos_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from dfuse_test_base import DfuseTestBase
from run_utils import run_remote
from apricot import skipForTicket


class DaosBuild(DfuseTestBase):
Expand Down Expand Up @@ -49,6 +50,7 @@ def test_dfuse_daos_build_wt(self):
"""
self.run_build_test("writethrough")

@skipForTicket("DAOS-13380")
def test_dfuse_daos_build_wt_il(self):
"""This test builds DAOS on a dfuse filesystem.

Expand Down
1 change: 1 addition & 0 deletions src/tests/ftest/harness/core_files.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ server_config:
engines_per_host: 1
engines:
0:
targets: 4
storage:
0:
class: ram
Expand Down
2 changes: 1 addition & 1 deletion src/tests/ftest/pool/create_all_vm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ server_config:
engines_per_host: 1
engines:
0:
targets: 5
targets: 4
nr_xs_helpers: 0
storage:
0:
Expand Down
2 changes: 1 addition & 1 deletion src/tests/ftest/pool/query_attribute.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ server_config:
engines_per_host: 1
engines:
0:
targets: 8
targets: 4
nr_xs_helpers: 0
storage:
0:
Expand Down
5 changes: 0 additions & 5 deletions src/tests/ftest/server/daos_server_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -164,11 +164,6 @@ server_config_val: !mux
- "nr_xs_helpers"
- -10000
- "PASS"
targets_negative:
config_val:
- "targets"
- -1
- "PASS"
targets_str:
config_val:
- "targets"
Expand Down
2 changes: 1 addition & 1 deletion src/tests/ftest/telemetry/dkey_akey_enum_punch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ server_config:
engines_per_host: 1
engines:
0:
targets: 8
targets: 4
nr_xs_helpers: 0
storage:
0:
Expand Down
1 change: 1 addition & 0 deletions utils/nlt_server.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ engines:
- DAOS_MD_CAP=1024
- DAOS_STRICT_SHUTDOWN=1
- CRT_CTX_SHARE_ADDR=0
- DAOS_TARGET_OVERSUBSCRIBE=1
- ABT_STACK_OVERFLOW_CHECK=mprotect
storage:
-
Expand Down