Skip to content

Commit

Permalink
DAOS-13672 control: Bump system_ram_reserved to reduce OOM occurrences (#12430)
Browse files Browse the repository at this point in the history

Attempt to reduce the chance of the OOM killer terminating an engine
process when maximum pool space is allocated by slightly increasing
the system_ram_reserved default value from 6 GiB to 16 GiB.

Some test YAML files now set system_ram_reserved explicitly to a lower
value (6 or 8) to prevent the increase in the default from causing an
available memory check failure on engine start-up in memory-constrained
(VM) environments. Increasing the default value should also resolve
DAOS-13918 by providing a larger memory buffer to reduce the chance
of intermittent failures related to this check.

Signed-off-by: Tom Nabarro <tom.nabarro@intel.com>
  • Loading branch information
tanabarr authored Aug 1, 2023
1 parent 216ee26 commit f9f935e
Show file tree
Hide file tree
Showing 11 changed files with 19 additions and 13 deletions.
8 changes: 4 additions & 4 deletions src/control/cmd/dmg/auto_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,17 +141,17 @@ func TestAuto_confGen(t *testing.T) {
Message: control.MockServerScanResp(t, "withSpaceUsage"),
}
storRespHighMem := control.MockServerScanResp(t, "withSpaceUsage")
// Total mem to meet requirements 34GiB hugeMem, 2GiB per engine rsvd, 6GiB sys rsvd,
// Total mem to meet requirements 34GiB hugeMem, 2GiB per engine rsvd, 16GiB sys rsvd,
// 5GiB per engine for tmpfs.
storRespHighMem.MemInfo.MemTotalKb = (humanize.GiByte * (34 + 4 + 6 + 10)) / humanize.KiByte
storRespHighMem.MemInfo.MemTotalKb = (humanize.GiByte * (34 + 4 + 16 + 10)) / humanize.KiByte
mockRamdiskSize := 5
storHostRespHighMem := &control.HostResponse{
Addr: "host1",
Message: storRespHighMem,
}
e0 := control.MockEngineCfg(0, 2, 4, 6, 8).WithHelperStreamCount(4)
e1 := control.MockEngineCfg(1, 1, 3, 5, 7).WithHelperStreamCount(4)
exmplEngineCfgs := []*engine.Config{e0, e1}
mockRamdiskSize := 5 // RoundDownGiB(16*0.75/2)
metadataMountPath := "/mnt/daos_md"
controlMetadata := storage.ControlMetadata{
Path: metadataMountPath,
Expand Down Expand Up @@ -406,7 +406,7 @@ disable_vfio: false
disable_vmd: false
enable_hotplug: false
nr_hugepages: 6144
system_ram_reserved: 6
system_ram_reserved: 16
disable_hugepages: false
control_log_mask: INFO
control_log_file: /tmp/daos_server.log
Expand Down
2 changes: 1 addition & 1 deletion src/control/lib/control/auto_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1573,7 +1573,7 @@ func TestControl_AutoConfig_genConfig(t *testing.T) {
MockEngineCfgTmpfs(1, 0, mockBdevTier(1, 3), mockBdevTier(1, 4, 5)),
},
hpSize: defHpSizeKb,
memTotal: (54 * humanize.GiByte) / humanize.KiByte,
memTotal: (64 * humanize.GiByte) / humanize.KiByte,
expCfg: MockServerCfg(exmplEngineCfg0.Fabric.Provider,
[]*engine.Config{
MockEngineCfgTmpfs(0, 5, /* tmpfs size in gib */
Expand Down
2 changes: 1 addition & 1 deletion src/control/server/storage/scm.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ const (

// Memory reservation constant defaults to be used when calculating RAM-disk size for DAOS I/O engine.
const (
DefaultSysMemRsvd = humanize.GiByte * 6 // per-system
DefaultSysMemRsvd = humanize.GiByte * 16 // per-system
DefaultTgtMemRsvd = humanize.MiByte * 128 // per-engine-target
DefaultEngineMemRsvd = humanize.GiByte * 1 // per-engine
)
Expand Down
8 changes: 4 additions & 4 deletions src/control/server/storage/scm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,28 +39,28 @@ func Test_CalcRamdiskSize(t *testing.T) {
expErr: errors.New("requires positive nonzero nr engines"),
},
"default values; low mem": {
memTotal: humanize.GiByte * 20,
memTotal: humanize.GiByte * 30,
memHuge: humanize.GiByte * 14,
memSys: DefaultSysMemRsvd,
tgtCount: 8,
engCount: 1,
expErr: errors.New("insufficient ram"), // 20 - (14+6+1) = -1
expErr: errors.New("insufficient ram"), // 30 - (14+16+1) = -1
},
"default values; high mem": {
memTotal: humanize.GiByte * 60,
memHuge: humanize.GiByte * 30,
memSys: DefaultSysMemRsvd,
tgtCount: 16,
engCount: 2,
expSize: humanize.GiByte * 10, // (60 - (30+6+4)) / 2
expSize: humanize.GiByte * 5, // (60 - (30+16+4)) / 2
},
"default values; low nr targets": {
memTotal: humanize.GiByte * 60,
memHuge: humanize.GiByte * 30,
memSys: DefaultSysMemRsvd,
tgtCount: 1,
engCount: 2,
expSize: humanize.GiByte * 11, // (60 - (30+6+2)) / 2
expSize: humanize.GiByte * 6, // (60 - (30+16+2)) / 2
},
"custom values; low sys reservation": {
memTotal: humanize.GiByte * 60,
Expand Down
1 change: 1 addition & 0 deletions src/tests/ftest/container/snapshot_aggregation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ server_config:
fabric_iface_port: 31417
log_file: daos_server1.log
storage: auto
system_ram_reserved: 8
pool:
control_method: dmg
scm_size: 80G
Expand Down
1 change: 1 addition & 0 deletions src/tests/ftest/control/dmg_server_set_logmasks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ timeout: 120
server_config:
name: daos_server
engines_per_host: 1
system_ram_reserved: 6
engines:
0:
targets: 4
Expand Down
1 change: 1 addition & 0 deletions src/tests/ftest/pool/create_all_vm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ test_two_pools:
server_config:
name: daos_server
engines_per_host: 1
system_ram_reserved: 6
engines:
0:
targets: 4
Expand Down
1 change: 1 addition & 0 deletions src/tests/ftest/security/cont_overwrite_acl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ timeout: 120
server_config:
name: daos_server
engines_per_host: 1
system_ram_reserved: 6
engines:
0:
targets: 4
Expand Down
1 change: 1 addition & 0 deletions src/tests/ftest/security/cont_update_acl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ timeout: 120
server_config:
name: daos_server
engines_per_host: 1
system_ram_reserved: 6
engines:
0:
targets: 4
Expand Down
2 changes: 1 addition & 1 deletion src/tests/ftest/telemetry/pool_space_metrics.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
hosts:
test_servers: 2
test_clients: 1
timeout: 120
timeout: 180
server_config:
name: daos_server
engines_per_host: 2
Expand Down
5 changes: 3 additions & 2 deletions utils/config/daos_server.yml
Original file line number Diff line number Diff line change
Expand Up @@ -235,9 +235,10 @@
## of RAM resulting in MemAvailable value being too low to support the calculated RAM-disk size
## increasing the value will reduce the calculated size. Alternatively, in situations where total
## RAM is low, reducing the value may prevent problems where RAM-disk size calculated is below the
## minimum of 4gib.
## minimum of 4gib. Increasing the value may help prevent the OOM killer from terminating
## engine processes, but could also prevent DAOS from using available memory resources.
#
## default: 6
## default: 16
#system_ram_reserved: 5
#
#
Expand Down

0 comments on commit f9f935e

Please sign in to comment.