From 42792025e9d1eb75374e53528bd1c2be1f1ce823 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Tue, 11 Jul 2023 17:06:34 +0100 Subject: [PATCH 1/6] DAOS-13672 control: Bump system_ram_reserved to reduce OOM occurrences Test-tag: pr daily_regression Test-nvme: auto_md_on_ssd Required-githooks: true Signed-off-by: Tom Nabarro --- src/control/cmd/dmg/auto_test.go | 8 ++++---- src/control/lib/control/auto_test.go | 4 ++-- src/control/server/storage/scm.go | 2 +- src/control/server/storage/scm_test.go | 4 ++-- src/tests/ftest/control/dmg_server_set_logmasks.yaml | 1 + src/tests/ftest/pool/create_all_vm.yaml | 1 + src/tests/ftest/security/cont_overwrite_acl.yaml | 1 + src/tests/ftest/security/cont_update_acl.yaml | 1 + utils/config/daos_server.yml | 5 +++-- 9 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/control/cmd/dmg/auto_test.go b/src/control/cmd/dmg/auto_test.go index 013187f8f8b..d86488953ca 100644 --- a/src/control/cmd/dmg/auto_test.go +++ b/src/control/cmd/dmg/auto_test.go @@ -141,9 +141,10 @@ func TestAuto_confGen(t *testing.T) { Message: control.MockServerScanResp(t, "withSpaceUsage"), } storRespHighMem := control.MockServerScanResp(t, "withSpaceUsage") - // Total mem to meet requirements 34GiB hugeMem, 2GiB per engine rsvd, 6GiB sys rsvd, + // Total mem to meet requirements 34GiB hugeMem, 2GiB per engine rsvd, 8GiB sys rsvd, // 5GiB per engine for tmpfs. - storRespHighMem.MemInfo.MemTotalKb = (humanize.GiByte * (34 + 4 + 6 + 10)) / humanize.KiByte + storRespHighMem.MemInfo.MemTotalKb = (humanize.GiByte * (34 + 4 + 8 + 10)) / humanize.KiByte + mockRamdiskSize := 5 storHostRespHighMem := &control.HostResponse{ Addr: "host1", Message: storRespHighMem, @@ -151,7 +152,6 @@ func TestAuto_confGen(t *testing.T) { e0 := control.MockEngineCfg(0, 2, 4, 6, 8).WithHelperStreamCount(4) e1 := control.MockEngineCfg(1, 1, 3, 5, 7).WithHelperStreamCount(4) exmplEngineCfgs := []*engine.Config{e0, e1} - mockRamdiskSize := 5 // RoundDownGiB(16*0.75/2) metadataMountPath := "/mnt/daos_md" controlMetadata := storage.ControlMetadata{ Path: metadataMountPath, @@ -406,7 +406,7 @@ disable_vfio: false disable_vmd: false enable_hotplug: false nr_hugepages: 6144 -system_ram_reserved: 6 +system_ram_reserved: 8 disable_hugepages: false control_log_mask: INFO control_log_file: /tmp/daos_server.log diff --git a/src/control/lib/control/auto_test.go b/src/control/lib/control/auto_test.go index 8e282d73f89..f7edcf421a3 100644 --- a/src/control/lib/control/auto_test.go +++ b/src/control/lib/control/auto_test.go @@ -1576,7 +1576,7 @@ func TestControl_AutoConfig_genConfig(t *testing.T) { memTotal: (54 * humanize.GiByte) / humanize.KiByte, expCfg: MockServerCfg(exmplEngineCfg0.Fabric.Provider, []*engine.Config{ - MockEngineCfgTmpfs(0, 5, /* tmpfs size in gib */ + MockEngineCfgTmpfs(0, 4, /* tmpfs size in gib */ mockBdevTier(0, 0).WithBdevDeviceRoles(4), mockBdevTier(0, 1, 2).WithBdevDeviceRoles(3)). WithHelperStreamCount(0). @@ -1585,7 +1585,7 @@ func TestControl_AutoConfig_genConfig(t *testing.T) { filepath.Join(controlMetadata.EngineDirectory(0), storage.BdevOutConfName), ), - MockEngineCfgTmpfs(1, 5, /* tmpfs size in gib */ + MockEngineCfgTmpfs(1, 4, /* tmpfs size in gib */ mockBdevTier(1, 3).WithBdevDeviceRoles(4), mockBdevTier(1, 4, 5).WithBdevDeviceRoles(3)). WithHelperStreamCount(0). diff --git a/src/control/server/storage/scm.go b/src/control/server/storage/scm.go index b8a440fdedb..428d06fffcf 100644 --- a/src/control/server/storage/scm.go +++ b/src/control/server/storage/scm.go @@ -51,7 +51,7 @@ const ( // Memory reservation constant defaults to be used when calculating RAM-disk size for DAOS I/O engine. const ( - DefaultSysMemRsvd = humanize.GiByte * 6 // per-system + DefaultSysMemRsvd = humanize.GiByte * 8 // per-system DefaultTgtMemRsvd = humanize.MiByte * 128 // per-engine-target DefaultEngineMemRsvd = humanize.GiByte * 1 // per-engine ) diff --git a/src/control/server/storage/scm_test.go b/src/control/server/storage/scm_test.go index b3f6d4327c6..80e59a340b7 100644 --- a/src/control/server/storage/scm_test.go +++ b/src/control/server/storage/scm_test.go @@ -52,7 +52,7 @@ func Test_CalcRamdiskSize(t *testing.T) { memSys: DefaultSysMemRsvd, tgtCount: 16, engCount: 2, - expSize: humanize.GiByte * 10, // (60 - (30+6+4)) / 2 + expSize: humanize.GiByte * 9, // (60 - (30+8+4)) / 2 }, "default values; low nr targets": { memTotal: humanize.GiByte * 60, @@ -60,7 +60,7 @@ func Test_CalcRamdiskSize(t *testing.T) { memSys: DefaultSysMemRsvd, tgtCount: 1, engCount: 2, - expSize: humanize.GiByte * 11, // (60 - (30+6+2)) / 2 + expSize: humanize.GiByte * 10, // (60 - (30+8+2)) / 2 }, "custom values; low sys reservation": { memTotal: humanize.GiByte * 60, diff --git a/src/tests/ftest/control/dmg_server_set_logmasks.yaml b/src/tests/ftest/control/dmg_server_set_logmasks.yaml index b4e3a1ddfeb..29b8e5326e1 100644 --- a/src/tests/ftest/control/dmg_server_set_logmasks.yaml +++ b/src/tests/ftest/control/dmg_server_set_logmasks.yaml @@ -4,6 +4,7 @@ timeout: 120 server_config: name: daos_server engines_per_host: 1 + system_ram_reserved: 6 engines: 0: storage: diff --git a/src/tests/ftest/pool/create_all_vm.yaml b/src/tests/ftest/pool/create_all_vm.yaml index 0e030ee8cbc..4daa7d3ea46 100644 --- a/src/tests/ftest/pool/create_all_vm.yaml +++ b/src/tests/ftest/pool/create_all_vm.yaml @@ -30,6 +30,7 @@ test_two_pools: server_config: name: daos_server engines_per_host: 1 + system_ram_reserved: 6 engines: 0: targets: 5 diff --git a/src/tests/ftest/security/cont_overwrite_acl.yaml b/src/tests/ftest/security/cont_overwrite_acl.yaml index dd5f4cabd79..83974c98fd6 100644 --- a/src/tests/ftest/security/cont_overwrite_acl.yaml +++ b/src/tests/ftest/security/cont_overwrite_acl.yaml @@ -7,6 +7,7 @@ timeout: 120 server_config: name: daos_server engines_per_host: 1 + system_ram_reserved: 6 engines: 0: targets: 4 diff --git a/src/tests/ftest/security/cont_update_acl.yaml b/src/tests/ftest/security/cont_update_acl.yaml index 65091e2b06a..78c114e7e27 100644 --- a/src/tests/ftest/security/cont_update_acl.yaml +++ b/src/tests/ftest/security/cont_update_acl.yaml @@ -7,6 +7,7 @@ timeout: 120 server_config: name: daos_server engines_per_host: 1 + system_ram_reserved: 6 engines: 0: targets: 4 diff --git a/utils/config/daos_server.yml b/utils/config/daos_server.yml index 3fba33dda67..86544779619 100644 --- a/utils/config/daos_server.yml +++ b/utils/config/daos_server.yml @@ -235,9 +235,10 @@ ## of RAM resulting in MemAvailable value being too low to support the calculated RAM-disk size ## increasing the value will reduce the calculate size. Alternatively in situations where total ## RAM is low, reducing the value may prevent problems where RAM-disk size calculated is below the -## minimum of 4gib. +## minimum of 4gib. Increasing the value may help avoid the potential of OOM killer terminating +## engine processes but could also result in stopping DAOS from using available memory resources. # -## default: 6 +## default: 8 #system_ram_reserved: 5 # # From a837b3f81946759f7d2a96ff153974c225e7a23e Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Wed, 19 Jul 2023 16:12:19 +0100 Subject: [PATCH 2/6] increase system_ram_reserved Value for some tests experiencing insufficient memory failures in CI Skip-func-test-vm: true Test-tag: pr daily_regression Test-nvme: auto_md_on_ssd Required-githooks: true Signed-off-by: Tom Nabarro --- src/tests/ftest/control/config_generate_run.yaml | 1 + src/tests/ftest/control/dmg_network_scan.yaml | 1 + src/tests/ftest/server/daos_server_config.yaml | 1 + 3 files changed, 3 insertions(+) diff --git a/src/tests/ftest/control/config_generate_run.yaml b/src/tests/ftest/control/config_generate_run.yaml index e834d889b73..27f2419f286 100644 --- a/src/tests/ftest/control/config_generate_run.yaml +++ b/src/tests/ftest/control/config_generate_run.yaml @@ -2,6 +2,7 @@ hosts: test_servers: 1 timeout: 250 server_config: + system_ram_reserved: 16 engines_per_host: 1 engines: 0: diff --git a/src/tests/ftest/control/dmg_network_scan.yaml b/src/tests/ftest/control/dmg_network_scan.yaml index 9efbd1715a8..ffec4e71b2e 100644 --- a/src/tests/ftest/control/dmg_network_scan.yaml +++ b/src/tests/ftest/control/dmg_network_scan.yaml @@ -8,6 +8,7 @@ server_config: port: 10001 control_log_mask: TRACE engines_per_host: 1 + system_ram_reserved: 16 engines: 0: storage: diff --git a/src/tests/ftest/server/daos_server_config.yaml b/src/tests/ftest/server/daos_server_config.yaml index 0a659990306..93613527b33 100644 --- a/src/tests/ftest/server/daos_server_config.yaml +++ b/src/tests/ftest/server/daos_server_config.yaml @@ -6,6 +6,7 @@ timeout: 130 server_config: name: daos_server engines_per_host: 1 + system_ram_reserved: 16 engines: 0: storage: From 5ef49617e7e587cc70dfc88a4e56b206067d0d24 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Mon, 24 Jul 2023 16:22:56 +0100 Subject: [PATCH 3/6] increase telemetry test timeout to resolve CI failures Test-tag: test_telemetry_pool_metrics Required-githooks: true Signed-off-by: Tom Nabarro --- src/tests/ftest/telemetry/telemetry_pool_metrics.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/telemetry/telemetry_pool_metrics.yaml b/src/tests/ftest/telemetry/telemetry_pool_metrics.yaml index f7716199385..48fa0157f46 100644 --- a/src/tests/ftest/telemetry/telemetry_pool_metrics.yaml +++ b/src/tests/ftest/telemetry/telemetry_pool_metrics.yaml @@ -1,7 +1,7 @@ hosts: test_servers: 4 test_clients: 1 -timeout: 240 +timeout: 300 server_config: name: daos_server engines_per_host: 1 From 5046edb28cc101ef690259a0011e6f923747484b Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Mon, 24 Jul 2023 23:13:59 +0100 Subject: [PATCH 4/6] correct test timeout change required to resolve CI failures Test-tag: test_telemetry_pool_space_metrics Required-githooks: true Signed-off-by: Tom Nabarro --- src/tests/ftest/telemetry/pool_space_metrics.yaml | 2 +- src/tests/ftest/telemetry/telemetry_pool_metrics.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/ftest/telemetry/pool_space_metrics.yaml b/src/tests/ftest/telemetry/pool_space_metrics.yaml index d041e936c54..459e2a1954d 100644 --- a/src/tests/ftest/telemetry/pool_space_metrics.yaml +++ b/src/tests/ftest/telemetry/pool_space_metrics.yaml @@ -1,7 +1,7 @@ hosts: test_servers: 2 test_clients: 1 -timeout: 120 +timeout: 180 server_config: name: daos_server engines_per_host: 2 diff --git a/src/tests/ftest/telemetry/telemetry_pool_metrics.yaml b/src/tests/ftest/telemetry/telemetry_pool_metrics.yaml index 48fa0157f46..f7716199385 100644 --- a/src/tests/ftest/telemetry/telemetry_pool_metrics.yaml +++ b/src/tests/ftest/telemetry/telemetry_pool_metrics.yaml @@ -1,7 +1,7 @@ hosts: test_servers: 4 test_clients: 1 -timeout: 300 +timeout: 240 server_config: name: daos_server engines_per_host: 1 From 3f3692db6957b90fad861e1cd1cfa73a7e1af73a Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Wed, 26 Jul 2023 08:00:39 +0100 Subject: [PATCH 5/6] increase system_ram_reserved default to 16 as per code review comment Required-githooks: true Signed-off-by: Tom Nabarro --- src/control/cmd/dmg/auto_test.go | 6 +++--- src/control/lib/control/auto_test.go | 6 +++--- src/control/server/storage/scm.go | 2 +- src/control/server/storage/scm_test.go | 8 ++++---- src/tests/ftest/control/config_generate_run.yaml | 1 - src/tests/ftest/control/dmg_network_scan.yaml | 1 - src/tests/ftest/server/daos_server_config.yaml | 1 - utils/config/daos_server.yml | 2 +- 8 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/control/cmd/dmg/auto_test.go b/src/control/cmd/dmg/auto_test.go index d86488953ca..d3406f4d61e 100644 --- a/src/control/cmd/dmg/auto_test.go +++ b/src/control/cmd/dmg/auto_test.go @@ -141,9 +141,9 @@ func TestAuto_confGen(t *testing.T) { Message: control.MockServerScanResp(t, "withSpaceUsage"), } storRespHighMem := control.MockServerScanResp(t, "withSpaceUsage") - // Total mem to meet requirements 34GiB hugeMem, 2GiB per engine rsvd, 8GiB sys rsvd, + // Total mem to meet requirements 34GiB hugeMem, 2GiB per engine rsvd, 16GiB sys rsvd, // 5GiB per engine for tmpfs. - storRespHighMem.MemInfo.MemTotalKb = (humanize.GiByte * (34 + 4 + 8 + 10)) / humanize.KiByte + storRespHighMem.MemInfo.MemTotalKb = (humanize.GiByte * (34 + 4 + 16 + 10)) / humanize.KiByte mockRamdiskSize := 5 storHostRespHighMem := &control.HostResponse{ Addr: "host1", @@ -406,7 +406,7 @@ disable_vfio: false disable_vmd: false enable_hotplug: false nr_hugepages: 6144 -system_ram_reserved: 8 +system_ram_reserved: 16 disable_hugepages: false control_log_mask: INFO control_log_file: /tmp/daos_server.log diff --git a/src/control/lib/control/auto_test.go b/src/control/lib/control/auto_test.go index f7edcf421a3..93a1d8ccd2a 100644 --- a/src/control/lib/control/auto_test.go +++ b/src/control/lib/control/auto_test.go @@ -1573,10 +1573,10 @@ func TestControl_AutoConfig_genConfig(t *testing.T) { MockEngineCfgTmpfs(1, 0, mockBdevTier(1, 3), mockBdevTier(1, 4, 5)), }, hpSize: defHpSizeKb, - memTotal: (54 * humanize.GiByte) / humanize.KiByte, + memTotal: (64 * humanize.GiByte) / humanize.KiByte, expCfg: MockServerCfg(exmplEngineCfg0.Fabric.Provider, []*engine.Config{ - MockEngineCfgTmpfs(0, 4, /* tmpfs size in gib */ + MockEngineCfgTmpfs(0, 5, /* tmpfs size in gib */ mockBdevTier(0, 0).WithBdevDeviceRoles(4), mockBdevTier(0, 1, 2).WithBdevDeviceRoles(3)). WithHelperStreamCount(0). @@ -1585,7 +1585,7 @@ func TestControl_AutoConfig_genConfig(t *testing.T) { filepath.Join(controlMetadata.EngineDirectory(0), storage.BdevOutConfName), ), - MockEngineCfgTmpfs(1, 4, /* tmpfs size in gib */ + MockEngineCfgTmpfs(1, 5, /* tmpfs size in gib */ mockBdevTier(1, 3).WithBdevDeviceRoles(4), mockBdevTier(1, 4, 5).WithBdevDeviceRoles(3)). WithHelperStreamCount(0). diff --git a/src/control/server/storage/scm.go b/src/control/server/storage/scm.go index 428d06fffcf..3296575d54b 100644 --- a/src/control/server/storage/scm.go +++ b/src/control/server/storage/scm.go @@ -51,7 +51,7 @@ const ( // Memory reservation constant defaults to be used when calculating RAM-disk size for DAOS I/O engine. const ( - DefaultSysMemRsvd = humanize.GiByte * 8 // per-system + DefaultSysMemRsvd = humanize.GiByte * 16 // per-system DefaultTgtMemRsvd = humanize.MiByte * 128 // per-engine-target DefaultEngineMemRsvd = humanize.GiByte * 1 // per-engine ) diff --git a/src/control/server/storage/scm_test.go b/src/control/server/storage/scm_test.go index 80e59a340b7..08e8638f8f4 100644 --- a/src/control/server/storage/scm_test.go +++ b/src/control/server/storage/scm_test.go @@ -39,12 +39,12 @@ func Test_CalcRamdiskSize(t *testing.T) { expErr: errors.New("requires positive nonzero nr engines"), }, "default values; low mem": { - memTotal: humanize.GiByte * 20, + memTotal: humanize.GiByte * 30, memHuge: humanize.GiByte * 14, memSys: DefaultSysMemRsvd, tgtCount: 8, engCount: 1, - expErr: errors.New("insufficient ram"), // 20 - (14+6+1) = -1 + expErr: errors.New("insufficient ram"), // 30 - (14+16+1) = -1 }, "default values; high mem": { memTotal: humanize.GiByte * 60, @@ -52,7 +52,7 @@ func Test_CalcRamdiskSize(t *testing.T) { memSys: DefaultSysMemRsvd, tgtCount: 16, engCount: 2, - expSize: humanize.GiByte * 9, // (60 - (30+8+4)) / 2 + expSize: humanize.GiByte * 5, // (60 - (30+16+4)) / 2 }, "default values; low nr targets": { memTotal: humanize.GiByte * 60, @@ -60,7 +60,7 @@ func Test_CalcRamdiskSize(t *testing.T) { memSys: DefaultSysMemRsvd, tgtCount: 1, engCount: 2, - expSize: humanize.GiByte * 10, // (60 - (30+8+2)) / 2 + expSize: humanize.GiByte * 6, // (60 - (30+16+2)) / 2 }, "custom values; low sys reservation": { memTotal: humanize.GiByte * 60, diff --git a/src/tests/ftest/control/config_generate_run.yaml b/src/tests/ftest/control/config_generate_run.yaml index 27f2419f286..e834d889b73 100644 --- a/src/tests/ftest/control/config_generate_run.yaml +++ b/src/tests/ftest/control/config_generate_run.yaml @@ -2,7 +2,6 @@ hosts: test_servers: 1 timeout: 250 server_config: - system_ram_reserved: 16 engines_per_host: 1 engines: 0: diff --git a/src/tests/ftest/control/dmg_network_scan.yaml b/src/tests/ftest/control/dmg_network_scan.yaml index ffec4e71b2e..9efbd1715a8 100644 --- a/src/tests/ftest/control/dmg_network_scan.yaml +++ b/src/tests/ftest/control/dmg_network_scan.yaml @@ -8,7 +8,6 @@ server_config: port: 10001 control_log_mask: TRACE engines_per_host: 1 - system_ram_reserved: 16 engines: 0: storage: diff --git a/src/tests/ftest/server/daos_server_config.yaml b/src/tests/ftest/server/daos_server_config.yaml index 93613527b33..0a659990306 100644 --- a/src/tests/ftest/server/daos_server_config.yaml +++ b/src/tests/ftest/server/daos_server_config.yaml @@ -6,7 +6,6 @@ timeout: 130 server_config: name: daos_server engines_per_host: 1 - system_ram_reserved: 16 engines: 0: storage: diff --git a/utils/config/daos_server.yml b/utils/config/daos_server.yml index 86544779619..73772a44ed4 100644 --- a/utils/config/daos_server.yml +++ b/utils/config/daos_server.yml @@ -238,7 +238,7 @@ ## minimum of 4gib. Increasing the value may help avoid the potential of OOM killer terminating ## engine processes but could also result in stopping DAOS from using available memory resources. # -## default: 8 +## default: 16 #system_ram_reserved: 5 # # From a25515c0313bcd6951d805e94bb9d96a10b85f45 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Thu, 27 Jul 2023 12:33:14 +0100 Subject: [PATCH 6/6] attempt to resolve snapshot aggregation test failure Skip-func-test-vm: true Test-tag: test_snapshot_aggregation Test-nvme: auto_md_on_ssd Required-githooks: true Signed-off-by: Tom Nabarro --- src/tests/ftest/container/snapshot_aggregation.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tests/ftest/container/snapshot_aggregation.yaml b/src/tests/ftest/container/snapshot_aggregation.yaml index 5b63868d6dc..8c4fa97c7fc 100644 --- a/src/tests/ftest/container/snapshot_aggregation.yaml +++ b/src/tests/ftest/container/snapshot_aggregation.yaml @@ -21,6 +21,7 @@ server_config: fabric_iface_port: 31417 log_file: daos_server1.log storage: auto + system_ram_reserved: 8 pool: control_method: dmg scm_size: 80G