diff --git a/tests_e2e/tests/lib/cgroup_helpers.py b/tests_e2e/tests/lib/cgroup_helpers.py
index da112bb82..f248741ca 100644
--- a/tests_e2e/tests/lib/cgroup_helpers.py
+++ b/tests_e2e/tests/lib/cgroup_helpers.py
@@ -144,12 +144,20 @@ def check_agent_quota_disabled():
     # Ubuntu 16 has an issue in expressing no quota as "infinity" https://github.com/systemd/systemd/issues/5965, so we are directly checking the quota value in cpu controller
     return cpu_quota == 'infinity' or get_unit_cgroup_cpu_quota_disabled(AGENT_SERVICE_NAME)
 
 
-def check_cgroup_disabled_with_unknown_process():
+def check_cgroup_disabled_due_to_systemd_error():
     """
-    Returns True if the cgroup is disabled with unknown process
-    """
-    return check_log_message("Disabling resource usage monitoring. Reason: Check on cgroups failed:.+UNKNOWN")
+    Returns True if the cgroup is disabled due to systemd error (Connection reset by peer)
+
+    Ex:
+    2024-12-18T06:43:23.867711Z INFO ExtHandler ExtHandler [CGW] Disabling resource usage monitoring. Reason: Failed to start Microsoft.Azure.Extensions.Edp.GATestExtGo-1.2.0.0 using systemd-run, will try invoking the extension directly. Error: [SystemdRunError] Systemd process exited with code 1 and output [stdout]
+    [stderr]
+    Warning! D-Bus connection terminated.
+    Failed to start transient scope unit: Connection reset by peer
+    """
+    # The brackets are escaped so that "[SystemdRunError]" is matched literally instead of being parsed as a regex character class.
+    # NOTE(review): the sample record above spans several lines; ".+" only crosses line breaks if check_log_message matches with re.DOTALL - confirm.
+    return check_log_message(r"Failed to start.+using systemd-run, will try invoking the extension directly.+\[SystemdRunError\].+Connection reset by peer")
 
 def check_log_message(message, after_timestamp=datetime.datetime.min):
     """
diff --git a/tests_e2e/tests/scripts/ext_cgroups-check_cgroups_extensions.py b/tests_e2e/tests/scripts/ext_cgroups-check_cgroups_extensions.py
index 2cd6d9492..84d8230e5 100755
--- a/tests_e2e/tests/scripts/ext_cgroups-check_cgroups_extensions.py
+++ b/tests_e2e/tests/scripts/ext_cgroups-check_cgroups_extensions.py
@@ -25,7 +25,7 @@
 from tests_e2e.tests.lib.cgroup_helpers import verify_if_distro_supports_cgroup, \
     verify_agent_cgroup_assigned_correctly, BASE_CGROUP, EXT_CONTROLLERS, get_unit_cgroup_mount_path, \
     GATESTEXT_SERVICE, AZUREMONITORAGENT_SERVICE, check_agent_quota_disabled, \
-    check_cgroup_disabled_with_unknown_process, CGROUP_TRACKED_PATTERN, AZUREMONITOREXT_FULL_NAME, GATESTEXT_FULL_NAME, \
+    check_cgroup_disabled_due_to_systemd_error, CGROUP_TRACKED_PATTERN, AZUREMONITOREXT_FULL_NAME, GATESTEXT_FULL_NAME, \
     print_cgroups
 from tests_e2e.tests.lib.logging import log
 from tests_e2e.tests.lib.retry import retry_if_false
@@ -213,8 +213,8 @@ def main():
 try:
     main()
 except Exception as e:
-    # It is possible that agent cgroup can be disabled due to UNKNOWN process or throttled before we run this check, in that case, we should ignore the validation
-    if check_cgroup_disabled_with_unknown_process() and retry_if_false(check_agent_quota_disabled):
-        log.info("Cgroup is disabled due to UNKNOWN process, ignoring ext cgroups validations")
+    # It is possible that agent cgroup can be disabled and reset the quotas if the extension failed to start using systemd-run. In that case, we should ignore the validation
+    if check_cgroup_disabled_due_to_systemd_error() and retry_if_false(check_agent_quota_disabled):
+        log.info("Cgroup is disabled due to systemd error while invoking the extension, ignoring ext cgroups validations")
     else:
         raise