Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

suppress systemd-run error(Connection reset by peer) while validating ext cgroups #3278

Merged
merged 2 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions tests_e2e/tests/lib/cgroup_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,12 +144,18 @@ def check_agent_quota_disabled():
# Ubuntu 16 has an issue in expressing no quota as "infinity" https://github.com/systemd/systemd/issues/5965, so we are directly checking the quota value in cpu controller
return cpu_quota == 'infinity' or get_unit_cgroup_cpu_quota_disabled(AGENT_SERVICE_NAME)

def check_cgroup_disabled_with_unknown_process():
def check_cgroup_disabled_due_to_systemd_error():
"""
Returns True if the cgroup is disabled with unknown process
"""
return check_log_message("Disabling resource usage monitoring. Reason: Check on cgroups failed:.+UNKNOWN")
Returns True if the cgroup is disabled due to a systemd error (Connection reset by peer) while starting an extension with systemd-run

Ex:
2024-12-18T06:43:23.867711Z INFO ExtHandler ExtHandler [CGW] Disabling resource usage monitoring. Reason: Failed to start Microsoft.Azure.Extensions.Edp.GATestExtGo-1.2.0.0 using systemd-run, will try invoking the extension directly. Error: [SystemdRunError] Systemd process exited with code 1 and output [stdout]

[stderr]
Warning! D-Bus connection terminated.
Failed to start transient scope unit: Connection reset by peer
"""
return check_log_message("Failed to start.+using systemd-run, will try invoking the extension directly.+[SystemdRunError].+Connection reset by peer")

def check_log_message(message, after_timestamp=datetime.datetime.min):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from tests_e2e.tests.lib.cgroup_helpers import verify_if_distro_supports_cgroup, \
verify_agent_cgroup_assigned_correctly, BASE_CGROUP, EXT_CONTROLLERS, get_unit_cgroup_mount_path, \
GATESTEXT_SERVICE, AZUREMONITORAGENT_SERVICE, check_agent_quota_disabled, \
check_cgroup_disabled_with_unknown_process, CGROUP_TRACKED_PATTERN, AZUREMONITOREXT_FULL_NAME, GATESTEXT_FULL_NAME, \
check_cgroup_disabled_due_to_systemd_error, CGROUP_TRACKED_PATTERN, AZUREMONITOREXT_FULL_NAME, GATESTEXT_FULL_NAME, \
print_cgroups
from tests_e2e.tests.lib.logging import log
from tests_e2e.tests.lib.retry import retry_if_false
Expand Down Expand Up @@ -213,8 +213,8 @@ def main():
try:
main()
except Exception as e:
# It is possible that agent cgroup can be disabled due to UNKNOWN process or throttled before we run this check, in that case, we should ignore the validation
if check_cgroup_disabled_with_unknown_process() and retry_if_false(check_agent_quota_disabled):
log.info("Cgroup is disabled due to UNKNOWN process, ignoring ext cgroups validations")
# The agent may disable cgroups and reset the quotas if the extension fails to start using systemd-run. In that case, we should ignore the validation
if check_cgroup_disabled_due_to_systemd_error() and retry_if_false(check_agent_quota_disabled):
log.info("Cgroup is disabled due to systemd error while invoking the extension, ignoring ext cgroups validations")
else:
raise
Loading