From dd85c0ffa0cfa90536c96104c5e6b35919770a9e Mon Sep 17 00:00:00 2001 From: Martin Pitt Date: Wed, 20 Sep 2023 09:39:59 +0200 Subject: [PATCH] test: Fix restore_dir() unit restart race firewalld writes its configuration files on shutdown. Previously, the test cleanup first restored the files and *then* restarted firewalld, which sometimes leaked changes from the test into the persistent config. That led to subsequent tests timing out to load the login page, as the `cockpit` service was missing in firewalld. This is a potential race condition for all service restarts, so fix it in a general way: Introduce a new `restart_unit` argument to `restore_dir()`, which first stops the unit, then restores the files, and then starts the unit again if it was running before. Convert all affected tests to the new API. Cherry-picked from main commit ce07ea7372c --- test/common/netlib.py | 2 +- test/common/testlib.py | 31 ++++++++++++++++------- test/verify/check-loopback | 2 +- test/verify/check-networkmanager-firewall | 3 ++- test/verify/check-static-login | 6 ++--- test/verify/check-users | 4 +-- 6 files changed, 31 insertions(+), 17 deletions(-) diff --git a/test/common/netlib.py b/test/common/netlib.py index f5caaf6bb1d..12b02fa63c7 100644 --- a/test/common/netlib.py +++ b/test/common/netlib.py @@ -84,7 +84,7 @@ def cleanupDevs(): self.machine.execute(f"for d in {' '.join(new)}; do nmcli dev del $d; done") self.orig_devs = devs() - self.restore_dir("/etc/NetworkManager", post_restore_action="systemctl try-restart NetworkManager") + self.restore_dir("/etc/NetworkManager", restart_unit="NetworkManager") self.restore_dir("/etc/sysconfig/network-scripts") self.addCleanup(cleanupDevs) diff --git a/test/common/testlib.py b/test/common/testlib.py index f8a08a00a71..7a31afd9f71 100644 --- a/test/common/testlib.py +++ b/test/common/testlib.py @@ -2016,13 +2016,15 @@ def file_exists(self, path: str) -> bool: return self.machine.execute(f"if test -e {path}; then echo yes; fi").strip() != "" - def restore_dir(self, path: str, post_restore_action: Optional[str] = None, reboot_safe: bool = False): + def restore_dir(self, path: str, post_restore_action: Optional[str] = None, reboot_safe: bool = False, + restart_unit: Optional[str] = None): """Backup/restore a directory for a nondestructive test This takes care to not ever touch the original content on disk, but uses transient overlays. As this uses a bind mount, it does not work for files that get changed atomically (with mv); use restore_file() for these. + `restart_unit` will be stopped before restoring path, and restarted afterwards if it was running. The optional post_restore_action will run after restoring the original content. If the directory needs to survive reboot, `reboot_safe=True` needs to be specified; then this @@ -2031,23 +2033,30 @@ def restore_dir(self, path: str, post_restore_action: Optional[str] = None, rebo if not self.is_nondestructive() and not self.machine.ostree_image: return # skip for efficiency reasons + exe = self.machine.execute + if not self.file_exists(path): - self.addCleanup(self.machine.execute, "rm -rf {0}".format(path)) + self.addCleanup(exe, f"rm -rf '{path}'") return backup = os.path.join(self.vm_tmpdir, path.replace('/', '_')) - self.machine.execute("mkdir -p %(vm_tmpdir)s; cp -a %(path)s/ %(backup)s/" % { - "vm_tmpdir": self.vm_tmpdir, "path": path, "backup": backup}) + exe(f"mkdir -p {self.vm_tmpdir}; cp -a {path}/ {backup}/") if not reboot_safe: - self.machine.execute("mount -o bind %(backup)s %(path)s" % { - "path": path, "backup": backup}) + exe(f"mount -o bind {backup} {path}") + + if restart_unit: + restart_stamp = f"/run/cockpit_restart_{restart_unit}" + self.addCleanup( + exe, + f"if [ -e {restart_stamp} ]; then systemctl start {restart_unit}; rm {restart_stamp}; fi" + ) if post_restore_action: - self.addCleanup(self.machine.execute, post_restore_action) + self.addCleanup(exe, post_restore_action) if reboot_safe: - self.addCleanup(self.machine.execute, f"rm -rf {path}; mv {backup} {path}") + self.addCleanup(exe, f"rm -rf {path}; mv {backup} {path}") else: # HACK: a lot of tests call this on /home/...; that restoration happens before killing all user # processes in nonDestructiveSetup(), so we have to do it lazily @@ -2055,7 +2064,11 @@ def restore_dir(self, path: str, post_restore_action: Optional[str] = None, rebo cmd = f"umount -lf {path}" else: cmd = f"umount {path} || {{ fuser -uvk {path} {path}/* >&2 || true; sleep 1; umount {path}; }}" - self.addCleanup(self.machine.execute, cmd) + self.addCleanup(exe, cmd) + + if restart_unit: + self.addCleanup(exe, f"if systemctl --quiet is-active {restart_unit}; then touch {restart_stamp}; fi; " + f"systemctl stop {restart_unit}") def restore_file(self, path: str, post_restore_action: Optional[str] = None): """Backup/restore a file for a nondestructive test diff --git a/test/verify/check-loopback b/test/verify/check-loopback index bb8e98e524a..14219145111 100755 --- a/test/verify/check-loopback +++ b/test/verify/check-loopback @@ -49,7 +49,7 @@ class TestLoopback(testlib.MachineCase): self.assertIn("no-cockpit", b.text('#login-fatal')) m.disconnect() - self.restore_dir("/etc/ssh", post_restore_action="systemctl try-restart sshd.service") + self.restore_dir("/etc/ssh", restart_unit="sshd.service") m.execute("sed -i 's/.*PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config $(ls /etc/ssh/sshd_config.d/* 2>/dev/null || true)") m.execute("systemctl try-restart sshd.service") m.wait_execute() diff --git a/test/verify/check-networkmanager-firewall b/test/verify/check-networkmanager-firewall index cce70a55ab5..8630916a905 100755 --- a/test/verify/check-networkmanager-firewall +++ b/test/verify/check-networkmanager-firewall @@ -53,7 +53,8 @@ class TestFirewall(netlib.NetworkCase): def setUp(self): super().setUp() m = self.machine - self.restore_dir("/etc/firewalld", "systemctl try-restart firewalld") + self.restore_dir("/etc/firewalld", restart_unit="firewalld") + m.execute("systemctl restart firewalld") self.btn_danger = "pf-m-danger" self.btn_primary = "pf-m-primary" diff --git a/test/verify/check-static-login b/test/verify/check-static-login index ff370b49438..90abf86e412 100755 --- a/test/verify/check-static-login +++ b/test/verify/check-static-login @@ -262,7 +262,7 @@ account required pam_succeed_if.so user ingroup %s""" % m.get_admin_group # On OSTree this happens over ssh if m.ostree_image: - self.restore_dir("/etc/ssh", '( ! systemctl is-active sshd.socket || systemctl stop sshd.socket) && systemctl restart sshd.service') + self.restore_dir("/etc/ssh", restart_unit="sshd") m.execute("sed -i 's/.*ChallengeResponseAuthentication.*/ChallengeResponseAuthentication yes/' /etc/ssh/sshd_config /etc/ssh/sshd_config.d/*") m.execute("( ! systemctl is-active sshd.socket || systemctl stop sshd.socket) && systemctl restart sshd.service") @@ -361,7 +361,7 @@ account required pam_succeed_if.so user ingroup %s""" % m.get_admin_group conf = "/etc/pam.d/cockpit" if m.ostree_image: conf = "/etc/pam.d/sshd" - self.restore_dir("/etc/ssh", '( ! systemctl is-active sshd.socket || systemctl stop sshd.socket) && systemctl restart sshd.service') + self.restore_dir("/etc/ssh", restart_unit="sshd") m.execute("sed -i 's/.*ChallengeResponseAuthentication.*/ChallengeResponseAuthentication yes/' /etc/ssh/sshd_config /etc/ssh/sshd_config.d/*") m.execute("( ! systemctl is-active sshd.socket || systemctl stop sshd.socket) && systemctl restart sshd.service") @@ -762,7 +762,7 @@ account required pam_succeed_if.so user ingroup %s""" % m.get_admin_group # also, sssd is split into several services self.restore_dir("/etc/sssd", post_restore_action="systemctl stop 'sssd*'") else: - self.restore_dir("/etc/sssd", post_restore_action="systemctl try-restart sssd") + self.restore_dir("/etc/sssd", restart_unit="sssd") m.execute("useradd alice; echo alice:foobar123 | chpasswd") m.upload(["alice.pem", "alice.key", "alice-expired.pem", "bob.pem", "bob.key"], self.vm_tmpdir, diff --git a/test/verify/check-users b/test/verify/check-users index bc72df742ac..d3d243e9502 100755 --- a/test/verify/check-users +++ b/test/verify/check-users @@ -645,9 +645,9 @@ class TestAccounts(testlib.MachineCase): # On OSTree this happens over ssh if m.ostree_image: - self.restore_dir("/etc/ssh", '( ! systemctl is-active sshd.socket || systemctl stop sshd.socket) && systemctl restart sshd.service') + self.restore_dir("/etc/ssh", restart_unit="sshd") m.execute("sed -i 's/.*ChallengeResponseAuthentication.*/ChallengeResponseAuthentication yes/' /etc/ssh/sshd_config /etc/ssh/sshd_config.d/*") - m.execute("( ! systemctl is-active sshd.socket || systemctl stop sshd.socket) && systemctl restart sshd.service") + m.execute("systemctl try-restart sshd.service") b.wait_visible("#login") b.wait_not_visible("#conversation-group")