Skip to content

Commit

Permalink
test: Fix restore_dir() unit restart race
Browse files Browse the repository at this point in the history
firewalld writes its configuration files on shutdown. Previously, the
test cleanup first restored the files and *then* restarted firewalld,
which sometimes leaked changes from the test into the persistent config.
That led to subsequent tests timing out to load the login page, as the
`cockpit` service was missing in firewalld.

This is a potential race condition for all service restarts, so fix it
in a general way: Introduce a new `restart_unit` argument to
`restore_dir()`, which first stops the unit, then restores the files,
and then starts the unit again if it was running before. Convert all
affected tests to the new API.

Cherry-picked from main commit ce07ea7
  • Loading branch information
martinpitt committed Sep 21, 2023
1 parent 1b0f5ef commit dd85c0f
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 17 deletions.
2 changes: 1 addition & 1 deletion test/common/netlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def cleanupDevs():
self.machine.execute(f"for d in {' '.join(new)}; do nmcli dev del $d; done")

self.orig_devs = devs()
self.restore_dir("/etc/NetworkManager", post_restore_action="systemctl try-restart NetworkManager")
self.restore_dir("/etc/NetworkManager", restart_unit="NetworkManager")
self.restore_dir("/etc/sysconfig/network-scripts")
self.addCleanup(cleanupDevs)

Expand Down
31 changes: 22 additions & 9 deletions test/common/testlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -2016,13 +2016,15 @@ def file_exists(self, path: str) -> bool:

return self.machine.execute(f"if test -e {path}; then echo yes; fi").strip() != ""

def restore_dir(self, path: str, post_restore_action: Optional[str] = None, reboot_safe: bool = False):
def restore_dir(self, path: str, post_restore_action: Optional[str] = None, reboot_safe: bool = False,
restart_unit: Optional[str] = None):
"""Backup/restore a directory for a nondestructive test
This takes care to not ever touch the original content on disk, but uses transient overlays.
As this uses a bind mount, it does not work for files that get changed atomically (with mv);
use restore_file() for these.
`restart_unit` will be stopped before restoring path, and restarted afterwards if it was running.
The optional post_restore_action will run after restoring the original content.
If the directory needs to survive reboot, `reboot_safe=True` needs to be specified; then this
Expand All @@ -2031,31 +2033,42 @@ def restore_dir(self, path: str, post_restore_action: Optional[str] = None, rebo
if not self.is_nondestructive() and not self.machine.ostree_image:
return # skip for efficiency reasons

exe = self.machine.execute

if not self.file_exists(path):
self.addCleanup(self.machine.execute, "rm -rf {0}".format(path))
self.addCleanup(exe, f"rm -rf '{path}'")
return

backup = os.path.join(self.vm_tmpdir, path.replace('/', '_'))
self.machine.execute("mkdir -p %(vm_tmpdir)s; cp -a %(path)s/ %(backup)s/" % {
"vm_tmpdir": self.vm_tmpdir, "path": path, "backup": backup})
exe(f"mkdir -p {self.vm_tmpdir}; cp -a {path}/ {backup}/")

if not reboot_safe:
self.machine.execute("mount -o bind %(backup)s %(path)s" % {
"path": path, "backup": backup})
exe(f"mount -o bind {backup} {path}")

if restart_unit:
restart_stamp = f"/run/cockpit_restart_{restart_unit}"
self.addCleanup(
exe,
f"if [ -e {restart_stamp} ]; then systemctl start {restart_unit}; rm {restart_stamp}; fi"
)

if post_restore_action:
self.addCleanup(self.machine.execute, post_restore_action)
self.addCleanup(exe, post_restore_action)

if reboot_safe:
self.addCleanup(self.machine.execute, f"rm -rf {path}; mv {backup} {path}")
self.addCleanup(exe, f"rm -rf {path}; mv {backup} {path}")
else:
# HACK: a lot of tests call this on /home/...; that restoration happens before killing all user
# processes in nonDestructiveSetup(), so we have to do it lazily
if path.startswith("/home"):
cmd = f"umount -lf {path}"
else:
cmd = f"umount {path} || {{ fuser -uvk {path} {path}/* >&2 || true; sleep 1; umount {path}; }}"
self.addCleanup(self.machine.execute, cmd)
self.addCleanup(exe, cmd)

if restart_unit:
self.addCleanup(exe, f"if systemctl --quiet is-active {restart_unit}; then touch {restart_stamp}; fi; "
f"systemctl stop {restart_unit}")

def restore_file(self, path: str, post_restore_action: Optional[str] = None):
"""Backup/restore a file for a nondestructive test
Expand Down
2 changes: 1 addition & 1 deletion test/verify/check-loopback
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class TestLoopback(testlib.MachineCase):
self.assertIn("no-cockpit", b.text('#login-fatal'))

m.disconnect()
self.restore_dir("/etc/ssh", post_restore_action="systemctl try-restart sshd.service")
self.restore_dir("/etc/ssh", restart_unit="sshd.service")
m.execute("sed -i 's/.*PasswordAuthentication.*/PasswordAuthentication no/' /etc/ssh/sshd_config $(ls /etc/ssh/sshd_config.d/* 2>/dev/null || true)")
m.execute("systemctl try-restart sshd.service")
m.wait_execute()
Expand Down
3 changes: 2 additions & 1 deletion test/verify/check-networkmanager-firewall
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ class TestFirewall(netlib.NetworkCase):
def setUp(self):
super().setUp()
m = self.machine
self.restore_dir("/etc/firewalld", "systemctl try-restart firewalld")
self.restore_dir("/etc/firewalld", restart_unit="firewalld")

m.execute("systemctl restart firewalld")
self.btn_danger = "pf-m-danger"
self.btn_primary = "pf-m-primary"
Expand Down
6 changes: 3 additions & 3 deletions test/verify/check-static-login
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ account required pam_succeed_if.so user ingroup %s""" % m.get_admin_group

# On OSTree this happens over ssh
if m.ostree_image:
self.restore_dir("/etc/ssh", '( ! systemctl is-active sshd.socket || systemctl stop sshd.socket) && systemctl restart sshd.service')
self.restore_dir("/etc/ssh", restart_unit="sshd")
m.execute("sed -i 's/.*ChallengeResponseAuthentication.*/ChallengeResponseAuthentication yes/' /etc/ssh/sshd_config /etc/ssh/sshd_config.d/*")
m.execute("( ! systemctl is-active sshd.socket || systemctl stop sshd.socket) && systemctl restart sshd.service")

Expand Down Expand Up @@ -361,7 +361,7 @@ account required pam_succeed_if.so user ingroup %s""" % m.get_admin_group
conf = "/etc/pam.d/cockpit"
if m.ostree_image:
conf = "/etc/pam.d/sshd"
self.restore_dir("/etc/ssh", '( ! systemctl is-active sshd.socket || systemctl stop sshd.socket) && systemctl restart sshd.service')
self.restore_dir("/etc/ssh", restart_unit="sshd")
m.execute("sed -i 's/.*ChallengeResponseAuthentication.*/ChallengeResponseAuthentication yes/' /etc/ssh/sshd_config /etc/ssh/sshd_config.d/*")
m.execute("( ! systemctl is-active sshd.socket || systemctl stop sshd.socket) && systemctl restart sshd.service")

Expand Down Expand Up @@ -762,7 +762,7 @@ account required pam_succeed_if.so user ingroup %s""" % m.get_admin_group
# also, sssd is split into several services
self.restore_dir("/etc/sssd", post_restore_action="systemctl stop 'sssd*'")
else:
self.restore_dir("/etc/sssd", post_restore_action="systemctl try-restart sssd")
self.restore_dir("/etc/sssd", restart_unit="sssd")

m.execute("useradd alice; echo alice:foobar123 | chpasswd")
m.upload(["alice.pem", "alice.key", "alice-expired.pem", "bob.pem", "bob.key"], self.vm_tmpdir,
Expand Down
4 changes: 2 additions & 2 deletions test/verify/check-users
Original file line number Diff line number Diff line change
Expand Up @@ -645,9 +645,9 @@ class TestAccounts(testlib.MachineCase):

# On OSTree this happens over ssh
if m.ostree_image:
self.restore_dir("/etc/ssh", '( ! systemctl is-active sshd.socket || systemctl stop sshd.socket) && systemctl restart sshd.service')
self.restore_dir("/etc/ssh", restart_unit="sshd")
m.execute("sed -i 's/.*ChallengeResponseAuthentication.*/ChallengeResponseAuthentication yes/' /etc/ssh/sshd_config /etc/ssh/sshd_config.d/*")
m.execute("( ! systemctl is-active sshd.socket || systemctl stop sshd.socket) && systemctl restart sshd.service")
m.execute("systemctl try-restart sshd.service")

b.wait_visible("#login")
b.wait_not_visible("#conversation-group")
Expand Down

0 comments on commit dd85c0f

Please sign in to comment.