From c5a571304ee83266796134819057dac44032b903 Mon Sep 17 00:00:00 2001 From: Dan Bungert Date: Mon, 25 Sep 2023 16:41:30 -0600 Subject: [PATCH] filesystem: revamp udev handling In LP: #2009141, we are hitting kernel limits and pyudev buffer limits. We don't care about specific events, so much as getting one event, waiting for things to calm down, then reprobing. Outright disable the event monitor, and re-enable later. If there is a storm of events, testing has shown that stopping the listener is not enough. --- subiquity/server/controllers/filesystem.py | 54 ++++++++++++++-------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/subiquity/server/controllers/filesystem.py b/subiquity/server/controllers/filesystem.py index eb40edede0..e6aeced7f7 100644 --- a/subiquity/server/controllers/filesystem.py +++ b/subiquity/server/controllers/filesystem.py @@ -20,7 +20,6 @@ import logging import os import pathlib -import select import time from typing import Any, Callable, Dict, List, Optional, Union @@ -273,6 +272,7 @@ def __init__(self, app): self._system_mounter: Optional[Mounter] = None self._role_to_device: Dict[str, _Device] = {} self._device_to_structure: Dict[_Device, snapdapi.OnVolume] = {} + self._pyudev_context: Optional[pyudev.Context] = None self.use_tpm: bool = False self.locked_probe_data: bool = False # If probe data come in while we are doing partitioning, store it in @@ -299,7 +299,7 @@ async def configured(self): ): self.app.base_model.source.search_drivers = not self.is_core_boot_classic() await super().configured() - self.stop_listening_udev() + self.stop_monitor() async def _mount_systems_dir(self, variation_name): self._source_handler = self.app.controllers.Source.get_handler(variation_name) @@ -1294,6 +1294,13 @@ async def _probe(self, *, context=None): elapsed = time.time() - start log.debug(f"{short_label} probing took {elapsed:.1f} seconds") break + # The start_monitor call is down here, purposefully after the probe is + # complete and not intended to be run after a probing timeout. The + # reason for the monitor is to decide when to re-probe, so there is no + # need to monitor for events before the first probe has even happened, + # and in the current implementation we have no plan forward if a probe + # timeout has happened, so doing it again is unlikely to help. + self.start_monitor() async def run_autoinstall_guided(self, layout): name = layout["name"] @@ -1456,21 +1463,31 @@ def start(self): self._start_task = schedule_task(self._start()) async def _start(self): - context = pyudev.Context() - self._monitor = pyudev.Monitor.from_netlink(context) - self._monitor.filter_by(subsystem="block") - self._monitor.enable_receiving() - self.start_listening_udev() await self._probe_task.start() - def start_listening_udev(self): + def start_monitor(self): + if self._configured: + return + + log.debug("start_monitor") + if self._pyudev_context is None: + self._pyudev_context = pyudev.Context() + self._monitor = pyudev.Monitor.from_netlink(self._pyudev_context) + self._monitor.filter_by(subsystem="block") + self._monitor.start() loop = asyncio.get_running_loop() loop.add_reader(self._monitor.fileno(), self._udev_event) - def stop_listening_udev(self): + def stop_monitor(self): + if self._monitor is None: + return + + log.debug("stop_monitor") loop = asyncio.get_running_loop() loop.remove_reader(self._monitor.fileno()) + self._monitor = None + def ensure_probing(self): try: self._probe_task.start_sync() @@ -1480,21 +1497,20 @@ def ensure_probing(self): log.debug("Triggered Probert run on udev event") def _udev_event(self): + # We outright stop monitoring because we're not super concerned about + # the specifics of the udev event, only that one happened and that when + # the events settle, we want to reprobe. This is significantly faster + # than keeping a monitor around and draining the event queue. + # LP: #2009141 + self.stop_monitor() + cp = run_command(["udevadm", "settle", "-t", "0"]) + if cp.returncode != 0: log.debug("waiting 0.1 to let udev event queue settle") - self.stop_listening_udev() loop = asyncio.get_running_loop() - loop.call_later(0.1, self.start_listening_udev) + loop.call_later(0.1, self._udev_event) return - # Drain the udev events in the queue -- if we stopped listening to - # allow udev to settle, it's good bet there is more than one event to - # process and we don't want to kick off a full block probe for each - # one. It's a touch unfortunate that pyudev doesn't have a - # non-blocking read so we resort to select(). - while select.select([self._monitor.fileno()], [], [], 0)[0]: - action, dev = self._monitor.receive_device() - log.debug("_udev_event %s %s", action, dev) self.ensure_probing() def make_autoinstall(self):