diff --git a/Cargo.lock b/Cargo.lock index d7e7ac2..e633e99 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -116,6 +116,12 @@ dependencies = [ "backtrace", ] +[[package]] +name = "assert_matches" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" + [[package]] name = "async-stream" version = "0.3.5" @@ -144,6 +150,35 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "aya" +version = "0.12.0" +source = "git+https://github.com/nbdd0121/aya.git#764b6d58b2570f43b908cb02c06c7333b3567009" +dependencies = [ + "assert_matches", + "aya-obj", + "bitflags 2.4.2", + "bytes", + "lazy_static", + "libc", + "log", + "object", + "thiserror", +] + +[[package]] +name = "aya-obj" +version = "0.1.0" +source = "git+https://github.com/nbdd0121/aya.git#764b6d58b2570f43b908cb02c06c7333b3567009" +dependencies = [ + "bytes", + "core-error", + "hashbrown 0.14.3", + "log", + "object", + "thiserror", +] + [[package]] name = "backtrace" version = "0.3.69" @@ -323,6 +358,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-stream", + "aya", "bitflags 2.4.2", "bollard", "bytes", @@ -342,6 +378,15 @@ dependencies = [ "walkdir", ] +[[package]] +name = "core-error" +version = "0.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efcdb2972eb64230b4c50646d8498ff73f5128d196a90c7236eec4cbe8619b8f" +dependencies = [ + "version_check", +] + [[package]] name = "core-foundation-sys" version = "0.8.6" @@ -737,6 +782,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + [[package]] name = "libc" version = "0.2.153" diff --git a/Cargo.toml 
b/Cargo.toml index 455b114..e96b842 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,7 @@ bollard = "0.16" futures = "0.3" rustix = { version = "0.38", features = ["fs", "stdio", "termios"] } bitflags = "2" +aya = { git = "https://github.com/nbdd0121/aya.git" } [build-dependencies] anyhow = { version = "1", features = ["backtrace"] } diff --git a/README.md b/README.md index 661b085..03baa33 100644 --- a/README.md +++ b/README.md @@ -25,10 +25,7 @@ Another concern is providing a container with well known paths for the devices. On bare-metal systems this would usually be achieved with a `SYMLINK` directive in a udev rule. This program tries to provide a similar functionality for containers, allowing you to specify symlinks for certain devices. -## Limitations - -`container-hotplug` needs to be run as root and relies on `cgroup v1`. It does not support `cgroup v2`. -On distributions with `cgroup v2`, you can switch back to `cgroup v1` by setting the [kernel parameter](https://wiki.ubuntu.com/Kernel/KernelBootParameters) `systemd.unified_cgroup_hierarchy=0`. +This tool supports both cgroup v1 and v2. ## Example diff --git a/src/docker/cgroup.rs b/src/docker/cgroup.rs index 026d6ae..bf8580a 100644 --- a/src/docker/cgroup.rs +++ b/src/docker/cgroup.rs @@ -1,4 +1,8 @@ -use anyhow::{ensure, Result}; +use anyhow::{ensure, Context, Result}; +use aya::maps::{HashMap, MapData}; +use aya::programs::{CgroupDevice, Link}; +use std::fs::File; +use std::mem::ManuallyDrop; use std::path::PathBuf; // The numerical representation below needs to match BPF_DEVCG constants. @@ -26,6 +30,10 @@ pub trait DeviceAccessController { minor: u32, access: Access, ) -> Result<()>; + + /// Stop performing access control. This may allow all accesses, so should only be used when + /// the cgroup is shutdown. 
+    fn stop(self: Box<Self>) -> Result<()>;
 }
 
 pub struct DeviceAccessControllerV1 {
@@ -96,4 +104,100 @@ impl DeviceAccessController for DeviceAccessControllerV1 {
 
         Ok(())
     }
+
+    fn stop(self: Box<Self>) -> Result<()> {
+        Ok(())
+    }
+}
+
+/// Key of the `DEVICE_PERM` BPF map: one device identified by type, major and minor number.
+#[repr(C)] // Fixes field order and layout; the BPF program reads this as POD.
+#[allow(unused)] // The fields are only written on the Rust side, never read back.
+#[derive(Clone, Copy)]
+struct Device {
+    device_type: u32,
+    major: u32,
+    minor: u32,
+}
+
+// SAFETY: Device is `repr(C)` and consists of three `u32` fields, so it has no padding.
+unsafe impl aya::Pod for Device {}
+
+/// Device access controller for cgroup v2, backed by a BPF device filter program.
+pub struct DeviceAccessControllerV2 {
+    map: HashMap<MapData, Device, u32>,
+    pin: PathBuf,
+}
+
+impl DeviceAccessControllerV2 {
+    /// Attaches our own device filter to the cgroup of docker container `id` and detaches docker's.
+    pub fn new(id: &str) -> Result<Self> {
+        // We want to take control of the device cgroup filtering from docker. To do this, we attach our own
+        // filter program and detach the one by docker.
+        let cgroup_path = format!("/sys/fs/cgroup/system.slice/docker-{id}.scope");
+        let cgroup = File::open(&cgroup_path)
+            .with_context(|| format!("cannot open cgroup directory {cgroup_path}"))?;
+
+        let mut bpf = aya::Bpf::load(include_bytes!(concat!(
+            env!("CARGO_MANIFEST_DIR"),
+            "/cgroup_device_filter/target/bpfel-unknown-none/release/cgroup_device_filter"
+        )))?;
+
+        let program: &mut CgroupDevice = bpf
+            .program_mut("check_device")
+            .context("cannot find check_device program")?
+            .try_into()?;
+
+        program.load()?;
+
+        // Iterate existing programs. We'll need to detach them later.
+        // Wrap this inside `ManuallyDrop` to prevent accidental detaching.
+        let existing_programs = ManuallyDrop::new(CgroupDevice::query(&cgroup)?);
+
+        program.attach(&cgroup)?;
+
+        // Pin the program so that if container-hotplug accidentally exits, the filter won't be removed from the docker
+        // container.
+        let pin: PathBuf = format!("/sys/fs/bpf/docker-{id}-device-filter").into();
+        program.pin(&pin)?;
+
+        // Now our new filter is attached, detach all docker filters.
+        for existing_program in ManuallyDrop::into_inner(existing_programs) {
+            existing_program.detach()?;
+        }
+
+        let map: HashMap<_, Device, u32> = bpf
+            .take_map("DEVICE_PERM")
+            .context("cannot find DEVICE_PERM map")?
+            .try_into()?;
+
+        Ok(Self { map, pin })
+    }
+}
+
+impl DeviceAccessController for DeviceAccessControllerV2 {
+    fn set_permission(
+        &mut self,
+        ty: DeviceType,
+        major: u32,
+        minor: u32,
+        access: Access,
+    ) -> Result<()> {
+        let device = Device {
+            device_type: ty as u32,
+            major,
+            minor,
+        };
+        if access.is_empty() {
+            self.map.remove(&device)?;
+        } else {
+            self.map.insert(device, access.bits(), 0)?;
+        }
+        Ok(())
+    }
+
+    fn stop(self: Box<Self>) -> Result<()> {
+        CgroupDevice::from_pin(&self.pin)?.unpin()?;
+        Ok(())
+    }
 }
diff --git a/src/docker/container.rs b/src/docker/container.rs
index e7a9525..7821c95 100644
--- a/src/docker/container.rs
+++ b/src/docker/container.rs
@@ -10,7 +10,7 @@ use tokio::signal::unix::{signal, SignalKind};
 use tokio::task::{spawn, JoinHandle};
 use tokio_stream::StreamExt;
 
-use super::cgroup::{Access, DeviceAccessController, DeviceAccessControllerV1, DeviceType};
+use super::cgroup::{Access, DeviceAccessController, DeviceAccessControllerV1, DeviceAccessControllerV2, DeviceType};
 use super::{IoStream, IoStreamSource};
 
 #[derive(Clone)]
@@ -18,7 +18,7 @@ pub struct Container {
     id: String,
     docker: bollard::Docker,
     remove_event: Shared>>,
-    cgroup_device_filter: Arc>>,
+    cgroup_device_filter: Arc<Mutex<Option<Box<dyn DeviceAccessController + Send>>>>,
 }
 
 impl Container {
@@ -40,13 +40,19 @@ impl Container {
             .shared();
 
         let cgroup_device_filter: Box =
-            Box::new(DeviceAccessControllerV1::new(id)?);
+            match DeviceAccessControllerV2::new(id) {
+                Ok(v) => Box::new(v),
+                Err(err) => match DeviceAccessControllerV1::new(id) {
+                    Ok(v) => Box::new(v),
+                    Err(_) => Err(err).context("neither cgroup v1 nor cgroup v2 works")?,
+                },
+            };
 
         Ok(Self {
             id: id.to_owned(),
             docker: docker.clone(),
             remove_event: remove_evevnt,
-            cgroup_device_filter: Arc::new(Mutex::new(cgroup_device_filter)),
+            cgroup_device_filter: Arc::new(Mutex::new(Some(cgroup_device_filter))),
         })
     }
 
@@ -83,6 +89,14 @@ impl Container {
             .context("no destroy event")?;
         }
 
+        // Stop the cgroup device filter.
Only do so once we're sure that the container is removed.
+        self.cgroup_device_filter
+            .lock()
+            .unwrap()
+            .take()
+            .context("cgroup device filter already stopped")?
+            .stop()?;
+
         Ok(())
     }
 
@@ -229,7 +243,7 @@ impl Container {
         let controller = self.cgroup_device_filter.clone();
         tokio::task::spawn_blocking(move || -> Result<()> {
             let mut controller = controller.lock().unwrap();
-            controller.set_permission(
+            controller.as_mut().context("cgroup device filter already stopped")?.set_permission(
                 DeviceType::Character,
                 major,
                 minor,
diff --git a/src/main.rs b/src/main.rs
index b0418d8..0bf6372 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -7,11 +7,11 @@ use cli::{Action, Device, Symlink};
 use docker::{Container, Docker};
 use hotplug::{Event as HotPlugEvent, HotPlug, PluggedDevice};
 
+use std::fmt::Display;
 use std::pin::pin;
-use std::{fmt::Display, path::Path};
 use tokio_stream::StreamExt;
 
-use anyhow::{bail, Context, Result};
+use anyhow::{Context, Result};
 use clap::Parser;
 use clap_verbosity_flag::{InfoLevel, LogLevel, Verbosity};
 use log::info;
@@ -98,10 +98,6 @@ fn run_hotplug(
 async fn run(param: cli::Run, verbosity: Verbosity) -> Result {
     let mut status = 0;
 
-    if !Path::new("/sys/fs/cgroup/devices/").is_dir() {
-        bail!("Could not find cgroup v1");
-    }
-
     let docker = Docker::connect_with_defaults()?;
     let container = docker.run(param.docker_args).await?;
     drop(container.pipe_signals());