diff --git a/Cargo.toml b/Cargo.toml index 1bab342..129e141 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ tokio = { version = "1", features = ["full"] } tokio-stream = "0.1" async-stream = "0.3" udev = "0.8" -rustix = { version = "0.38", features = ["fs", "stdio", "process", "thread", "pipe"] } +rustix = { version = "0.38", features = ["fs", "stdio", "process", "thread", "pipe", "mount"] } bitflags = "2" once_cell = "1" humantime = "2" diff --git a/shell.nix b/shell.nix index da722b3..b372412 100644 --- a/shell.nix +++ b/shell.nix @@ -8,5 +8,8 @@ pkgs.mkShell { # For llvm-objdump llvmPackages.bintools + + # To aid testing + runc ]; } diff --git a/src/cgroup.rs b/src/cgroup.rs index de219f8..8c940e9 100644 --- a/src/cgroup.rs +++ b/src/cgroup.rs @@ -7,8 +7,8 @@ use std::mem::ManuallyDrop; use std::path::{Path, PathBuf}; // The numerical representation below needs to match BPF_DEVCG constants. -#[allow(unused)] #[repr(u32)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum DeviceType { Block = 1, Character = 2, diff --git a/src/dev/device.rs b/src/dev/device.rs index 256d533..d8350fa 100644 --- a/src/dev/device.rs +++ b/src/dev/device.rs @@ -1,9 +1,12 @@ use std::fmt::{self, Display, Formatter}; use std::path::{Path, PathBuf}; +use crate::cgroup::DeviceType; + #[derive(Debug, Clone)] pub struct DevNode { pub path: PathBuf, + pub ty: DeviceType, pub devnum: (u32, u32), } @@ -16,17 +19,22 @@ pub struct Device { impl Device { pub fn from_udev(device: udev::Device) -> Self { - let devnode = device - .devnum() - .zip(device.devnode()) - .map(|(devnum, devnode)| { - let major = rustix::fs::major(devnum); - let minor = rustix::fs::minor(devnum); - DevNode { - path: devnode.to_owned(), - devnum: (major, minor), - } - }); + let devnode = device.devnode().and_then(|devnode| { + let devnum = device.devnum()?; + let major = rustix::fs::major(devnum); + let minor = rustix::fs::minor(devnum); + // Only the block subsystem produces block devices; everything else is a 
character device. + let ty = if device.subsystem()? == "block" { + DeviceType::Block + } else { + DeviceType::Character + }; + Some(DevNode { + path: devnode.to_owned(), + ty, + devnum: (major, minor), + }) + }); Self { device, devnode } } diff --git a/src/hotplug/mod.rs b/src/hotplug/mod.rs index 3c03ac3..048f94e 100644 --- a/src/hotplug/mod.rs +++ b/src/hotplug/mod.rs @@ -70,8 +70,12 @@ impl HotPlug { .filter_map(|dev| dev.matches(&device)) .collect(); - self.container.device(devnode.devnum, Access::all()).await?; - self.container.mknod(&devnode.path, devnode.devnum).await?; + self.container + .device(devnode.ty, devnode.devnum, Access::all()) + .await?; + self.container + .mknod(&devnode.path, devnode.ty, devnode.devnum) + .await?; for symlink in &symlinks { self.container.symlink(&devnode.path, symlink).await?; } @@ -89,7 +93,7 @@ impl HotPlug { let devnode = device.devnode().unwrap(); self.container - .device(devnode.devnum, Access::empty()) + .device(devnode.ty, devnode.devnum, Access::empty()) .await?; self.container.rm(&devnode.path).await?; for symlink in &device.symlinks { diff --git a/src/runc/container.rs b/src/runc/container.rs index 697a8b6..4a0e486 100644 --- a/src/runc/container.rs +++ b/src/runc/container.rs @@ -1,9 +1,12 @@ -use std::fs::File; +use std::fs::{File, Permissions}; use std::io::{BufRead, BufReader, Seek}; +use std::os::fd::AsFd; +use std::os::unix::fs::{FileTypeExt, MetadataExt, PermissionsExt}; use std::path::Path; -use anyhow::{bail, ensure, Context, Result}; -use rustix::fs::{FileType, Gid, Mode, Uid}; +use anyhow::{bail, Context, Result}; +use rustix::fs::{FileType, Mode, UnmountFlags}; +use rustix::mount::{FsMountFlags, FsOpenFlags, MountAttrFlags, MoveMountFlags}; use rustix::process::{Pid, Signal}; use tokio::io::unix::AsyncFd; use tokio::io::Interest; @@ -63,8 +66,10 @@ impl CgroupEventNotifier { } pub struct Container { - uid: Uid, - gid: Gid, + // Uid and gid of the primary container user. 
+ // Note that these are IDs as seen inside the container's user namespace (if any). + uid: u32, + gid: u32, pid: Pid, wait: tokio::sync::watch::Receiver, cgroup_device_filter: Mutex>, @@ -87,15 +92,113 @@ impl Container { Box::new(DeviceAccessControllerV2::new(&state.cgroup_paths.unified)?) }; - ensure!(config.process.user.uid != u32::MAX && config.process.user.gid != u32::MAX); - - Ok(Self { - uid: unsafe { Uid::from_raw(config.process.user.uid) }, - gid: unsafe { Gid::from_raw(config.process.user.gid) }, + let container = Self { + uid: config.process.user.uid, + gid: config.process.user.gid, pid: Pid::from_raw(state.init_process_pid.try_into()?).context("Invalid PID")?, wait: recv, cgroup_device_filter: Mutex::new(cgroup_device_filter), - }) + }; + + container.remount_dev()?; + + Ok(container) + } + + /// Replace the container's /dev with a fresh tmpfs created on the host side (before entering the container). + /// + /// When a user namespace is used, the /dev created by runc is mounted inside that user namespace + /// and automatically gains the SB_I_NODEV flag as a kernel security measure. + /// + /// That flag works against us: device nodes on such a mount cannot be opened. + fn remount_dev(&self) -> Result<()> { + let ns = crate::util::namespace::MntNamespace::of_pid(self.pid)?; + if !ns.in_user_ns() { + return Ok(()); + } + + log::info!("Remount /dev to allow device node access"); + + // Create a tmpfs and mount it from the init namespace. + // Note that while we have "mounted" it, it is not attached to any mount point yet. + // The actual attaching happens after we have moved into the container's mount namespace. + let dev_fs = rustix::mount::fsopen("tmpfs", FsOpenFlags::empty())?; + rustix::mount::fsconfig_create(dev_fs.as_fd())?; + let dev_mnt = rustix::mount::fsmount( + dev_fs.as_fd(), + FsMountFlags::FSMOUNT_CLOEXEC, + MountAttrFlags::empty(), + )?; + + ns.enter(|| -> Result<_> { + // Clear the umask so it does not interfere with the modes we set below. + rustix::process::umask(Mode::empty()); + + // Move the existing mount elsewhere. 
+ std::fs::create_dir("/olddev")?; + rustix::mount::mount_move("/dev", "/olddev")?; + + // Attach our newly created tmpfs mount at `/dev`. + rustix::mount::move_mount( + dev_mnt.as_fd(), + "", + rustix::fs::CWD, + "/dev", + MoveMountFlags::MOVE_MOUNT_F_EMPTY_PATH, + )?; + + // Make sure the new /dev is owned by the container's root user rather than the host's root. + std::os::unix::fs::chown("/dev", Some(ns.uid(0)?), Some(ns.gid(0)?))?; + std::fs::set_permissions("/dev", Permissions::from_mode(0o755))?; + + for file in std::fs::read_dir("/olddev")? { + let file = file?; + let metadata = file.metadata()?; + let new_path = Path::new("/dev").join(file.file_name()); + + if file.file_name() == "console" { + // `console` is special: it is a file, not a directory, but it is a mount point, so create a placeholder file and move the mount onto it. + drop( + std::fs::OpenOptions::new() + .create(true) + .write(true) + .open(&new_path)?, + ); + rustix::mount::mount_move(file.path(), new_path)?; + } else if metadata.file_type().is_dir() { + // Directories are assumed to be mount points (e.g. pts, mqueue, shm): recreate the directory and move the mount. + std::fs::create_dir(&new_path)?; + rustix::mount::mount_move(file.path(), new_path)?; + } else if metadata.file_type().is_symlink() { + // Recreate the symlink with the same target. + let target = std::fs::read_link(file.path())?; + std::os::unix::fs::symlink(target, new_path)?; + } else if metadata.file_type().is_char_device() { + // Recreate the character device node with the same mode and device number. + let dev = metadata.rdev(); + rustix::fs::mknodat( + rustix::fs::CWD, + &new_path, + FileType::CharacterDevice, + Mode::from_raw_mode(metadata.mode()), + dev, + )?; + + // The old node might be a bind mount; try to unmount it, ignoring any failure. + let _ = rustix::mount::unmount(file.path(), UnmountFlags::DETACH); + } else { + anyhow::bail!("Unknown file present in /dev"); + } + } + + // Now that everything has been moved to the new /dev, remove the old one. 
+ rustix::mount::unmount("/olddev", UnmountFlags::DETACH)?; + std::fs::remove_dir("/olddev")?; + + Ok(()) + })??; + + Ok(()) } pub async fn kill(&self, signal: Signal) -> Result<()> { @@ -112,8 +215,14 @@ impl Container { Ok(()) } - pub async fn mknod(&self, node: &Path, (major, minor): (u32, u32)) -> Result<()> { - crate::util::namespace::MntNamespace::of_pid(self.pid)?.enter(|| { + pub async fn mknod( + &self, + node: &Path, + ty: DeviceType, + (major, minor): (u32, u32), + ) -> Result<()> { + let ns = crate::util::namespace::MntNamespace::of_pid(self.pid)?; + ns.enter(|| { if let Some(parent) = node.parent() { let _ = std::fs::create_dir_all(parent); } @@ -121,13 +230,15 @@ impl Container { rustix::fs::mknodat( rustix::fs::CWD, node, - FileType::CharacterDevice, + if ty == DeviceType::Character { + FileType::CharacterDevice + } else { + FileType::BlockDevice + }, Mode::from(0o644), rustix::fs::makedev(major, minor), )?; - if !self.uid.is_root() { - rustix::fs::chown(node, Some(self.uid), Some(self.gid))?; - } + std::os::unix::fs::chown(node, Some(ns.uid(self.uid)?), Some(ns.gid(self.gid)?))?; Ok(()) })? 
} @@ -150,13 +261,16 @@ impl Container { }) } - pub async fn device(&self, (major, minor): (u32, u32), access: Access) -> Result<()> { - self.cgroup_device_filter.lock().await.set_permission( - DeviceType::Character, - major, - minor, - access, - )?; + pub async fn device( + &self, + ty: DeviceType, + (major, minor): (u32, u32), + access: Access, + ) -> Result<()> { + self.cgroup_device_filter + .lock() + .await + .set_permission(ty, major, minor, access)?; Ok(()) } } diff --git a/src/util/namespace.rs b/src/util/namespace.rs index ab26164..4998238 100644 --- a/src/util/namespace.rs +++ b/src/util/namespace.rs @@ -1,20 +1,75 @@ use std::fs::File; use std::os::fd::AsFd; +use std::path::Path; -use anyhow::Result; +use anyhow::{Context, Result}; +use rustix::fs::{Gid, Uid}; use rustix::process::Pid; -use rustix::thread::{LinkNameSpaceType, UnshareFlags}; +use rustix::thread::{CapabilitiesSecureBits, LinkNameSpaceType, UnshareFlags}; + +pub struct IdMap { + map: Vec<(u32, u32, u32)>, +} + +impl IdMap { + fn read(path: &Path) -> Result { + Self::parse(&std::fs::read_to_string(path)?) + } + + fn parse(content: &str) -> Result { + let mut map = Vec::new(); + for line in content.lines() { + let mut words = line.split_ascii_whitespace(); + let inside = words.next().context("unexpected id_map")?.parse()?; + let outside = words.next().context("unexpected id_map")?.parse()?; + let count = words.next().context("unexpected id_map")?.parse()?; + map.push((inside, outside, count)); + } + Ok(Self { map }) + } + + fn translate(&self, id: u32) -> Option { + for &(inside, outside, count) in self.map.iter() { + if (inside..inside.checked_add(count)?).contains(&id) { + return (id - inside).checked_add(outside); + } + } + None + } +} pub struct MntNamespace { - fd: File, + mnt_fd: File, + uid_map: IdMap, + gid_map: IdMap, } impl MntNamespace { /// Open the mount namespace of a process. 
pub fn of_pid(pid: Pid) -> Result { - let path = format!("/proc/{}/ns/mnt", pid.as_raw_nonzero()); - let fd = File::open(path)?; - Ok(MntNamespace { fd }) + let mnt_fd = File::open(format!("/proc/{}/ns/mnt", pid.as_raw_nonzero()))?; + let uid_map = IdMap::read(format!("/proc/{}/uid_map", pid.as_raw_nonzero()).as_ref())?; + let gid_map = IdMap::read(format!("/proc/{}/gid_map", pid.as_raw_nonzero()).as_ref())?; + Ok(MntNamespace { + mnt_fd, + uid_map, + gid_map, + }) + } + + /// Check whether the process uses a user namespace, i.e. its UID or GID map is not the identity map `0 0 4294967295`. + pub fn in_user_ns(&self) -> bool { + !(self.uid_map.map == &[(0, 0, u32::MAX)] && self.gid_map.map == &[(0, 0, u32::MAX)]) + } + + /// Translate a UID inside the namespace into the corresponding UID outside it, using the process's `uid_map`; fails if the UID is not covered by the map. + pub fn uid(&self, uid: u32) -> Result { + Ok(self.uid_map.translate(uid).context("UID overflows")?) + } + + /// Translate a GID inside the namespace into the corresponding GID outside it, using the process's `gid_map`; fails if the GID is not covered by the map. + pub fn gid(&self, gid: u32) -> Result { + Ok(self.gid_map.translate(gid).context("GID overflows")?) } /// Enter the mount namespace. @@ -30,9 +85,35 @@ impl MntNamespace { // Switch this particular thread to the container's mount namespace. rustix::thread::move_into_link_name_space( - self.fd.as_fd(), + self.mnt_fd.as_fd(), Some(LinkNameSpaceType::Mount), )?; + + // If a user namespace is used, we must act as the root user *inside* + // the namespace to be able to create files properly (otherwise EOVERFLOW + // will be returned when creating a file). + // + // Entering the user namespace itself turns out to be problematic. + // The reason seems to be the check at [1], + // which means the `CAP_MKNOD` capability of the *init* namespace is needed. + // However, a task's security context is entirely relative to its current + // user namespace [2], so once you enter a user namespace there's no way of getting + // `CAP_MKNOD` of the init namespace back. + // (Yes, this means that even if CAP_MKNOD is granted to the container, you cannot + // create device nodes within it.) 
+ // + // [1]: https://elixir.bootlin.com/linux/v6.11.1/source/fs/namei.c#L4073 + // [2]: https://elixir.bootlin.com/linux/v6.11.1/source/include/linux/cred.h#L111 + + // By default `setuid` drops capabilities when transitioning from root + // to a non-root user. This bit prevents that, so this thread keeps its capabilities. + rustix::thread::set_capabilities_secure_bits( + CapabilitiesSecureBits::NO_SETUID_FIXUP, + )?; + + rustix::thread::set_thread_uid(unsafe { Uid::from_raw(self.uid(0)?) })?; + rustix::thread::set_thread_gid(unsafe { Gid::from_raw(self.gid(0)?) })?; + Ok(f()) }) .join()