Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support containers launched with user namespace #7

Merged
merged 3 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ tokio = { version = "1", features = ["full"] }
tokio-stream = "0.1"
async-stream = "0.3"
udev = "0.8"
rustix = { version = "0.38", features = ["fs", "stdio", "process", "thread", "pipe"] }
rustix = { version = "0.38", features = ["fs", "stdio", "process", "thread", "pipe", "mount"] }
bitflags = "2"
once_cell = "1"
humantime = "2"
Expand Down
3 changes: 3 additions & 0 deletions shell.nix
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,8 @@ pkgs.mkShell {

# For llvm-objdump
llvmPackages.bintools

# To aid testing
runc
];
}
2 changes: 1 addition & 1 deletion src/cgroup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ use std::mem::ManuallyDrop;
use std::path::{Path, PathBuf};

// The numerical representation below needs to match BPF_DEVCG constants.
#[allow(unused)]
#[repr(u32)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeviceType {
Block = 1,
Character = 2,
Expand Down
30 changes: 19 additions & 11 deletions src/dev/device.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
use std::fmt::{self, Display, Formatter};
use std::path::{Path, PathBuf};

use crate::cgroup::DeviceType;

#[derive(Debug, Clone)]
pub struct DevNode {
pub path: PathBuf,
pub ty: DeviceType,
pub devnum: (u32, u32),
}

Expand All @@ -16,17 +19,22 @@ pub struct Device {

impl Device {
pub fn from_udev(device: udev::Device) -> Self {
let devnode = device
.devnum()
.zip(device.devnode())
.map(|(devnum, devnode)| {
let major = rustix::fs::major(devnum);
let minor = rustix::fs::minor(devnum);
DevNode {
path: devnode.to_owned(),
devnum: (major, minor),
}
});
let devnode = device.devnode().and_then(|devnode| {
let devnum = device.devnum()?;
let major = rustix::fs::major(devnum);
let minor = rustix::fs::minor(devnum);
// Only the block subsystem produces block devices; everything else is a character device.
let ty = if device.subsystem()? == "block" {
DeviceType::Block
} else {
DeviceType::Character
};
Some(DevNode {
path: devnode.to_owned(),
ty,
devnum: (major, minor),
})
});
Self { device, devnode }
}

Expand Down
10 changes: 7 additions & 3 deletions src/hotplug/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,12 @@ impl HotPlug {
.filter_map(|dev| dev.matches(&device))
.collect();

self.container.device(devnode.devnum, Access::all()).await?;
self.container.mknod(&devnode.path, devnode.devnum).await?;
self.container
.device(devnode.ty, devnode.devnum, Access::all())
.await?;
self.container
.mknod(&devnode.path, devnode.ty, devnode.devnum)
.await?;
for symlink in &symlinks {
self.container.symlink(&devnode.path, symlink).await?;
}
Expand All @@ -89,7 +93,7 @@ impl HotPlug {

let devnode = device.devnode().unwrap();
self.container
.device(devnode.devnum, Access::empty())
.device(devnode.ty, devnode.devnum, Access::empty())
.await?;
self.container.rm(&devnode.path).await?;
for symlink in &device.symlinks {
Expand Down
162 changes: 138 additions & 24 deletions src/runc/container.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
use std::fs::File;
use std::fs::{File, Permissions};
use std::io::{BufRead, BufReader, Seek};
use std::os::fd::AsFd;
use std::os::unix::fs::{FileTypeExt, MetadataExt, PermissionsExt};
use std::path::Path;

use anyhow::{bail, ensure, Context, Result};
use rustix::fs::{FileType, Gid, Mode, Uid};
use anyhow::{bail, Context, Result};
use rustix::fs::{FileType, Mode, UnmountFlags};
use rustix::mount::{FsMountFlags, FsOpenFlags, MountAttrFlags, MoveMountFlags};
use rustix::process::{Pid, Signal};
use tokio::io::unix::AsyncFd;
use tokio::io::Interest;
Expand Down Expand Up @@ -63,8 +66,10 @@ impl CgroupEventNotifier {
}

pub struct Container {
uid: Uid,
gid: Gid,
// Uid and gid of the primary container user.
// Note that they're inside the user namespace (if any).
uid: u32,
gid: u32,
pid: Pid,
wait: tokio::sync::watch::Receiver<bool>,
cgroup_device_filter: Mutex<Box<dyn DeviceAccessController + Send>>,
Expand All @@ -87,15 +92,113 @@ impl Container {
Box::new(DeviceAccessControllerV2::new(&state.cgroup_paths.unified)?)
};

ensure!(config.process.user.uid != u32::MAX && config.process.user.gid != u32::MAX);

Ok(Self {
uid: unsafe { Uid::from_raw(config.process.user.uid) },
gid: unsafe { Gid::from_raw(config.process.user.gid) },
let container = Self {
uid: config.process.user.uid,
gid: config.process.user.gid,
pid: Pid::from_raw(state.init_process_pid.try_into()?).context("Invalid PID")?,
wait: recv,
cgroup_device_filter: Mutex::new(cgroup_device_filter),
})
};

container.remount_dev()?;

Ok(container)
}

/// Remount /dev inside the init namespace.
///
/// When a user namespace is used, the /dev created by runc is mounted inside the user namespace
/// and automatically gains the SB_I_NODEV flag as a kernel security measure.
///
/// That flag does us no favours: it makes device nodes within the mount unopenable. To work
/// around it, a fresh tmpfs is created before entering the container's mount namespace, moved
/// over `/dev` once inside it, and the contents of the old `/dev` are migrated to the new one.
///
/// Does nothing and returns `Ok(())` when `ns.in_user_ns()` reports false.
///
/// # Errors
///
/// Returns an error if any mount, filesystem or ownership operation fails, or if the old `/dev`
/// contains an entry that is not `console`, a directory, a symlink or a character device.
// NOTE(review): there is no rollback; a failure part-way through returns early and leaves
// whatever state (e.g. `/olddev`) has been created so far.
fn remount_dev(&self) -> Result<()> {
let ns = crate::util::namespace::MntNamespace::of_pid(self.pid)?;
if !ns.in_user_ns() {
return Ok(());
}

log::info!("Remount /dev to allow device node access");

// Create a tmpfs and mount it in the init namespace.
// Note that while we have "mounted" it, it is not associated with any mount point yet.
// The actual mounting will happen after we have moved into the mount namespace.
let dev_fs = rustix::mount::fsopen("tmpfs", FsOpenFlags::empty())?;
rustix::mount::fsconfig_create(dev_fs.as_fd())?;
let dev_mnt = rustix::mount::fsmount(
dev_fs.as_fd(),
FsMountFlags::FSMOUNT_CLOEXEC,
MountAttrFlags::empty(),
)?;

// NOTE(review): `enter` presumably runs the closure inside the container's mount
// namespace; the paths below are then resolved relative to the container root. Confirm.
ns.enter(|| -> Result<_> {
// Clear the umask so that it does not interfere with the modes we set below.
rustix::process::umask(Mode::empty());

// Move the existing mount elsewhere.
std::fs::create_dir("/olddev")?;
rustix::mount::mount_move("/dev", "/olddev")?;

// Attach our newly created tmpfs mount at `/dev`.
// The empty source path together with MOVE_MOUNT_F_EMPTY_PATH refers to `dev_mnt` itself.
rustix::mount::move_mount(
dev_mnt.as_fd(),
"",
rustix::fs::CWD,
"/dev",
MoveMountFlags::MOVE_MOUNT_F_EMPTY_PATH,
)?;

// Make sure /dev is now owned by the container root, not the host root.
// NOTE(review): `ns.uid(0)` / `ns.gid(0)` presumably translate container id 0 into the
// corresponding id outside the user namespace — confirm.
std::os::unix::fs::chown("/dev", Some(ns.uid(0)?), Some(ns.gid(0)?))?;
std::fs::set_permissions("/dev", Permissions::from_mode(0o755))?;

// Migrate every entry of the old /dev into the new one, according to its kind.
for file in std::fs::read_dir("/olddev")? {
let file = file?;
let metadata = file.metadata()?;
let new_path = Path::new("/dev").join(file.file_name());

if file.file_name() == "console" {
// `console` is special: it's a file, but it should be bind-mounted.
// Create an empty file to serve as the mount target, then move the mount onto it.
drop(
std::fs::OpenOptions::new()
.create(true)
.write(true)
.open(&new_path)?,
);
rustix::mount::mount_move(file.path(), new_path)?;
} else if metadata.file_type().is_dir() {
// This is a mount point, e.g. pts, mqueue, shm.
std::fs::create_dir(&new_path)?;
rustix::mount::mount_move(file.path(), new_path)?;
} else if metadata.file_type().is_symlink() {
// Recreate the symlink with the same target.
let target = std::fs::read_link(file.path())?;
std::os::unix::fs::symlink(target, new_path)?;
} else if metadata.file_type().is_char_device() {
// Recreate the device node with the same device number and the old node's mode.
let dev = metadata.rdev();
rustix::fs::mknodat(
rustix::fs::CWD,
&new_path,
FileType::CharacterDevice,
Mode::from_raw_mode(metadata.mode()),
dev,
)?;

// The old file might be a bind mount. Try to unmount it; failure is deliberately ignored.
let _ = rustix::mount::unmount(file.path(), UnmountFlags::DETACH);
} else {
// Anything else (e.g. a regular file or a block device) is not handled.
anyhow::bail!("Unknown file present in /dev");
}
}

// Now that we have moved everything to the new /dev, obliterate the old one.
rustix::mount::unmount("/olddev", UnmountFlags::DETACH)?;
std::fs::remove_dir("/olddev")?;

Ok(())
})??;
// The double `?` above unwraps two layers: the result of `enter` itself and the closure's own result.

Ok(())
}

pub async fn kill(&self, signal: Signal) -> Result<()> {
Expand All @@ -112,22 +215,30 @@ impl Container {
Ok(())
}

pub async fn mknod(&self, node: &Path, (major, minor): (u32, u32)) -> Result<()> {
crate::util::namespace::MntNamespace::of_pid(self.pid)?.enter(|| {
pub async fn mknod(
&self,
node: &Path,
ty: DeviceType,
(major, minor): (u32, u32),
) -> Result<()> {
let ns = crate::util::namespace::MntNamespace::of_pid(self.pid)?;
ns.enter(|| {
if let Some(parent) = node.parent() {
let _ = std::fs::create_dir_all(parent);
}
let _ = std::fs::remove_file(node);
rustix::fs::mknodat(
rustix::fs::CWD,
node,
FileType::CharacterDevice,
if ty == DeviceType::Character {
FileType::CharacterDevice
} else {
FileType::BlockDevice
},
Mode::from(0o644),
rustix::fs::makedev(major, minor),
)?;
if !self.uid.is_root() {
rustix::fs::chown(node, Some(self.uid), Some(self.gid))?;
}
std::os::unix::fs::chown(node, Some(ns.uid(self.uid)?), Some(ns.gid(self.gid)?))?;
Ok(())
})?
}
Expand All @@ -150,13 +261,16 @@ impl Container {
})
}

pub async fn device(&self, (major, minor): (u32, u32), access: Access) -> Result<()> {
self.cgroup_device_filter.lock().await.set_permission(
DeviceType::Character,
major,
minor,
access,
)?;
pub async fn device(
&self,
ty: DeviceType,
(major, minor): (u32, u32),
access: Access,
) -> Result<()> {
self.cgroup_device_filter
.lock()
.await
.set_permission(ty, major, minor, access)?;
Ok(())
}
}
Loading
Loading