diff --git a/Cargo.toml b/Cargo.toml index 9ac2e72..822cdeb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,7 @@ async-trait = "0.1.77" backhand = "0.15.0" byteorder = "1" bytes = "1.5.0" +cgroups-rs = "0.3.4" cli-tables = "0.2.1" crossterm = "0.27.0" ctrlc = "3.4.4" diff --git a/crates/guest/Cargo.toml b/crates/guest/Cargo.toml index b1ab93c..ecb86c6 100644 --- a/crates/guest/Cargo.toml +++ b/crates/guest/Cargo.toml @@ -10,6 +10,7 @@ resolver = "2" [dependencies] anyhow = { workspace = true } +cgroups-rs = { workspace = true } env_logger = { workspace = true } futures = { workspace = true } ipnetwork = { workspace = true } diff --git a/crates/guest/src/background.rs b/crates/guest/src/background.rs index d0cc092..8252a6f 100644 --- a/crates/guest/src/background.rs +++ b/crates/guest/src/background.rs @@ -3,6 +3,7 @@ use crate::{ death, }; use anyhow::Result; +use cgroups_rs::Cgroup; use krata::idm::{ client::IdmClient, protocol::{idm_event::Event, IdmEvent, IdmExitEvent, IdmPacket}, @@ -14,14 +15,16 @@ use tokio::select; pub struct GuestBackground { idm: IdmClient, child: Pid, + _cgroup: Cgroup, wait: ChildWait, } impl GuestBackground { - pub async fn new(idm: IdmClient, child: Pid) -> Result { + pub async fn new(idm: IdmClient, cgroup: Cgroup, child: Pid) -> Result { Ok(GuestBackground { idm, child, + _cgroup: cgroup, wait: ChildWait::new()?, }) } diff --git a/crates/guest/src/init.rs b/crates/guest/src/init.rs index 90f12c8..501964b 100644 --- a/crates/guest/src/init.rs +++ b/crates/guest/src/init.rs @@ -1,4 +1,7 @@ use anyhow::{anyhow, Result}; +use cgroups_rs::cgroup_builder::CgroupBuilder; +use cgroups_rs::devices::DeviceType; +use cgroups_rs::{Cgroup, CgroupPid}; use futures::stream::TryStreamExt; use ipnetwork::IpNetwork; use krata::ethtool::EthtoolHandle; @@ -7,6 +10,7 @@ use krata::launchcfg::{LaunchInfo, LaunchNetwork}; use libc::{setsid, TIOCSCTTY}; use log::{trace, warn}; use nix::ioctl_write_int_bad; +use nix::sys::stat::{major, minor}; use nix::unistd::{dup2, execve, fork, ForkResult, Pid}; use oci_spec::image::{Config, ImageConfiguration}; use path_absolutize::Absolutize; @@ -18,7 +22,7 @@ use std::net::{Ipv4Addr, Ipv6Addr}; use std::os::fd::AsRawFd; use std::os::linux::fs::MetadataExt; use std::os::unix::ffi::OsStrExt; -use std::os::unix::fs::{chroot, symlink, PermissionsExt}; +use std::os::unix::fs::{chroot, PermissionsExt}; use std::path::{Path, PathBuf}; use std::str::FromStr; use sys_mount::{FilesystemType, Mount, MountFlags}; @@ -112,14 +116,20 @@ impl GuestInit { trace!("early init"); self.create_dir("/dev", Some(0o0755)).await?; self.create_dir("/proc", None).await?; - self.create_dir("/sys", None).await?; + self.create_dir("/sys", Some(0o0555)).await?; self.create_dir("/root", Some(0o0700)).await?; self.create_dir("/tmp", None).await?; - self.mount_kernel_fs("devtmpfs", "/dev", "mode=0755") + self.create_dir("/run", Some(0o0755)).await?; + self.mount_kernel_fs("devtmpfs", "/dev", "mode=0755", None) + .await?; + self.mount_kernel_fs("proc", "/proc", "", None).await?; + self.mount_kernel_fs("sysfs", "/sys", "", None).await?; + fs::symlink("/proc/self/fd", "/dev/fd").await?; + fs::symlink("/proc/self/fd/0", "/dev/stdin").await?; + fs::symlink("/proc/self/fd/1", "/dev/stdout").await?; + fs::symlink("/proc/self/fd/2", "/dev/stderr").await?; + self.mount_kernel_fs("cgroup2", "/sys/fs/cgroup", "", Some(MountFlags::RELATIME)) .await?; - self.mount_kernel_fs("proc", "/proc", "").await?; - self.mount_kernel_fs("sysfs", "/sys", "").await?; - symlink("/proc/self/fd", "/dev/fd")?; Ok(()) } @@ -137,16 +147,19 @@ impl GuestInit { Ok(()) } - async fn mount_kernel_fs(&mut self, fstype: &str, path: &str, data: &str) -> Result<()> { - let metadata = fs::metadata(path).await?; - if metadata.st_dev() == fs::metadata("/").await?.st_dev() { - trace!("mounting kernel fs {} to {}", fstype, path); - Mount::builder() - .fstype(FilesystemType::Manual(fstype)) - .flags(MountFlags::NOEXEC | MountFlags::NOSUID) - .data(data) - .mount(fstype, path)?; - } + async fn mount_kernel_fs( + &mut self, + fstype: &str, + path: &str, + data: &str, + flags: Option, + ) -> Result<()> { + trace!("mounting kernel fs {} to {}", fstype, path); + Mount::builder() + .fstype(FilesystemType::Manual(fstype)) + .flags(MountFlags::NOEXEC | MountFlags::NOSUID | flags.unwrap_or(MountFlags::empty())) + .data(data) + .mount(fstype, path)?; Ok(()) } @@ -454,10 +467,44 @@ impl GuestInit { working_dir = "/".to_string(); } - self.fork_and_exec(idm, working_dir, path, cmd, env).await?; + let cgroup = self.init_cgroup().await?; + self.fork_and_exec(idm, cgroup, working_dir, path, cmd, env) + .await?; Ok(()) } + async fn init_cgroup(&self) -> Result { + trace!("initializing cgroup"); + let hierarchy = cgroups_rs::hierarchies::auto(); + let cgroup = CgroupBuilder::new("krata-guest-task"); + + let idm_device = fs::metadata("/dev/hvc1").await?.st_rdev(); + let config_block = fs::metadata(CONFIG_BLOCK_DEVICE_PATH).await?.st_rdev(); + + let cgroup = cgroup + .devices() + .device( + major(idm_device) as i64, + minor(idm_device) as i64, + DeviceType::All, + false, + Vec::new(), + ) + .device( + major(config_block) as i64, + minor(config_block) as i64, + DeviceType::All, + false, + Vec::new(), + ) + .done(); + + let cgroup = cgroup.build(hierarchy)?; + cgroup.set_cgroup_type("threaded")?; + trace!("initialized cgroup"); + Ok(cgroup) + } + fn strings_as_cstrings(values: Vec) -> Result> { let mut results: Vec = vec![]; for value in values { @@ -506,19 +553,21 @@ impl GuestInit { async fn fork_and_exec( &mut self, idm: IdmClient, + cgroup: Cgroup, working_dir: String, path: CString, cmd: Vec, env: Vec, ) -> Result<()> { match unsafe { fork()? } { - ForkResult::Parent { child } => self.background(idm, child).await, - ForkResult::Child => self.foreground(working_dir, path, cmd, env).await, + ForkResult::Parent { child } => self.background(idm, cgroup, child).await, + ForkResult::Child => self.foreground(cgroup, working_dir, path, cmd, env).await, } } async fn foreground( &mut self, + cgroup: Cgroup, working_dir: String, path: CString, cmd: Vec, @@ -526,6 +575,7 @@ impl GuestInit { ) -> Result<()> { GuestInit::set_controlling_terminal()?; std::env::set_current_dir(working_dir)?; + cgroup.add_task(CgroupPid::from(std::process::id() as u64))?; execve(&path, &cmd, &env)?; Ok(()) } @@ -538,8 +588,8 @@ impl GuestInit { Ok(()) } - async fn background(&mut self, idm: IdmClient, executed: Pid) -> Result<()> { - let mut background = GuestBackground::new(idm, executed).await?; + async fn background(&mut self, idm: IdmClient, cgroup: Cgroup, executed: Pid) -> Result<()> { + let mut background = GuestBackground::new(idm, cgroup, executed).await?; background.run().await?; Ok(()) }