]> git.proxmox.com Git - pve-lxc-syscalld.git/blame - src/process/user_caps.rs
support pure cgroupv2 environments
[pve-lxc-syscalld.git] / src / process / user_caps.rs
CommitLineData
3bbd1db0
WB
1//! User and capability management.
2
3use std::ffi::{OsStr, OsString};
4use std::io;
5use std::os::unix::ffi::OsStrExt;
6
8150a439 7use anyhow::Error;
3bbd1db0
WB
8
9use super::PidFd;
10use crate::capability::Capabilities;
11
12/// Helper to enter a process' permission-check environment.
13///
14/// When we execute a syscall on behalf of another process, we should try to trigger as many
15/// permission checks as we can. It is impractical to implement them all manually, so the best
16/// thing to do is cause as many of them to happen on the kernel-side as we can.
17///
18/// We start by entering the process' devices and v2 cgroup, since calls like `mknod()` may be
19/// affected by them, as may access to devices in general.
20///
21/// Then we must enter the mount namespace, chroot and current working directory, in order to get
22/// the correct view of paths.
23///
24/// Next we copy the caller's `umask`.
25///
26/// Then switch over our effective and file system uid and gid. This has 2 reasons: First, it means
27/// we do not need to run `chown()` on files we create, secondly, the user may have dropped
28/// `CAP_DAC_OVERRIDE` / `CAP_DAC_READ_SEARCH` which may have prevented the creation of the file in
29/// the first place (for example, the container program may be a non-root executable with
30/// `cap_mknod=ep` as file-capabilities, in which case we do not want a user to be allowed to run
31/// `mknod()` on a path owned by different user (and checking file system permissions would
32/// require us to handle ACLs, quotas, which are all file system type dependent as well, so better
33/// leave all that up to the kernel, too!)).
34///
35/// Next we clone the process' capability set. This is because the process may have dropped
36/// capabilities which under normal conditions would prevent them from executing the syscall. For
37/// example a process may be executing `mknod()` after having dropped `CAP_MKNOD`.
38#[derive(Clone)]
39#[must_use = "not using UserCaps may be a security issue"]
40pub struct UserCaps<'a> {
41 pidfd: &'a PidFd,
42 apply_uids: bool,
43 euid: libc::uid_t,
44 egid: libc::gid_t,
45 fsuid: libc::uid_t,
46 fsgid: libc::gid_t,
47 capabilities: Capabilities,
48 umask: libc::mode_t,
49 cgroup_v1_devices: Option<OsString>,
fe73c2fb 50 cgroup_v2_base: &'static str,
3bbd1db0
WB
51 cgroup_v2: Option<OsString>,
52 apparmor_profile: Option<OsString>,
53}
54
55impl UserCaps<'_> {
56 pub fn new(pidfd: &PidFd) -> Result<UserCaps, Error> {
57 let status = pidfd.get_status()?;
58 let cgroups = pidfd.get_cgroups()?;
59 let apparmor_profile = crate::apparmor::get_label(pidfd)?;
60
61 Ok(UserCaps {
62 pidfd,
63 apply_uids: true,
64 euid: status.uids.euid,
65 egid: status.uids.egid,
66 fsuid: status.uids.fsuid,
67 fsgid: status.uids.fsgid,
68 capabilities: status.capabilities,
69 umask: status.umask,
70 cgroup_v1_devices: cgroups.get("devices").map(|s| s.to_owned()),
fe73c2fb 71 cgroup_v2_base: if cgroups.has_v1() { "unified/" } else { "" },
3bbd1db0
WB
72 cgroup_v2: cgroups.v2().map(|s| s.to_owned()),
73 apparmor_profile,
74 })
75 }
76
77 fn apply_cgroups(&self) -> io::Result<()> {
fe73c2fb
WB
78 // FIXME: Handle `kind` taking /proc/self/mountinfo into account instead of assuming
79 // "unified/"
3bbd1db0
WB
80 fn enter_cgroup(kind: &str, name: &OsStr) -> io::Result<()> {
81 let mut path = OsString::with_capacity(15 + kind.len() + name.len() + 13 + 1);
82 path.push(OsStr::from_bytes(b"/sys/fs/cgroup/"));
83 path.push(kind);
84 path.push(name);
85 path.push(OsStr::from_bytes(b"/cgroup.procs"));
86 std::fs::write(path, b"0")
87 }
88
89 if let Some(ref cg) = self.cgroup_v1_devices {
90 enter_cgroup("devices/", cg)?;
91 }
92
93 if let Some(ref cg) = self.cgroup_v2 {
fe73c2fb 94 enter_cgroup(self.cgroup_v2_base, cg)?;
3bbd1db0
WB
95 }
96
97 Ok(())
98 }
99
100 fn apply_user_caps(&self) -> io::Result<()> {
101 use crate::capability::SecureBits;
102 if self.apply_uids {
103 unsafe {
104 libc::umask(self.umask);
105 }
106 let mut secbits = SecureBits::get_current()?;
107 secbits |= SecureBits::KEEP_CAPS | SecureBits::NO_SETUID_FIXUP;
108 secbits.apply()?;
109 c_try!(unsafe { libc::setegid(self.egid) });
110 c_try!(unsafe { libc::setfsgid(self.fsgid) });
111 c_try!(unsafe { libc::seteuid(self.euid) });
112 c_try!(unsafe { libc::setfsuid(self.fsuid) });
113 }
114 self.capabilities.capset()?;
115 Ok(())
116 }
117
118 pub fn disable_uid_change(&mut self) {
119 self.apply_uids = false;
120 }
121
122 pub fn disable_cgroup_change(&mut self) {
123 self.cgroup_v1_devices = None;
124 self.cgroup_v2 = None;
125 }
126
127 pub fn apply(self, own_pidfd: &PidFd) -> io::Result<()> {
128 self.apply_cgroups()?;
129 self.pidfd.mount_namespace()?.setns()?;
130 self.pidfd.enter_chroot()?;
131 self.pidfd.enter_cwd()?;
132 if let Some(ref label) = self.apparmor_profile {
133 crate::apparmor::set_label(own_pidfd, label)?;
134 }
135 self.apply_user_caps()?;
136 Ok(())
137 }
138}