]> git.proxmox.com Git - pve-lxc-syscalld.git/blame - src/pidfd.rs
clippy
[pve-lxc-syscalld.git] / src / pidfd.rs
CommitLineData
e420f6f9
WB
1//! pidfd helper functionality
2
512f780a
WB
3use std::collections::HashMap;
4use std::ffi::{CStr, CString, OsStr, OsString};
5use std::io::{self, BufRead, BufReader};
937921aa 6use std::os::raw::c_int;
512f780a 7use std::os::unix::ffi::{OsStrExt, OsStringExt};
e420f6f9
WB
8use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd, RawFd};
9
512f780a
WB
10use failure::{bail, Error};
11use libc::pid_t;
12
7ca1a14c 13use crate::c_try;
e420f6f9
WB
14use crate::nsfd::{ns_type, NsFd};
15use crate::tools::Fd;
e420f6f9 16
512f780a
WB
17pub struct PidFd(RawFd, pid_t);
18crate::file_descriptor_impl!(PidFd);
19
20#[derive(Default)]
21pub struct Uids {
22 pub ruid: libc::uid_t,
23 pub euid: libc::uid_t,
24 pub suid: libc::uid_t,
25 pub fsuid: libc::uid_t,
26 pub rgid: libc::gid_t,
27 pub egid: libc::gid_t,
28 pub sgid: libc::gid_t,
29 pub fsgid: libc::gid_t,
30}
31
/// Capability sets of a process, each stored as a 64 bit mask.
///
/// `PidFd::get_status` parses them from the hexadecimal `CapInh:`, `CapPrm:` and `CapEff:`
/// lines of the process' `status` file.
#[derive(Clone, Default)]
pub struct Capabilities {
    /// Value of the `CapInh:` line.
    inheritable: u64,
    /// Value of the `CapPrm:` line.
    permitted: u64,
    /// Value of the `CapEff:` line.
    effective: u64,
    //bounding: u64, // we don't care currently
}
39
40#[derive(Default)]
41pub struct ProcStatus {
42 uids: Uids,
43 capabilities: Capabilities,
44 umask: libc::mode_t,
45}
e420f6f9 46
1349eed4
WB
/// One line of a `uid_map` / `gid_map` file.
///
/// `PidFd::get_uid_gid_map` fills the fields from the three whitespace separated columns of
/// each line, in declaration order.
pub struct IdMapEntry {
    /// First id of the mapped range as seen inside the namespace.
    ns: u64,
    /// First id of the mapped range as seen on the host side.
    host: u64,
    /// Number of consecutive ids covered by this entry.
    range: u64,
}

/// An id mapping consisting of any number of [`IdMapEntry`] ranges.
pub struct IdMap(Vec<IdMapEntry>);

impl IdMap {
    /// Translate a host side `id` into the namespace.
    ///
    /// Returns `None` if no entry covers `id` on the host side.
    pub fn map_into(&self, id: u64) -> Option<u64> {
        self.0
            .iter()
            .find_map(|entry| Self::translate(id, entry.host, entry.ns, entry.range))
    }

    /// Translate a namespace side `id` to the host.
    ///
    /// Returns `None` if no entry covers `id` on the namespace side.
    pub fn map_from(&self, id: u64) -> Option<u64> {
        self.0
            .iter()
            .find_map(|entry| Self::translate(id, entry.ns, entry.host, entry.range))
    }

    /// Map `id` from the range starting at `from` (with `range` ids) onto the range starting at
    /// `to`.
    ///
    /// Working with the offset into the range (instead of comparing against `from + range`)
    /// keeps the range check free of overflows and guarantees that the source base is
    /// subtracted before the target base is added.
    fn translate(id: u64, from: u64, to: u64, range: u64) -> Option<u64> {
        let offset = id.checked_sub(from)?;
        if offset < range {
            to.checked_add(offset)
        } else {
            None
        }
    }
}
76
e420f6f9 77impl PidFd {
42f25756 78 pub fn current() -> io::Result<Self> {
7ca1a14c 79 let fd = c_try!(unsafe {
942f3c73
WB
80 libc::open(
81 b"/proc/self\0".as_ptr() as _,
82 libc::O_DIRECTORY | libc::O_CLOEXEC,
83 )
42f25756
WB
84 });
85
86 Ok(Self(fd, unsafe { libc::getpid() }))
87 }
88
512f780a 89 pub fn open(pid: pid_t) -> io::Result<Self> {
e420f6f9
WB
90 let path = CString::new(format!("/proc/{}", pid)).unwrap();
91
e935a000 92 let fd = c_try!(unsafe { libc::open(path.as_ptr(), libc::O_DIRECTORY | libc::O_CLOEXEC) });
e420f6f9 93
512f780a
WB
94 Ok(Self(fd, pid))
95 }
96
97 pub unsafe fn try_from_fd(fd: Fd) -> io::Result<Self> {
98 let mut this = Self(fd.into_raw_fd(), -1 as pid_t);
99 let pid = this.read_pid()?;
100 this.1 = pid;
101 Ok(this)
e420f6f9
WB
102 }
103
104 pub fn mount_namespace(&self) -> io::Result<NsFd<ns_type::Mount>> {
275009ec
WB
105 NsFd::openat(self.0, unsafe {
106 CStr::from_bytes_with_nul_unchecked(b"ns/mnt\0")
107 })
e420f6f9
WB
108 }
109
110 pub fn cgroup_namespace(&self) -> io::Result<NsFd<ns_type::Cgroup>> {
275009ec
WB
111 NsFd::openat(self.0, unsafe {
112 CStr::from_bytes_with_nul_unchecked(b"ns/cgroup\0")
113 })
e420f6f9
WB
114 }
115
116 pub fn user_namespace(&self) -> io::Result<NsFd<ns_type::User>> {
275009ec
WB
117 NsFd::openat(self.0, unsafe {
118 CStr::from_bytes_with_nul_unchecked(b"ns/user\0")
119 })
e420f6f9
WB
120 }
121
3bb4df0b 122 fn fd(&self, path: &CStr, flags: c_int, mode: c_int) -> io::Result<Fd> {
7ca1a14c 123 Ok(Fd(c_try!(unsafe {
e420f6f9
WB
124 libc::openat(
125 self.as_raw_fd(),
937921aa
WB
126 path.as_ptr() as *const _,
127 flags | libc::O_CLOEXEC,
3bb4df0b 128 mode,
e420f6f9
WB
129 )
130 })))
131 }
937921aa
WB
132
133 pub fn fd_cwd(&self) -> io::Result<Fd> {
512f780a
WB
134 self.fd(
135 unsafe { CStr::from_bytes_with_nul_unchecked(b"cwd\0") },
136 libc::O_DIRECTORY,
137 0,
138 )
937921aa
WB
139 }
140
141 pub fn fd_num(&self, num: RawFd, flags: c_int) -> io::Result<Fd> {
3bb4df0b 142 let path = format!("fd/{}\0", num);
512f780a
WB
143 self.fd(
144 unsafe { CStr::from_bytes_with_nul_unchecked(path.as_bytes()) },
145 flags,
146 0,
147 )
937921aa 148 }
275009ec 149
bff40ab9 150 pub fn enter_cwd(&self) -> io::Result<()> {
7ca1a14c 151 c_try!(unsafe { libc::fchdir(self.fd_cwd()?.as_raw_fd()) });
bff40ab9
WB
152 Ok(())
153 }
154
155 pub fn enter_chroot(&self) -> io::Result<()> {
7ca1a14c
WB
156 c_try!(unsafe { libc::fchdir(self.as_raw_fd()) });
157 c_try!(unsafe { libc::chroot(b"root\0".as_ptr() as *const _) });
158 c_try!(unsafe { libc::chdir(b"/\0".as_ptr() as *const _) });
512f780a
WB
159 Ok(())
160 }
3bb4df0b
WB
161
162 // procfs files cannot be async, we cannot add them to epoll...
163 pub fn open_file(&self, path: &CStr, flags: c_int, mode: c_int) -> io::Result<std::fs::File> {
164 Ok(unsafe { std::fs::File::from_raw_fd(self.fd(path, flags, mode)?.into_raw_fd()) })
165 }
166
512f780a
WB
167 #[inline]
168 fn open_buffered(&self, path: &CStr) -> io::Result<impl BufRead> {
169 Ok(BufReader::new(self.open_file(
170 path,
3bb4df0b
WB
171 libc::O_RDONLY | libc::O_CLOEXEC,
172 0,
512f780a
WB
173 )?))
174 }
175
176 #[inline]
177 pub fn get_pid(&self) -> pid_t {
178 self.1
179 }
180
181 fn read_pid(&self) -> io::Result<pid_t> {
182 let reader =
183 self.open_buffered(unsafe { CStr::from_bytes_with_nul_unchecked(b"status\0") })?;
184
185 for line in reader.lines() {
186 let line = line?;
187 let mut parts = line.split_ascii_whitespace();
188 if parts.next() == Some("Pid:") {
189 let pid = parts
190 .next()
191 .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "bad 'Pid:' line in proc"))?
192 .parse::<pid_t>()
193 .map_err(|_| {
194 io::Error::new(io::ErrorKind::Other, "failed to parse pid from proc")
195 })?;
196 return Ok(pid);
197 }
198 }
199
200 Err(io::ErrorKind::NotFound.into())
201 }
202
1349eed4
WB
203 #[inline]
204 fn __check_uid_gid(value: Option<&str>) -> io::Result<libc::uid_t> {
205 value
206 .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "bad 'Uid/Gid:' line in proc"))?
207 .parse::<libc::uid_t>()
208 .map_err(|_| io::Error::new(io::ErrorKind::Other, "failed to parse uid from proc"))
209 }
210
512f780a
WB
211 pub fn get_status(&self) -> io::Result<ProcStatus> {
212 let reader =
213 self.open_buffered(unsafe { CStr::from_bytes_with_nul_unchecked(b"status\0") })?;
214
512f780a
WB
215 #[inline]
216 fn check_u64_hex(value: Option<&str>) -> io::Result<u64> {
217 Ok(u64::from_str_radix(
218 value.ok_or_else(|| {
219 io::Error::new(io::ErrorKind::Other, "bad numeric property line in proc")
220 })?,
221 16,
222 )
223 .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?)
224 }
225
226 #[inline]
227 fn check_u32_oct(value: Option<&str>) -> io::Result<u32> {
228 Ok(u32::from_str_radix(
229 value.ok_or_else(|| {
230 io::Error::new(io::ErrorKind::Other, "bad numeric property line in proc")
231 })?,
232 8,
233 )
234 .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?)
235 }
3bb4df0b 236
512f780a
WB
237 let mut ids = Uids::default();
238 let mut caps = Capabilities::default();
239 let mut umask = 0o022;
3bb4df0b
WB
240 for line in reader.lines() {
241 let line = line?;
242 let mut parts = line.split_ascii_whitespace();
243 match parts.next() {
244 Some("Uid:") => {
1349eed4
WB
245 ids.ruid = Self::__check_uid_gid(parts.next())?;
246 ids.euid = Self::__check_uid_gid(parts.next())?;
247 ids.suid = Self::__check_uid_gid(parts.next())?;
248 ids.fsuid = Self::__check_uid_gid(parts.next())?;
3bb4df0b
WB
249 }
250 Some("Gid:") => {
1349eed4
WB
251 ids.rgid = Self::__check_uid_gid(parts.next())?;
252 ids.egid = Self::__check_uid_gid(parts.next())?;
253 ids.sgid = Self::__check_uid_gid(parts.next())?;
254 ids.fsgid = Self::__check_uid_gid(parts.next())?;
3bb4df0b 255 }
512f780a
WB
256 Some("CapInh:") => caps.inheritable = check_u64_hex(parts.next())?,
257 Some("CapPrm:") => caps.permitted = check_u64_hex(parts.next())?,
258 Some("CapEff:") => caps.effective = check_u64_hex(parts.next())?,
259 //Some("CapBnd:") => caps.bounding = check_u64_hex(parts.next())?,
260 Some("Umask:") => umask = check_u32_oct(parts.next())?,
3bb4df0b
WB
261 _ => continue,
262 }
512f780a
WB
263 }
264
265 Ok(ProcStatus {
266 uids: ids,
267 capabilities: caps,
268 umask,
269 })
270 }
271
272 pub fn get_cgroups(&self) -> Result<CGroups, Error> {
273 let reader =
274 self.open_buffered(unsafe { CStr::from_bytes_with_nul_unchecked(b"cgroup\0") })?;
275
276 let mut cgroups = CGroups::new();
277
278 for line in reader.split(b'\n') {
279 let line = line?;
280 let mut parts = line.splitn(3, |b| *b == b':');
281 let num = parts.next();
282 let name = parts.next();
283 let path = parts.next();
f3cae2a7 284 if num.is_none() || name.is_none() || path.is_none() || parts.next().is_some() {
512f780a 285 bail!("failed to parse cgroup line: {:?}", line);
3bb4df0b 286 }
512f780a
WB
287
288 let name = String::from_utf8(name.unwrap().to_vec())?;
289 let path = OsString::from_vec(path.unwrap().to_vec());
290
291 if name.len() == 0 {
292 cgroups.v2 = Some(path);
293 } else {
294 for entry in name.split(',') {
295 cgroups.v1.insert(entry.to_string(), path.clone());
296 }
297 }
298 }
299
300 Ok(cgroups)
301 }
302
1349eed4
WB
303 pub fn get_uid_gid_map(&self, file: &CStr) -> Result<IdMap, Error> {
304 let reader = self.open_buffered(file)?;
305
306 let mut entries = Vec::new();
307 for line in reader.lines() {
308 let line = line?;
309 let mut parts = line.split_ascii_whitespace();
310 let ns = Self::__check_uid_gid(parts.next())? as u64;
311 let host = Self::__check_uid_gid(parts.next())? as u64;
312 let range = Self::__check_uid_gid(parts.next())? as u64;
313 entries.push(IdMapEntry { ns, host, range });
314 }
315
316 Ok(IdMap(entries))
317 }
318
319 pub fn get_uid_map(&self) -> Result<IdMap, Error> {
320 self.get_uid_gid_map(unsafe { CStr::from_bytes_with_nul_unchecked(b"uid_map\0") })
321 }
322
323 pub fn get_gid_map(&self) -> Result<IdMap, Error> {
324 self.get_uid_gid_map(unsafe { CStr::from_bytes_with_nul_unchecked(b"gid_map\0") })
325 }
326
42f25756
WB
327 pub fn read_file(&self, file: &CStr) -> io::Result<Vec<u8>> {
328 use io::Read;
329
330 let mut reader = self.open_file(file, libc::O_RDONLY | libc::O_CLOEXEC, 0)?;
331 let mut out = Vec::new();
332 reader.read_to_end(&mut out)?;
333 Ok(out)
334 }
335
512f780a
WB
336 pub fn user_caps(&self) -> Result<UserCaps, Error> {
337 UserCaps::new(self)
338 }
339}
340
/// The cgroup membership of a process as parsed by `PidFd::get_cgroups`.
pub struct CGroups {
    /// Path per named (v1) controller.
    v1: HashMap<String, OsString>,
    /// Path taken from the line with an empty controller name, if there was one.
    v2: Option<OsString>,
}

impl CGroups {
    /// Create an empty set: no v1 entries and no v2 path.
    fn new() -> Self {
        let v1 = HashMap::new();
        let v2 = None;
        Self { v1, v2 }
    }

    /// Look up the path recorded for the v1 controller `name`.
    pub fn get(&self, name: &str) -> Option<&OsStr> {
        self.v1.get(name).map(OsString::as_os_str)
    }

    /// The v2 path, if any.
    pub fn v2(&self) -> Option<&OsStr> {
        self.v2.as_deref()
    }
}
362
363// Too lazy to bindgen libcap stuff...
bd05b957 364const CAPABILITY_VERSION_3: u32 = 0x2008_0522;
512f780a
WB
365
366/// Represents process capabilities.
367///
368/// This can be used to change the process' capability sets (if permitted by the kernel).
369impl Capabilities {
370 // We currently don't implement capget as it takes a pid which is racy on kernels without pidfd
371 // support. Later on we might support a `capget(&PidFd)` method?
372
373 /// Change our process capabilities. This does not include the bounding set.
374 pub fn capset(&self) -> io::Result<()> {
375 #![allow(dead_code)]
376 // kernel abi:
377 struct Header {
378 version: u32,
379 pid: c_int,
380 }
381
382 struct Data {
383 effective: u32,
384 permitted: u32,
385 inheritable: u32,
386 }
387
388 let header = Header {
389 version: CAPABILITY_VERSION_3,
390 pid: 0, // equivalent to gettid(),
391 };
3bb4df0b 392
512f780a
WB
393 let data = [
394 Data {
395 effective: self.effective as u32,
396 permitted: self.permitted as u32,
397 inheritable: self.inheritable as u32,
398 },
399 Data {
400 effective: (self.effective >> 32) as u32,
401 permitted: (self.permitted >> 32) as u32,
402 inheritable: (self.inheritable >> 32) as u32,
403 },
404 ];
405
7ca1a14c 406 c_try!(unsafe { libc::syscall(libc::SYS_capset, &header, &data) });
512f780a
WB
407
408 Ok(())
409 }
512f780a
WB
410}
411
412/// Helper to enter a process' permission-check environment.
413///
414/// When we execute a syscall on behalf of another process, we should try to trigger as many
415/// permission checks as we can. It is impractical to implement them all manually, so the best
416/// thing to do is cause as many of them to happen on the kernel-side as we can.
417///
bff40ab9
WB
418/// We start by entering the process' devices and v2 cgroup. As calls like `mknod()` may be
419/// affected, and access to devices as well.
420///
421/// Then we must enter the mount namespace, chroot and current working directory, in order to get
942f3c73 422/// the correct view of paths.
bff40ab9
WB
423///
424/// Next we copy the caller's `umask`.
512f780a 425///
bff40ab9
WB
426/// Then switch over our effective and file system uid and gid. This has 2 reasons: First, it means
427/// we do not need to run `chown()` on files we create, secondly, the user may have dropped
512f780a
WB
428/// `CAP_DAC_OVERRIDE` / `CAP_DAC_READ_SEARCH` which may have prevented the creation of the file in
429/// the first place (for example, the container program may be a non-root executable with
430/// `cap_mknod=ep` as file-capabilities, in which case we do not want a user to be allowed to run
bff40ab9 431/// `mknod()` on a path owned by different user (and checking file system permissions would
512f780a 432/// require us to handle ACLs, quotas, which are all file system tyep dependent as well, so better
bff40ab9 433/// leave all that up to the kernel, too!)).
512f780a 434///
bff40ab9
WB
435/// Next we clone the process' capability set. This is because the process may have dropped
436/// capabilties which under normal conditions would prevent them from executing the syscall. For
437/// example a process may be executing `mknod()` after having dropped `CAP_MKNOD`.
512f780a
WB
438#[derive(Clone)]
439#[must_use = "not using UserCaps may be a security issue"]
bff40ab9
WB
440pub struct UserCaps<'a> {
441 pidfd: &'a PidFd,
7470f14b 442 apply_uids: bool,
512f780a
WB
443 euid: libc::uid_t,
444 egid: libc::gid_t,
445 fsuid: libc::uid_t,
446 fsgid: libc::gid_t,
447 capabilities: Capabilities,
448 umask: libc::mode_t,
449 cgroup_v1_devices: Option<OsString>,
450 cgroup_v2: Option<OsString>,
42f25756 451 apparmor_profile: Option<OsString>,
512f780a
WB
452}
453
bff40ab9 454impl UserCaps<'_> {
512f780a
WB
455 pub fn new(pidfd: &PidFd) -> Result<UserCaps, Error> {
456 let status = pidfd.get_status()?;
457 let cgroups = pidfd.get_cgroups()?;
42f25756 458 let apparmor_profile = crate::apparmor::get_label(pidfd)?;
512f780a
WB
459
460 Ok(UserCaps {
bff40ab9 461 pidfd,
7470f14b 462 apply_uids: true,
512f780a
WB
463 euid: status.uids.euid,
464 egid: status.uids.egid,
465 fsuid: status.uids.fsuid,
466 fsgid: status.uids.fsgid,
467 capabilities: status.capabilities,
468 umask: status.umask,
469 cgroup_v1_devices: cgroups.get("devices").map(|s| s.to_owned()),
470 cgroup_v2: cgroups.v2().map(|s| s.to_owned()),
42f25756 471 apparmor_profile,
512f780a
WB
472 })
473 }
474
bff40ab9 475 fn apply_cgroups(&self) -> io::Result<()> {
512f780a
WB
476 fn enter_cgroup(kind: &str, name: &OsStr) -> io::Result<()> {
477 let mut path = OsString::with_capacity(15 + kind.len() + name.len() + 13 + 1);
478 path.push(OsStr::from_bytes(b"/sys/fs/cgroup/"));
479 path.push(kind);
480 path.push(name);
481 path.push(OsStr::from_bytes(b"/cgroup.procs"));
482 std::fs::write(path, b"0")
483 }
484
485 if let Some(ref cg) = self.cgroup_v1_devices {
486 enter_cgroup("devices/", cg)?;
487 }
488
489 if let Some(ref cg) = self.cgroup_v2 {
490 enter_cgroup("unified/", cg)?;
491 }
492
493 Ok(())
494 }
495
bff40ab9 496 fn apply_user_caps(&self) -> io::Result<()> {
738dbfbe 497 use crate::capability::SecureBits;
7470f14b
WB
498 if self.apply_uids {
499 unsafe {
500 libc::umask(self.umask);
501 }
502 let mut secbits = SecureBits::get_current()?;
503 secbits |= SecureBits::KEEP_CAPS | SecureBits::NO_SETUID_FIXUP;
504 secbits.apply()?;
505 c_try!(unsafe { libc::setegid(self.egid) });
506 c_try!(unsafe { libc::setfsgid(self.fsgid) });
507 c_try!(unsafe { libc::seteuid(self.euid) });
508 c_try!(unsafe { libc::setfsuid(self.fsuid) });
512f780a 509 }
512f780a
WB
510 self.capabilities.capset()?;
511 Ok(())
3bb4df0b 512 }
bff40ab9 513
7470f14b
WB
514 pub fn disable_uid_change(&mut self) {
515 self.apply_uids = false;
516 }
517
518 pub fn disable_cgroup_change(&mut self) {
519 self.cgroup_v1_devices = None;
520 self.cgroup_v2 = None;
521 }
522
42f25756 523 pub fn apply(self, own_pidfd: &PidFd) -> io::Result<()> {
bff40ab9
WB
524 self.apply_cgroups()?;
525 self.pidfd.mount_namespace()?.setns()?;
526 self.pidfd.enter_chroot()?;
527 self.pidfd.enter_cwd()?;
42f25756
WB
528 if let Some(ref label) = self.apparmor_profile {
529 crate::apparmor::set_label(own_pidfd, label)?;
530 }
bff40ab9
WB
531 self.apply_user_caps()?;
532 Ok(())
533 }
e420f6f9 534}