]> git.proxmox.com Git - pve-lxc-syscalld.git/blame - src/pidfd.rs
introduce c_str macro
[pve-lxc-syscalld.git] / src / pidfd.rs
CommitLineData
e420f6f9
WB
1//! pidfd helper functionality
2
512f780a
WB
3use std::collections::HashMap;
4use std::ffi::{CStr, CString, OsStr, OsString};
5use std::io::{self, BufRead, BufReader};
937921aa 6use std::os::raw::c_int;
512f780a 7use std::os::unix::ffi::{OsStrExt, OsStringExt};
e420f6f9
WB
8use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd, RawFd};
9
512f780a
WB
10use failure::{bail, Error};
11use libc::pid_t;
12
e420f6f9
WB
13use crate::nsfd::{ns_type, NsFd};
14use crate::tools::Fd;
e420f6f9 15
512f780a 16pub struct PidFd(RawFd, pid_t);
9aa2a15a 17file_descriptor_impl!(PidFd);
512f780a
WB
18
19#[derive(Default)]
20pub struct Uids {
21 pub ruid: libc::uid_t,
22 pub euid: libc::uid_t,
23 pub suid: libc::uid_t,
24 pub fsuid: libc::uid_t,
25 pub rgid: libc::gid_t,
26 pub egid: libc::gid_t,
27 pub sgid: libc::gid_t,
28 pub fsgid: libc::gid_t,
29}
30
/// Capability sets of a process, each stored as a 64 bit mask.
///
/// The bounding set is not tracked, we currently do not care about it.
#[derive(Clone, Default)]
pub struct Capabilities {
    /// The inheritable capability set.
    inheritable: u64,
    /// The permitted capability set.
    permitted: u64,
    /// The effective capability set.
    effective: u64,
}
38
39#[derive(Default)]
40pub struct ProcStatus {
41 uids: Uids,
42 capabilities: Capabilities,
43 umask: libc::mode_t,
44}
e420f6f9 45
1349eed4
WB
/// A single range of an id mapping.
///
/// `range` consecutive ids starting at `ns` inside the namespace correspond to the `range`
/// consecutive ids starting at `host` outside of it.
pub struct IdMapEntry {
    /// First id of the range as seen inside the namespace.
    ns: u64,
    /// First id of the range as seen on the host side.
    host: u64,
    /// Number of ids covered by this entry.
    range: u64,
}

/// An id mapping consisting of any number of ranges.
pub struct IdMap(Vec<IdMapEntry>);

impl IdMap {
    /// Translate a host side `id` into the namespace.
    ///
    /// Returns `None` if no entry covers `id`.
    pub fn map_into(&self, id: u64) -> Option<u64> {
        self.0
            .iter()
            .find_map(|entry| match id.checked_sub(entry.host) {
                // `offset` is the position of `id` within the host side range.
                Some(offset) if offset < entry.range => entry.ns.checked_add(offset),
                _ => None,
            })
    }

    /// Translate a namespace side `id` to the host side.
    ///
    /// Returns `None` if no entry covers `id`.
    pub fn map_from(&self, id: u64) -> Option<u64> {
        self.0
            .iter()
            .find_map(|entry| match id.checked_sub(entry.ns) {
                // Only the offset within the range may be added to the host base. Adding the
                // whole `id` is wrong for every entry whose namespace range does not start at 0.
                Some(offset) if offset < entry.range => entry.host.checked_add(offset),
                _ => None,
            })
    }
}
75
e420f6f9 76impl PidFd {
42f25756 77 pub fn current() -> io::Result<Self> {
7ca1a14c 78 let fd = c_try!(unsafe {
942f3c73
WB
79 libc::open(
80 b"/proc/self\0".as_ptr() as _,
81 libc::O_DIRECTORY | libc::O_CLOEXEC,
82 )
42f25756
WB
83 });
84
85 Ok(Self(fd, unsafe { libc::getpid() }))
86 }
87
512f780a 88 pub fn open(pid: pid_t) -> io::Result<Self> {
e420f6f9
WB
89 let path = CString::new(format!("/proc/{}", pid)).unwrap();
90
e935a000 91 let fd = c_try!(unsafe { libc::open(path.as_ptr(), libc::O_DIRECTORY | libc::O_CLOEXEC) });
e420f6f9 92
512f780a
WB
93 Ok(Self(fd, pid))
94 }
95
96 pub unsafe fn try_from_fd(fd: Fd) -> io::Result<Self> {
97 let mut this = Self(fd.into_raw_fd(), -1 as pid_t);
98 let pid = this.read_pid()?;
99 this.1 = pid;
100 Ok(this)
e420f6f9
WB
101 }
102
103 pub fn mount_namespace(&self) -> io::Result<NsFd<ns_type::Mount>> {
275009ec
WB
104 NsFd::openat(self.0, unsafe {
105 CStr::from_bytes_with_nul_unchecked(b"ns/mnt\0")
106 })
e420f6f9
WB
107 }
108
109 pub fn cgroup_namespace(&self) -> io::Result<NsFd<ns_type::Cgroup>> {
275009ec
WB
110 NsFd::openat(self.0, unsafe {
111 CStr::from_bytes_with_nul_unchecked(b"ns/cgroup\0")
112 })
e420f6f9
WB
113 }
114
115 pub fn user_namespace(&self) -> io::Result<NsFd<ns_type::User>> {
275009ec
WB
116 NsFd::openat(self.0, unsafe {
117 CStr::from_bytes_with_nul_unchecked(b"ns/user\0")
118 })
e420f6f9
WB
119 }
120
3bb4df0b 121 fn fd(&self, path: &CStr, flags: c_int, mode: c_int) -> io::Result<Fd> {
7ca1a14c 122 Ok(Fd(c_try!(unsafe {
e420f6f9
WB
123 libc::openat(
124 self.as_raw_fd(),
937921aa
WB
125 path.as_ptr() as *const _,
126 flags | libc::O_CLOEXEC,
3bb4df0b 127 mode,
e420f6f9
WB
128 )
129 })))
130 }
937921aa
WB
131
132 pub fn fd_cwd(&self) -> io::Result<Fd> {
512f780a
WB
133 self.fd(
134 unsafe { CStr::from_bytes_with_nul_unchecked(b"cwd\0") },
135 libc::O_DIRECTORY,
136 0,
137 )
937921aa
WB
138 }
139
140 pub fn fd_num(&self, num: RawFd, flags: c_int) -> io::Result<Fd> {
3bb4df0b 141 let path = format!("fd/{}\0", num);
512f780a
WB
142 self.fd(
143 unsafe { CStr::from_bytes_with_nul_unchecked(path.as_bytes()) },
144 flags,
145 0,
146 )
937921aa 147 }
275009ec 148
bff40ab9 149 pub fn enter_cwd(&self) -> io::Result<()> {
7ca1a14c 150 c_try!(unsafe { libc::fchdir(self.fd_cwd()?.as_raw_fd()) });
bff40ab9
WB
151 Ok(())
152 }
153
154 pub fn enter_chroot(&self) -> io::Result<()> {
7ca1a14c
WB
155 c_try!(unsafe { libc::fchdir(self.as_raw_fd()) });
156 c_try!(unsafe { libc::chroot(b"root\0".as_ptr() as *const _) });
157 c_try!(unsafe { libc::chdir(b"/\0".as_ptr() as *const _) });
512f780a
WB
158 Ok(())
159 }
3bb4df0b
WB
160
161 // procfs files cannot be async, we cannot add them to epoll...
162 pub fn open_file(&self, path: &CStr, flags: c_int, mode: c_int) -> io::Result<std::fs::File> {
163 Ok(unsafe { std::fs::File::from_raw_fd(self.fd(path, flags, mode)?.into_raw_fd()) })
164 }
165
512f780a
WB
166 #[inline]
167 fn open_buffered(&self, path: &CStr) -> io::Result<impl BufRead> {
168 Ok(BufReader::new(self.open_file(
169 path,
3bb4df0b
WB
170 libc::O_RDONLY | libc::O_CLOEXEC,
171 0,
512f780a
WB
172 )?))
173 }
174
175 #[inline]
176 pub fn get_pid(&self) -> pid_t {
177 self.1
178 }
179
180 fn read_pid(&self) -> io::Result<pid_t> {
181 let reader =
182 self.open_buffered(unsafe { CStr::from_bytes_with_nul_unchecked(b"status\0") })?;
183
184 for line in reader.lines() {
185 let line = line?;
186 let mut parts = line.split_ascii_whitespace();
187 if parts.next() == Some("Pid:") {
188 let pid = parts
189 .next()
190 .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "bad 'Pid:' line in proc"))?
191 .parse::<pid_t>()
192 .map_err(|_| {
193 io::Error::new(io::ErrorKind::Other, "failed to parse pid from proc")
194 })?;
195 return Ok(pid);
196 }
197 }
198
199 Err(io::ErrorKind::NotFound.into())
200 }
201
1349eed4
WB
202 #[inline]
203 fn __check_uid_gid(value: Option<&str>) -> io::Result<libc::uid_t> {
204 value
205 .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "bad 'Uid/Gid:' line in proc"))?
206 .parse::<libc::uid_t>()
207 .map_err(|_| io::Error::new(io::ErrorKind::Other, "failed to parse uid from proc"))
208 }
209
512f780a
WB
210 pub fn get_status(&self) -> io::Result<ProcStatus> {
211 let reader =
212 self.open_buffered(unsafe { CStr::from_bytes_with_nul_unchecked(b"status\0") })?;
213
512f780a
WB
214 #[inline]
215 fn check_u64_hex(value: Option<&str>) -> io::Result<u64> {
216 Ok(u64::from_str_radix(
217 value.ok_or_else(|| {
218 io::Error::new(io::ErrorKind::Other, "bad numeric property line in proc")
219 })?,
220 16,
221 )
222 .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?)
223 }
224
225 #[inline]
226 fn check_u32_oct(value: Option<&str>) -> io::Result<u32> {
227 Ok(u32::from_str_radix(
228 value.ok_or_else(|| {
229 io::Error::new(io::ErrorKind::Other, "bad numeric property line in proc")
230 })?,
231 8,
232 )
233 .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?)
234 }
3bb4df0b 235
512f780a
WB
236 let mut ids = Uids::default();
237 let mut caps = Capabilities::default();
238 let mut umask = 0o022;
3bb4df0b
WB
239 for line in reader.lines() {
240 let line = line?;
241 let mut parts = line.split_ascii_whitespace();
242 match parts.next() {
243 Some("Uid:") => {
1349eed4
WB
244 ids.ruid = Self::__check_uid_gid(parts.next())?;
245 ids.euid = Self::__check_uid_gid(parts.next())?;
246 ids.suid = Self::__check_uid_gid(parts.next())?;
247 ids.fsuid = Self::__check_uid_gid(parts.next())?;
3bb4df0b
WB
248 }
249 Some("Gid:") => {
1349eed4
WB
250 ids.rgid = Self::__check_uid_gid(parts.next())?;
251 ids.egid = Self::__check_uid_gid(parts.next())?;
252 ids.sgid = Self::__check_uid_gid(parts.next())?;
253 ids.fsgid = Self::__check_uid_gid(parts.next())?;
3bb4df0b 254 }
512f780a
WB
255 Some("CapInh:") => caps.inheritable = check_u64_hex(parts.next())?,
256 Some("CapPrm:") => caps.permitted = check_u64_hex(parts.next())?,
257 Some("CapEff:") => caps.effective = check_u64_hex(parts.next())?,
258 //Some("CapBnd:") => caps.bounding = check_u64_hex(parts.next())?,
259 Some("Umask:") => umask = check_u32_oct(parts.next())?,
3bb4df0b
WB
260 _ => continue,
261 }
512f780a
WB
262 }
263
264 Ok(ProcStatus {
265 uids: ids,
266 capabilities: caps,
267 umask,
268 })
269 }
270
271 pub fn get_cgroups(&self) -> Result<CGroups, Error> {
272 let reader =
273 self.open_buffered(unsafe { CStr::from_bytes_with_nul_unchecked(b"cgroup\0") })?;
274
275 let mut cgroups = CGroups::new();
276
277 for line in reader.split(b'\n') {
278 let line = line?;
279 let mut parts = line.splitn(3, |b| *b == b':');
280 let num = parts.next();
281 let name = parts.next();
282 let path = parts.next();
f3cae2a7 283 if num.is_none() || name.is_none() || path.is_none() || parts.next().is_some() {
512f780a 284 bail!("failed to parse cgroup line: {:?}", line);
3bb4df0b 285 }
512f780a
WB
286
287 let name = String::from_utf8(name.unwrap().to_vec())?;
288 let path = OsString::from_vec(path.unwrap().to_vec());
289
9486338a 290 if name.is_empty() {
512f780a
WB
291 cgroups.v2 = Some(path);
292 } else {
293 for entry in name.split(',') {
294 cgroups.v1.insert(entry.to_string(), path.clone());
295 }
296 }
297 }
298
299 Ok(cgroups)
300 }
301
1349eed4
WB
302 pub fn get_uid_gid_map(&self, file: &CStr) -> Result<IdMap, Error> {
303 let reader = self.open_buffered(file)?;
304
305 let mut entries = Vec::new();
306 for line in reader.lines() {
307 let line = line?;
308 let mut parts = line.split_ascii_whitespace();
9486338a
WB
309 let ns = u64::from(Self::__check_uid_gid(parts.next())?);
310 let host = u64::from(Self::__check_uid_gid(parts.next())?);
311 let range = u64::from(Self::__check_uid_gid(parts.next())?);
1349eed4
WB
312 entries.push(IdMapEntry { ns, host, range });
313 }
314
315 Ok(IdMap(entries))
316 }
317
318 pub fn get_uid_map(&self) -> Result<IdMap, Error> {
319 self.get_uid_gid_map(unsafe { CStr::from_bytes_with_nul_unchecked(b"uid_map\0") })
320 }
321
322 pub fn get_gid_map(&self) -> Result<IdMap, Error> {
323 self.get_uid_gid_map(unsafe { CStr::from_bytes_with_nul_unchecked(b"gid_map\0") })
324 }
325
42f25756
WB
326 pub fn read_file(&self, file: &CStr) -> io::Result<Vec<u8>> {
327 use io::Read;
328
329 let mut reader = self.open_file(file, libc::O_RDONLY | libc::O_CLOEXEC, 0)?;
330 let mut out = Vec::new();
331 reader.read_to_end(&mut out)?;
332 Ok(out)
333 }
334
512f780a
WB
335 pub fn user_caps(&self) -> Result<UserCaps, Error> {
336 UserCaps::new(self)
337 }
338}
339
/// The cgroup paths of a process.
pub struct CGroups {
    /// Paths of the v1 hierarchies, indexed by name.
    v1: HashMap<String, OsString>,
    /// Path within the v2 hierarchy, if any.
    v2: Option<OsString>,
}

impl CGroups {
    /// Create an empty set of cgroup paths.
    fn new() -> Self {
        let v1 = HashMap::new();
        let v2 = None;
        Self { v1, v2 }
    }

    /// Look up the path stored for the v1 hierarchy called `name`.
    pub fn get(&self, name: &str) -> Option<&OsStr> {
        match self.v1.get(name) {
            Some(path) => Some(path.as_os_str()),
            None => None,
        }
    }

    /// The path within the v2 hierarchy, if there is one.
    pub fn v2(&self) -> Option<&OsStr> {
        match self.v2 {
            Some(ref path) => Some(path.as_os_str()),
            None => None,
        }
    }
}
361
362// Too lazy to bindgen libcap stuff...
bd05b957 363const CAPABILITY_VERSION_3: u32 = 0x2008_0522;
512f780a
WB
364
365/// Represents process capabilities.
366///
367/// This can be used to change the process' capability sets (if permitted by the kernel).
368impl Capabilities {
369 // We currently don't implement capget as it takes a pid which is racy on kernels without pidfd
370 // support. Later on we might support a `capget(&PidFd)` method?
371
372 /// Change our process capabilities. This does not include the bounding set.
373 pub fn capset(&self) -> io::Result<()> {
374 #![allow(dead_code)]
375 // kernel abi:
376 struct Header {
377 version: u32,
378 pid: c_int,
379 }
380
381 struct Data {
382 effective: u32,
383 permitted: u32,
384 inheritable: u32,
385 }
386
387 let header = Header {
388 version: CAPABILITY_VERSION_3,
389 pid: 0, // equivalent to gettid(),
390 };
3bb4df0b 391
512f780a
WB
392 let data = [
393 Data {
394 effective: self.effective as u32,
395 permitted: self.permitted as u32,
396 inheritable: self.inheritable as u32,
397 },
398 Data {
399 effective: (self.effective >> 32) as u32,
400 permitted: (self.permitted >> 32) as u32,
401 inheritable: (self.inheritable >> 32) as u32,
402 },
403 ];
404
7ca1a14c 405 c_try!(unsafe { libc::syscall(libc::SYS_capset, &header, &data) });
512f780a
WB
406
407 Ok(())
408 }
512f780a
WB
409}
410
411/// Helper to enter a process' permission-check environment.
412///
413/// When we execute a syscall on behalf of another process, we should try to trigger as many
414/// permission checks as we can. It is impractical to implement them all manually, so the best
415/// thing to do is cause as many of them to happen on the kernel-side as we can.
416///
bff40ab9
WB
417/// We start by entering the process' devices and v2 cgroup. As calls like `mknod()` may be
418/// affected, and access to devices as well.
419///
420/// Then we must enter the mount namespace, chroot and current working directory, in order to get
942f3c73 421/// the correct view of paths.
bff40ab9
WB
422///
423/// Next we copy the caller's `umask`.
512f780a 424///
bff40ab9
WB
425/// Then switch over our effective and file system uid and gid. This has 2 reasons: First, it means
426/// we do not need to run `chown()` on files we create, secondly, the user may have dropped
512f780a
WB
427/// `CAP_DAC_OVERRIDE` / `CAP_DAC_READ_SEARCH` which may have prevented the creation of the file in
428/// the first place (for example, the container program may be a non-root executable with
429/// `cap_mknod=ep` as file-capabilities, in which case we do not want a user to be allowed to run
bff40ab9 430/// `mknod()` on a path owned by different user (and checking file system permissions would
512f780a 431/// require us to handle ACLs, quotas, which are all file system tyep dependent as well, so better
bff40ab9 432/// leave all that up to the kernel, too!)).
512f780a 433///
bff40ab9
WB
434/// Next we clone the process' capability set. This is because the process may have dropped
435/// capabilties which under normal conditions would prevent them from executing the syscall. For
436/// example a process may be executing `mknod()` after having dropped `CAP_MKNOD`.
512f780a
WB
437#[derive(Clone)]
438#[must_use = "not using UserCaps may be a security issue"]
bff40ab9
WB
439pub struct UserCaps<'a> {
440 pidfd: &'a PidFd,
7470f14b 441 apply_uids: bool,
512f780a
WB
442 euid: libc::uid_t,
443 egid: libc::gid_t,
444 fsuid: libc::uid_t,
445 fsgid: libc::gid_t,
446 capabilities: Capabilities,
447 umask: libc::mode_t,
448 cgroup_v1_devices: Option<OsString>,
449 cgroup_v2: Option<OsString>,
42f25756 450 apparmor_profile: Option<OsString>,
512f780a
WB
451}
452
bff40ab9 453impl UserCaps<'_> {
512f780a
WB
454 pub fn new(pidfd: &PidFd) -> Result<UserCaps, Error> {
455 let status = pidfd.get_status()?;
456 let cgroups = pidfd.get_cgroups()?;
42f25756 457 let apparmor_profile = crate::apparmor::get_label(pidfd)?;
512f780a
WB
458
459 Ok(UserCaps {
bff40ab9 460 pidfd,
7470f14b 461 apply_uids: true,
512f780a
WB
462 euid: status.uids.euid,
463 egid: status.uids.egid,
464 fsuid: status.uids.fsuid,
465 fsgid: status.uids.fsgid,
466 capabilities: status.capabilities,
467 umask: status.umask,
468 cgroup_v1_devices: cgroups.get("devices").map(|s| s.to_owned()),
469 cgroup_v2: cgroups.v2().map(|s| s.to_owned()),
42f25756 470 apparmor_profile,
512f780a
WB
471 })
472 }
473
bff40ab9 474 fn apply_cgroups(&self) -> io::Result<()> {
512f780a
WB
475 fn enter_cgroup(kind: &str, name: &OsStr) -> io::Result<()> {
476 let mut path = OsString::with_capacity(15 + kind.len() + name.len() + 13 + 1);
477 path.push(OsStr::from_bytes(b"/sys/fs/cgroup/"));
478 path.push(kind);
479 path.push(name);
480 path.push(OsStr::from_bytes(b"/cgroup.procs"));
481 std::fs::write(path, b"0")
482 }
483
484 if let Some(ref cg) = self.cgroup_v1_devices {
485 enter_cgroup("devices/", cg)?;
486 }
487
488 if let Some(ref cg) = self.cgroup_v2 {
489 enter_cgroup("unified/", cg)?;
490 }
491
492 Ok(())
493 }
494
bff40ab9 495 fn apply_user_caps(&self) -> io::Result<()> {
738dbfbe 496 use crate::capability::SecureBits;
7470f14b
WB
497 if self.apply_uids {
498 unsafe {
499 libc::umask(self.umask);
500 }
501 let mut secbits = SecureBits::get_current()?;
502 secbits |= SecureBits::KEEP_CAPS | SecureBits::NO_SETUID_FIXUP;
503 secbits.apply()?;
504 c_try!(unsafe { libc::setegid(self.egid) });
505 c_try!(unsafe { libc::setfsgid(self.fsgid) });
506 c_try!(unsafe { libc::seteuid(self.euid) });
507 c_try!(unsafe { libc::setfsuid(self.fsuid) });
512f780a 508 }
512f780a
WB
509 self.capabilities.capset()?;
510 Ok(())
3bb4df0b 511 }
bff40ab9 512
7470f14b
WB
513 pub fn disable_uid_change(&mut self) {
514 self.apply_uids = false;
515 }
516
517 pub fn disable_cgroup_change(&mut self) {
518 self.cgroup_v1_devices = None;
519 self.cgroup_v2 = None;
520 }
521
42f25756 522 pub fn apply(self, own_pidfd: &PidFd) -> io::Result<()> {
bff40ab9
WB
523 self.apply_cgroups()?;
524 self.pidfd.mount_namespace()?.setns()?;
525 self.pidfd.enter_chroot()?;
526 self.pidfd.enter_cwd()?;
42f25756
WB
527 if let Some(ref label) = self.apparmor_profile {
528 crate::apparmor::set_label(own_pidfd, label)?;
529 }
bff40ab9
WB
530 self.apply_user_caps()?;
531 Ok(())
532 }
e420f6f9 533}